From 296e75da20f336a77e9698f2f4788db5499a46ee Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 19 Jul 2023 12:58:00 -0700 Subject: [PATCH 01/40] update Pgenlib to >=0.90.1 --- poetry.lock | 249 +++++++++++++++++++++++++++++++++---------------- pyproject.toml | 2 +- 2 files changed, 171 insertions(+), 80 deletions(-) diff --git a/poetry.lock b/poetry.lock index d5b353f5..37053a06 100644 --- a/poetry.lock +++ b/poetry.lock @@ -153,7 +153,7 @@ development = ["black", "flake8", "mypy", "pytest", "types-colorama"] [[package]] name = "coverage" -version = "7.1.0" +version = "7.2.7" description = "Code coverage measurement for Python" category = "dev" optional = false @@ -173,6 +173,14 @@ category = "main" optional = false python-versions = ">=3.6" +[[package]] +name = "cython" +version = "3.0.0" +description = "The Cython compiler for writing C extensions in the Python language." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + [[package]] name = "cyvcf2" version = "0.30.18" @@ -526,7 +534,7 @@ ptyprocess = ">=0.5" [[package]] name = "pgenlib" -version = "0.81.3" +version = "0.90.1" description = "Python wrapper for pgenlib's basic reader and writer." category = "main" optional = true @@ -646,11 +654,14 @@ python-versions = "*" [[package]] name = "pysam" -version = "0.19.0" -description = "pysam" +version = "0.21.0" +description = "pysam - a python module for reading, manipulating and writing genomic data sets." category = "main" optional = false -python-versions = "*" +python-versions = ">=3.6" + +[package.dependencies] +cython = "*" [[package]] name = "pytest" @@ -992,7 +1003,7 @@ tests = [] [metadata] lock-version = "1.1" python-versions = ">=3.7,<3.11" -content-hash = "c06b769c137c78eec0fa14f907979cf92c1f4c7d2473e9f2a12f2a5d8eb31225" +content-hash = "30ca6f38b3c8fc1f4f13fff84d6923bba6b00acdd037498c81d0183eebee50d4" [metadata.files] alabaster = [ @@ -1067,62 +1078,131 @@ colorlog = [ {file = "colorlog-6.7.0.tar.gz", hash = "sha256:bd94bd21c1e13fac7bd3153f4bc3a7dc0eb0974b8bc2fdf1a989e474f6e582e5"}, ] coverage = [ - {file = "coverage-7.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3b946bbcd5a8231383450b195cfb58cb01cbe7f8949f5758566b881df4b33baf"}, - {file = "coverage-7.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ec8e767f13be637d056f7e07e61d089e555f719b387a7070154ad80a0ff31801"}, - {file = "coverage-7.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d4a5a5879a939cb84959d86869132b00176197ca561c664fc21478c1eee60d75"}, - {file = "coverage-7.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b643cb30821e7570c0aaf54feaf0bfb630b79059f85741843e9dc23f33aaca2c"}, - {file = "coverage-7.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32df215215f3af2c1617a55dbdfb403b772d463d54d219985ac7cd3bf124cada"}, - {file = "coverage-7.1.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:33d1ae9d4079e05ac4cc1ef9e20c648f5afabf1a92adfaf2ccf509c50b85717f"}, - {file = "coverage-7.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:29571503c37f2ef2138a306d23e7270687c0efb9cab4bd8038d609b5c2393a3a"}, - {file = "coverage-7.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:63ffd21aa133ff48c4dff7adcc46b7ec8b565491bfc371212122dd999812ea1c"}, - {file = "coverage-7.1.0-cp310-cp310-win32.whl", hash = 
"sha256:4b14d5e09c656de5038a3f9bfe5228f53439282abcab87317c9f7f1acb280352"}, - {file = "coverage-7.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:8361be1c2c073919500b6601220a6f2f98ea0b6d2fec5014c1d9cfa23dd07038"}, - {file = "coverage-7.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:da9b41d4539eefd408c46725fb76ecba3a50a3367cafb7dea5f250d0653c1040"}, - {file = "coverage-7.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c5b15ed7644ae4bee0ecf74fee95808dcc34ba6ace87e8dfbf5cb0dc20eab45a"}, - {file = "coverage-7.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d12d076582507ea460ea2a89a8c85cb558f83406c8a41dd641d7be9a32e1274f"}, - {file = "coverage-7.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2617759031dae1bf183c16cef8fcfb3de7617f394c813fa5e8e46e9b82d4222"}, - {file = "coverage-7.1.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c4e4881fa9e9667afcc742f0c244d9364d197490fbc91d12ac3b5de0bf2df146"}, - {file = "coverage-7.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:9d58885215094ab4a86a6aef044e42994a2bd76a446dc59b352622655ba6621b"}, - {file = "coverage-7.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:ffeeb38ee4a80a30a6877c5c4c359e5498eec095878f1581453202bfacc8fbc2"}, - {file = "coverage-7.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3baf5f126f30781b5e93dbefcc8271cb2491647f8283f20ac54d12161dff080e"}, - {file = "coverage-7.1.0-cp311-cp311-win32.whl", hash = "sha256:ded59300d6330be27bc6cf0b74b89ada58069ced87c48eaf9344e5e84b0072f7"}, - {file = "coverage-7.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:6a43c7823cd7427b4ed763aa7fb63901ca8288591323b58c9cd6ec31ad910f3c"}, - {file = "coverage-7.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:7a726d742816cb3a8973c8c9a97539c734b3a309345236cd533c4883dda05b8d"}, - {file = "coverage-7.1.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc7c85a150501286f8b56bd8ed3aa4093f4b88fb68c0843d21ff9656f0009d6a"}, - {file = "coverage-7.1.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f5b4198d85a3755d27e64c52f8c95d6333119e49fd001ae5798dac872c95e0f8"}, - {file = "coverage-7.1.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddb726cb861c3117a553f940372a495fe1078249ff5f8a5478c0576c7be12050"}, - {file = "coverage-7.1.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:51b236e764840a6df0661b67e50697aaa0e7d4124ca95e5058fa3d7cbc240b7c"}, - {file = "coverage-7.1.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:7ee5c9bb51695f80878faaa5598040dd6c9e172ddcf490382e8aedb8ec3fec8d"}, - {file = "coverage-7.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:c31b75ae466c053a98bf26843563b3b3517b8f37da4d47b1c582fdc703112bc3"}, - {file = "coverage-7.1.0-cp37-cp37m-win32.whl", hash = "sha256:3b155caf3760408d1cb903b21e6a97ad4e2bdad43cbc265e3ce0afb8e0057e73"}, - {file = "coverage-7.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:2a60d6513781e87047c3e630b33b4d1e89f39836dac6e069ffee28c4786715f5"}, - {file = "coverage-7.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f2cba5c6db29ce991029b5e4ac51eb36774458f0a3b8d3137241b32d1bb91f06"}, - {file = "coverage-7.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:beeb129cacea34490ffd4d6153af70509aa3cda20fdda2ea1a2be870dfec8d52"}, - {file = 
"coverage-7.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0c45948f613d5d18c9ec5eaa203ce06a653334cf1bd47c783a12d0dd4fd9c851"}, - {file = "coverage-7.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ef382417db92ba23dfb5864a3fc9be27ea4894e86620d342a116b243ade5d35d"}, - {file = "coverage-7.1.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c7c0d0827e853315c9bbd43c1162c006dd808dbbe297db7ae66cd17b07830f0"}, - {file = "coverage-7.1.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e5cdbb5cafcedea04924568d990e20ce7f1945a1dd54b560f879ee2d57226912"}, - {file = "coverage-7.1.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:9817733f0d3ea91bea80de0f79ef971ae94f81ca52f9b66500c6a2fea8e4b4f8"}, - {file = "coverage-7.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:218fe982371ac7387304153ecd51205f14e9d731b34fb0568181abaf7b443ba0"}, - {file = "coverage-7.1.0-cp38-cp38-win32.whl", hash = "sha256:04481245ef966fbd24ae9b9e537ce899ae584d521dfbe78f89cad003c38ca2ab"}, - {file = "coverage-7.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:8ae125d1134bf236acba8b83e74c603d1b30e207266121e76484562bc816344c"}, - {file = "coverage-7.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2bf1d5f2084c3932b56b962a683074a3692bce7cabd3aa023c987a2a8e7612f6"}, - {file = "coverage-7.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:98b85dd86514d889a2e3dd22ab3c18c9d0019e696478391d86708b805f4ea0fa"}, - {file = "coverage-7.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38da2db80cc505a611938d8624801158e409928b136c8916cd2e203970dde4dc"}, - {file = "coverage-7.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3164d31078fa9efe406e198aecd2a02d32a62fecbdef74f76dad6a46c7e48311"}, - {file = "coverage-7.1.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db61a79c07331e88b9a9974815c075fbd812bc9dbc4dc44b366b5368a2936063"}, - {file = "coverage-7.1.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9ccb092c9ede70b2517a57382a601619d20981f56f440eae7e4d7eaafd1d1d09"}, - {file = "coverage-7.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:33ff26d0f6cc3ca8de13d14fde1ff8efe1456b53e3f0273e63cc8b3c84a063d8"}, - {file = "coverage-7.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d47dd659a4ee952e90dc56c97d78132573dc5c7b09d61b416a9deef4ebe01a0c"}, - {file = "coverage-7.1.0-cp39-cp39-win32.whl", hash = "sha256:d248cd4a92065a4d4543b8331660121b31c4148dd00a691bfb7a5cdc7483cfa4"}, - {file = "coverage-7.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:7ed681b0f8e8bcbbffa58ba26fcf5dbc8f79e7997595bf071ed5430d8c08d6f3"}, - {file = "coverage-7.1.0-pp37.pp38.pp39-none-any.whl", hash = "sha256:755e89e32376c850f826c425ece2c35a4fc266c081490eb0a841e7c1cb0d3bda"}, - {file = "coverage-7.1.0.tar.gz", hash = "sha256:10188fe543560ec4874f974b5305cd1a8bdcfa885ee00ea3a03733464c4ca265"}, + {file = "coverage-7.2.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d39b5b4f2a66ccae8b7263ac3c8170994b65266797fb96cbbfd3fb5b23921db8"}, + {file = "coverage-7.2.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6d040ef7c9859bb11dfeb056ff5b3872436e3b5e401817d87a31e1750b9ae2fb"}, + {file = "coverage-7.2.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ba90a9563ba44a72fda2e85302c3abc71c5589cea608ca16c22b9804262aaeb6"}, 
+ {file = "coverage-7.2.7-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7d9405291c6928619403db1d10bd07888888ec1abcbd9748fdaa971d7d661b2"}, + {file = "coverage-7.2.7-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31563e97dae5598556600466ad9beea39fb04e0229e61c12eaa206e0aa202063"}, + {file = "coverage-7.2.7-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:ebba1cd308ef115925421d3e6a586e655ca5a77b5bf41e02eb0e4562a111f2d1"}, + {file = "coverage-7.2.7-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:cb017fd1b2603ef59e374ba2063f593abe0fc45f2ad9abdde5b4d83bd922a353"}, + {file = "coverage-7.2.7-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d62a5c7dad11015c66fbb9d881bc4caa5b12f16292f857842d9d1871595f4495"}, + {file = "coverage-7.2.7-cp310-cp310-win32.whl", hash = "sha256:ee57190f24fba796e36bb6d3aa8a8783c643d8fa9760c89f7a98ab5455fbf818"}, + {file = "coverage-7.2.7-cp310-cp310-win_amd64.whl", hash = "sha256:f75f7168ab25dd93110c8a8117a22450c19976afbc44234cbf71481094c1b850"}, + {file = "coverage-7.2.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:06a9a2be0b5b576c3f18f1a241f0473575c4a26021b52b2a85263a00f034d51f"}, + {file = "coverage-7.2.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5baa06420f837184130752b7c5ea0808762083bf3487b5038d68b012e5937dbe"}, + {file = "coverage-7.2.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fdec9e8cbf13a5bf63290fc6013d216a4c7232efb51548594ca3631a7f13c3a3"}, + {file = "coverage-7.2.7-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:52edc1a60c0d34afa421c9c37078817b2e67a392cab17d97283b64c5833f427f"}, + {file = "coverage-7.2.7-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:63426706118b7f5cf6bb6c895dc215d8a418d5952544042c8a2d9fe87fcf09cb"}, + {file = "coverage-7.2.7-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:afb17f84d56068a7c29f5fa37bfd38d5aba69e3304af08ee94da8ed5b0865833"}, + {file = "coverage-7.2.7-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:48c19d2159d433ccc99e729ceae7d5293fbffa0bdb94952d3579983d1c8c9d97"}, + {file = "coverage-7.2.7-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0e1f928eaf5469c11e886fe0885ad2bf1ec606434e79842a879277895a50942a"}, + {file = "coverage-7.2.7-cp311-cp311-win32.whl", hash = "sha256:33d6d3ea29d5b3a1a632b3c4e4f4ecae24ef170b0b9ee493883f2df10039959a"}, + {file = "coverage-7.2.7-cp311-cp311-win_amd64.whl", hash = "sha256:5b7540161790b2f28143191f5f8ec02fb132660ff175b7747b95dcb77ac26562"}, + {file = "coverage-7.2.7-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f2f67fe12b22cd130d34d0ef79206061bfb5eda52feb6ce0dba0644e20a03cf4"}, + {file = "coverage-7.2.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a342242fe22407f3c17f4b499276a02b01e80f861f1682ad1d95b04018e0c0d4"}, + {file = "coverage-7.2.7-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:171717c7cb6b453aebac9a2ef603699da237f341b38eebfee9be75d27dc38e01"}, + {file = "coverage-7.2.7-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49969a9f7ffa086d973d91cec8d2e31080436ef0fb4a359cae927e742abfaaa6"}, + {file = "coverage-7.2.7-cp312-cp312-musllinux_1_1_aarch64.whl", hash = 
"sha256:b46517c02ccd08092f4fa99f24c3b83d8f92f739b4657b0f146246a0ca6a831d"}, + {file = "coverage-7.2.7-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:a3d33a6b3eae87ceaefa91ffdc130b5e8536182cd6dfdbfc1aa56b46ff8c86de"}, + {file = "coverage-7.2.7-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:976b9c42fb2a43ebf304fa7d4a310e5f16cc99992f33eced91ef6f908bd8f33d"}, + {file = "coverage-7.2.7-cp312-cp312-win32.whl", hash = "sha256:8de8bb0e5ad103888d65abef8bca41ab93721647590a3f740100cd65c3b00511"}, + {file = "coverage-7.2.7-cp312-cp312-win_amd64.whl", hash = "sha256:9e31cb64d7de6b6f09702bb27c02d1904b3aebfca610c12772452c4e6c21a0d3"}, + {file = "coverage-7.2.7-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:58c2ccc2f00ecb51253cbe5d8d7122a34590fac9646a960d1430d5b15321d95f"}, + {file = "coverage-7.2.7-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d22656368f0e6189e24722214ed8d66b8022db19d182927b9a248a2a8a2f67eb"}, + {file = "coverage-7.2.7-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a895fcc7b15c3fc72beb43cdcbdf0ddb7d2ebc959edac9cef390b0d14f39f8a9"}, + {file = "coverage-7.2.7-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e84606b74eb7de6ff581a7915e2dab7a28a0517fbe1c9239eb227e1354064dcd"}, + {file = "coverage-7.2.7-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:0a5f9e1dbd7fbe30196578ca36f3fba75376fb99888c395c5880b355e2875f8a"}, + {file = "coverage-7.2.7-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:419bfd2caae268623dd469eff96d510a920c90928b60f2073d79f8fe2bbc5959"}, + {file = "coverage-7.2.7-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:2aee274c46590717f38ae5e4650988d1af340fe06167546cc32fe2f58ed05b02"}, + {file = "coverage-7.2.7-cp37-cp37m-win32.whl", hash = "sha256:61b9a528fb348373c433e8966535074b802c7a5d7f23c4f421e6c6e2f1697a6f"}, + {file = "coverage-7.2.7-cp37-cp37m-win_amd64.whl", hash = "sha256:b1c546aca0ca4d028901d825015dc8e4d56aac4b541877690eb76490f1dc8ed0"}, + {file = "coverage-7.2.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:54b896376ab563bd38453cecb813c295cf347cf5906e8b41d340b0321a5433e5"}, + {file = "coverage-7.2.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3d376df58cc111dc8e21e3b6e24606b5bb5dee6024f46a5abca99124b2229ef5"}, + {file = "coverage-7.2.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e330fc79bd7207e46c7d7fd2bb4af2963f5f635703925543a70b99574b0fea9"}, + {file = "coverage-7.2.7-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e9d683426464e4a252bf70c3498756055016f99ddaec3774bf368e76bbe02b6"}, + {file = "coverage-7.2.7-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d13c64ee2d33eccf7437961b6ea7ad8673e2be040b4f7fd4fd4d4d28d9ccb1e"}, + {file = "coverage-7.2.7-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b7aa5f8a41217360e600da646004f878250a0d6738bcdc11a0a39928d7dc2050"}, + {file = "coverage-7.2.7-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:8fa03bce9bfbeeef9f3b160a8bed39a221d82308b4152b27d82d8daa7041fee5"}, + {file = "coverage-7.2.7-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:245167dd26180ab4c91d5e1496a30be4cd721a5cf2abf52974f965f10f11419f"}, + {file = "coverage-7.2.7-cp38-cp38-win32.whl", hash = "sha256:d2c2db7fd82e9b72937969bceac4d6ca89660db0a0967614ce2481e81a0b771e"}, + {file = "coverage-7.2.7-cp38-cp38-win_amd64.whl", 
hash = "sha256:2e07b54284e381531c87f785f613b833569c14ecacdcb85d56b25c4622c16c3c"}, + {file = "coverage-7.2.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:537891ae8ce59ef63d0123f7ac9e2ae0fc8b72c7ccbe5296fec45fd68967b6c9"}, + {file = "coverage-7.2.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:06fb182e69f33f6cd1d39a6c597294cff3143554b64b9825d1dc69d18cc2fff2"}, + {file = "coverage-7.2.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:201e7389591af40950a6480bd9edfa8ed04346ff80002cec1a66cac4549c1ad7"}, + {file = "coverage-7.2.7-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f6951407391b639504e3b3be51b7ba5f3528adbf1a8ac3302b687ecababf929e"}, + {file = "coverage-7.2.7-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f48351d66575f535669306aa7d6d6f71bc43372473b54a832222803eb956fd1"}, + {file = "coverage-7.2.7-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b29019c76039dc3c0fd815c41392a044ce555d9bcdd38b0fb60fb4cd8e475ba9"}, + {file = "coverage-7.2.7-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:81c13a1fc7468c40f13420732805a4c38a105d89848b7c10af65a90beff25250"}, + {file = "coverage-7.2.7-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:975d70ab7e3c80a3fe86001d8751f6778905ec723f5b110aed1e450da9d4b7f2"}, + {file = "coverage-7.2.7-cp39-cp39-win32.whl", hash = "sha256:7ee7d9d4822c8acc74a5e26c50604dff824710bc8de424904c0982e25c39c6cb"}, + {file = "coverage-7.2.7-cp39-cp39-win_amd64.whl", hash = "sha256:eb393e5ebc85245347950143969b241d08b52b88a3dc39479822e073a1a8eb27"}, + {file = "coverage-7.2.7-pp37.pp38.pp39-none-any.whl", hash = "sha256:b7b4c971f05e6ae490fef852c218b0e79d4e52f79ef0c8475566584a8fb3e01d"}, + {file = "coverage-7.2.7.tar.gz", hash = "sha256:924d94291ca674905fe9481f12294eb11f2d3d3fd1adb20314ba89e94f44ed59"}, ] cycler = [ {file = "cycler-0.11.0-py3-none-any.whl", hash = "sha256:3a27e95f763a428a739d2add979fa7494c912a32c17c4c38c4d5f082cad165a3"}, {file = "cycler-0.11.0.tar.gz", hash = "sha256:9c87405839a19696e837b3b818fed3f5f69f16f1eec1a1ad77e043dcea9c772f"}, ] +cython = [ + {file = "Cython-3.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7c7d728e1a49ad01d41181e3a9ea80b8d14e825f4679e4dd837cbf7bca7998a5"}, + {file = "Cython-3.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:626a4a6ef4b7ced87c348ea805488e4bd39dad9d0b39659aa9e1040b62bbfedf"}, + {file = "Cython-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33c900d1ca9f622b969ac7d8fc44bdae140a4a6c7d8819413b51f3ccd0586a09"}, + {file = "Cython-3.0.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a65bc50dc1bc2faeafd9425defbdef6a468974f5c4192497ff7f14adccfdcd32"}, + {file = "Cython-3.0.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:3b71b399b10b038b056ad12dce1e317a8aa7a96e99de7e4fa2fa5d1c9415cfb9"}, + {file = "Cython-3.0.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f42f304c097cc53e9eb5f1a1d150380353d5018a3191f1b77f0de353c762181e"}, + {file = "Cython-3.0.0-cp310-cp310-win32.whl", hash = "sha256:3e234e2549e808d9259fdb23ebcfd145be30c638c65118326ec33a8d29248dc2"}, + {file = "Cython-3.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:829c8333195100448a23863cf64a07e1334fae6a275aefe871458937911531b6"}, + {file = "Cython-3.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:06db81b1a01858fcc406616f8528e686ffb6cf7c3d78fb83767832bfecea8ad8"}, + {file = "Cython-3.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c93634845238645ce7abf63a56b1c5b6248189005c7caff898fd4a0dac1c5e1e"}, + {file = "Cython-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa606675c6bd23478b1d174e2a84e3c5a2c660968f97dc455afe0fae198f9d3d"}, + {file = "Cython-3.0.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d3355e6f690184f984eeb108b0f5bbc4bcf8b9444f8168933acf79603abf7baf"}, + {file = "Cython-3.0.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:93a34e1ca8afa4b7075b02ed14a7e4969256297029fb1bfd4cbe48f7290dbcff"}, + {file = "Cython-3.0.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:bb1165ca9e78823f9ad1efa5b3d83156f868eabd679a615d140a3021bb92cd65"}, + {file = "Cython-3.0.0-cp311-cp311-win32.whl", hash = "sha256:2fadde1da055944f5e1e17625055f54ddd11f451889110278ef30e07bd5e1695"}, + {file = "Cython-3.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:254ed1f03a6c237fa64f0c6e44862058de65bfa2e6a3b48ca3c205492e0653aa"}, + {file = "Cython-3.0.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:4e212237b7531759befb92699c452cd65074a78051ae4ee36ff8b237395ecf3d"}, + {file = "Cython-3.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9f29307463eba53747b31f71394ed087e3e3e264dcc433e62de1d51f5c0c966c"}, + {file = "Cython-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:53328a8af0806bebbdb48a4191883b11ee9d9dfb084d84f58fa5a8ab58baefc9"}, + {file = "Cython-3.0.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5962e70b15e863e72bed6910e8c6ffef77d36cc98e2b31c474378f3b9e49b0e3"}, + {file = "Cython-3.0.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9e69139f4e60ab14c50767a568612ea64d6907e9c8e0289590a170eb495e005f"}, + {file = "Cython-3.0.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:c40bdbcb2286f0aeeb5df9ce53d45da2d2a9b36a16b331cd0809d212d22a8fc7"}, + {file = "Cython-3.0.0-cp312-cp312-win32.whl", hash = "sha256:8abb8915eb2e57fa53d918afe641c05d1bcc6ed1913682ec1f28de71f4e3f398"}, + {file = "Cython-3.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:30a4bd2481e59bd7ab2539f835b78edc19fc455811e476916f56026b93afd28b"}, + {file = "Cython-3.0.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:0e1e4b7e4bfbf22fecfa5b852f0e499c442d4853b7ebd33ae37cdec9826ed5d8"}, + {file = "Cython-3.0.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b00df42cdd1a285a64491ba23de08ab14169d3257c840428d40eb7e8e9979af"}, + {file = "Cython-3.0.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:650d03ddddc08b051b4659778733f0f173ca7d327415755c05d265a6c1ba02fb"}, + {file = "Cython-3.0.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4965f2ebade17166f21a508d66dd60d2a0b3a3b90abe3f72003baa17ae020dd6"}, + {file = "Cython-3.0.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:4123c8d03167803df31da6b39de167cb9c04ac0aa4e35d4e5aa9d08ad511b84d"}, + {file = "Cython-3.0.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:296c53b6c0030cf82987eef163444e8d7631cc139d995f9d58679d9fd1ddbf31"}, + {file = "Cython-3.0.0-cp36-cp36m-win32.whl", hash = "sha256:0d2c1e172f1c81bafcca703093608e10dc16e3e2d24c5644c17606c7fdb1792c"}, + {file = 
"Cython-3.0.0-cp36-cp36m-win_amd64.whl", hash = "sha256:bc816d8eb3686d6f8d165f4156bac18c1147e1035dc28a76742d0b7fb5b7c032"}, + {file = "Cython-3.0.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8d86651347bbdbac1aca1824696c5e4c0a3b162946c422edcca2be12a03744d1"}, + {file = "Cython-3.0.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84176bd04ce9f3cc8799b47ec6d1959fa1ea5e71424507df7bbf0b0915bbedef"}, + {file = "Cython-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:35abcf07b8277ec95bbe49a07b5c8760a2d941942ccfe759a94c8d2fe5602e9f"}, + {file = "Cython-3.0.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a44d6b9a29b2bff38bb648577b2fcf6a68cf8b1783eee89c2eb749f69494b98d"}, + {file = "Cython-3.0.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:4dc6bbe7cf079db37f1ebb9b0f10d0d7f29e293bb8688e92d50b5ea7a91d82f3"}, + {file = "Cython-3.0.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e28763e75e380b8be62b02266a7995a781997c97c119efbdccb8fb954bcd7574"}, + {file = "Cython-3.0.0-cp37-cp37m-win32.whl", hash = "sha256:edae615cb4af51d5173e76ba9aea212424d025c57012e9cdf2f131f774c5ba71"}, + {file = "Cython-3.0.0-cp37-cp37m-win_amd64.whl", hash = "sha256:20c604e974832aaf8b7a1f5455ee7274b34df62a35ee095cd7d2ed7e818e6c53"}, + {file = "Cython-3.0.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c85fd2b1cbd9400d60ebe074795bb9a9188752f1612be3b35b0831a24879b91f"}, + {file = "Cython-3.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:090256c687106932339f87f888b95f0d69c617bc9b18801555545b695d29d8ab"}, + {file = "Cython-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cec2a67a0a7d9d4399758c0657ca03e5912e37218859cfbf046242cc532bfb3b"}, + {file = "Cython-3.0.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a1cdd01ce45333bc264a218c6e183700d6b998f029233f586a53c9b13455c2d2"}, + {file = "Cython-3.0.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ecee663d2d50ca939fc5db81f2f8a219c2417b4651ad84254c50a03a9cb1aadd"}, + {file = "Cython-3.0.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:30f10e79393b411af7677c270ea69807acb9fc30205c8ff25561f4deef780ec1"}, + {file = "Cython-3.0.0-cp38-cp38-win32.whl", hash = "sha256:609777d3a7a0a23b225e84d967af4ad2485c8bdfcacef8037cf197e87d431ca0"}, + {file = "Cython-3.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:7f4a6dfd42ae0a45797f50fc4f6add702abf46ab3e7cd61811a6c6a97a40e1a2"}, + {file = "Cython-3.0.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2d8158277c8942c0b20ff4c074fe6a51c5b89e6ac60cef606818de8c92773596"}, + {file = "Cython-3.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54e34f99b2a8c1e11478541b2822e6408c132b98b6b8f5ed89411e5e906631ea"}, + {file = "Cython-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:877d1c8745df59dd2061a0636c602729e9533ba13f13aa73a498f68662e1cbde"}, + {file = "Cython-3.0.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:204690be60f0ff32eb70b04f28ef0d1e50ffd7b3f77ba06a7dc2389ee3b848e0"}, + {file = "Cython-3.0.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:06fcb4628ccce2ba5abc8630adbeaf4016f63a359b4c6c3827b2d80e0673981c"}, + {file = "Cython-3.0.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:090e24cfa31c926d0b13d8bb2ef48175acdd061ae1413343c94a2b12a4a4fa6f"}, + {file = 
"Cython-3.0.0-cp39-cp39-win32.whl", hash = "sha256:4cd00f2158dc00f7f93a92444d0f663eda124c9c29bbbd658964f4e89c357fe8"}, + {file = "Cython-3.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:5b4cc896d49ce2bae8d6a030f9a4c64965b59c38acfbf4617685e17f7fcf1731"}, + {file = "Cython-3.0.0-py2.py3-none-any.whl", hash = "sha256:ff1aef1a03cfe293237c7a86ae9625b0411b2df30c53d1a7f29a8d381f38a1df"}, + {file = "Cython-3.0.0.tar.gz", hash = "sha256:350b18f9673e63101dbbfcf774ee2f57c20ac4636d255741d76ca79016b1bd82"}, +] cyvcf2 = [ {file = "cyvcf2-0.30.18-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:405d9c3a2d1c5c211a34222ce4b100b5bcf7fe32b0d25db7020311ae4ef9323d"}, {file = "cyvcf2-0.30.18-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f61dd4503c4d8ffc49cc1ebe4b08a990430aaa7f3963330a5a3c6d0c0ed2783c"}, @@ -1417,13 +1497,24 @@ pexpect = [ {file = "pexpect-4.8.0.tar.gz", hash = "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c"}, ] pgenlib = [ - {file = "Pgenlib-0.81.3.tar.gz", hash = "sha256:d7ab19e6b5c8befdfdf417834986c625abe6fcb86eb6d88190706271b93c465f"}, + {file = "Pgenlib-0.90.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c5823a8f3bb558c2d60243eb19ca6220cc2d5c759fc50139e5c09b179353b68"}, + {file = "Pgenlib-0.90.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:758b3927228029bb10758a28a8ded86a71844da973d6196b44913c4765cdf23d"}, + {file = "Pgenlib-0.90.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72427c7e92993859be40e7ad326a5d86025d2218c0109ac3b3fa1620fd37bec2"}, + {file = "Pgenlib-0.90.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85fb2bd56d12a338ec3605149c3f0782b04c861588ad7e4afb41f3ff13f90c00"}, + {file = "Pgenlib-0.90.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8283d5b6cb1196dc859f7c489b36d3cd2673174a19e507fa52a869ef25333127"}, + {file = "Pgenlib-0.90.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89d5baf59cb19f1f53ae1ec48fb33775e3bd84c0c4ed3d8582a6b45051f26e1a"}, + {file = "Pgenlib-0.90.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f06160710bfbf69655bb30ba8231e1329d4903f0a71f5dcb52df9d04c2f94282"}, + {file = "Pgenlib-0.90.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af42bb7fc0812a562e3c17218019650174d4bd3598f0a031b2604ea534c2e361"}, + {file = "Pgenlib-0.90.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72072417b51b9dc0189594c0b705968898b22e43f8a81e9e2a2e4ad19df9ba50"}, + {file = "Pgenlib-0.90.1.tar.gz", hash = "sha256:069db673c1cc5baffcb9b20c2e6e26cc9fdaadd0095f55a188b41502d0511cb9"}, ] pickleshare = [ {file = "pickleshare-0.7.5-py2.py3-none-any.whl", hash = "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56"}, {file = "pickleshare-0.7.5.tar.gz", hash = "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca"}, ] pillow = [ + {file = "Pillow-9.3.0-1-cp37-cp37m-win32.whl", hash = "sha256:e6ea6b856a74d560d9326c0f5895ef8050126acfdc7ca08ad703eb0081e82b74"}, + {file = "Pillow-9.3.0-1-cp37-cp37m-win_amd64.whl", hash = "sha256:32a44128c4bdca7f31de5be641187367fe2a450ad83b833ef78910397db491aa"}, {file = "Pillow-9.3.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:0b7257127d646ff8676ec8a15520013a698d1fdc48bc2a79ba4e53df792526f2"}, {file = "Pillow-9.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:b90f7616ea170e92820775ed47e136208e04c967271c9ef615b6fbd08d9af0e3"}, {file = "Pillow-9.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:68943d632f1f9e3dce98908e873b3a090f6cba1cbb1b892a9e8d97c938871fbe"}, @@ -1522,27 +1613,27 @@ pyreadline3 = [ {file = "pyreadline3-3.4.1.tar.gz", hash = "sha256:6f3d1f7b8a31ba32b73917cefc1f28cc660562f39aea8646d30bd6eff21f7bae"}, ] pysam = [ - {file = "pysam-0.19.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:44de1a3af7c7eb5f404d6337f0c9c4ee88c34c2d2fee1a7896ccd8e7d2aa475a"}, - {file = "pysam-0.19.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0ceb07c6253598ec70fef6ac0c0f7ab0d299562c1a91e737adb07239afba22d6"}, - {file = "pysam-0.19.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2057f3b8cc20562fd010e7971e83ab78978f17975563a711c94bca583ce8a2d3"}, - {file = "pysam-0.19.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a1a9ee6cd6dfa50973dcb51cd2245ea7d4d64d4e962d60e5e1a088f7b790e3e"}, - {file = "pysam-0.19.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:b831170ff810bfd1242dbce4ddf8e693e95e39a4332d5903d416233d3d1be50a"}, - {file = "pysam-0.19.0-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:fa4a5d334986d0696522246820f295cbf6c18dc1b78798f800a2d57d56207789"}, - {file = "pysam-0.19.0-cp36-cp36m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ab58d7d9b140e2e8a4918fc00661aa901ba461d9bccd4b499102b0828f2d897e"}, - {file = "pysam-0.19.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1d964c29fedf55d2a5d095227d19d915c2e0e5e42b1e0bdf7fab30cd1d2a850"}, - {file = "pysam-0.19.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8c78f9b84f7fe69530eaccf5b7f08636b3419f0017b5050aa7013f1c59d3d654"}, - {file = "pysam-0.19.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:ada92b376717ac4c9c9924a096af9186035115d29a113eaa997d1c020ce421b9"}, - {file = "pysam-0.19.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:80b8f5b929c7f660b6e1497790a13c61f386b310a31ca54ad6ba110674d11c55"}, - {file = "pysam-0.19.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:30017bed8d002d41f83fa7e10569525c811a8e3860d73a880ee653ef29fd4fbc"}, - {file = "pysam-0.19.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5e2e465522ba2c34cb96c013a28f9c9db303f8ab1350b4c11cca73677f1e9594"}, - {file = "pysam-0.19.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:2103b4b6d9b0cc0b4aaccf64e87a92bfdabb7dc92810cf84be35ffe78fafa002"}, - {file = "pysam-0.19.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:eaf342b9c71ed83a63237df2000e3bc1f0236165d48fd7240c4c78b66f28d63b"}, - {file = "pysam-0.19.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc0021b154206edfbaa13515cb67523c76c576b7a670e72a793f2da4f03139f4"}, - {file = "pysam-0.19.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3101e0fcd2f6accbfa72a554a71baf83f1c096bb0f9045059b3ead35901ce128"}, - {file = "pysam-0.19.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:5118dcabac574638df43739941f0ee20cc4e9197aee4a8f10f195542a13f18e3"}, - {file = "pysam-0.19.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:4fe1f1fae0e559d3412625dc7d4d08b477e80211b3fe5b267ba341a84f78b8be"}, - {file = "pysam-0.19.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:2911e3760dd2b709c686f075146787b8bda35629093352c28a7e52ac00ddaffc"}, - {file = "pysam-0.19.0.tar.gz", hash = "sha256:dcc052566f9509fd93b2a2664f094a13f016fd60cdd189e05fb4eafa0c89505b"}, + {file = "pysam-0.21.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7012207f6afc181a7009c70f5ab6969ac46c06090df63e798ead9efe433098a5"}, + {file = "pysam-0.21.0-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:3501296b6e8af6092f8b3c01b271d7c3e03b37b81ed5462d3c8041b00f8fa73a"}, + {file = "pysam-0.21.0-cp310-cp310-manylinux_2_24_i686.whl", hash = "sha256:ed21d9138e927eaed24e05295a9ec4619f97fbfd4db2d488cc969a4bc15a3db5"}, + {file = "pysam-0.21.0-cp310-cp310-manylinux_2_24_x86_64.whl", hash = "sha256:37e0ed17c05f7cb3b141c2c78ad23b3592479e8adc47453ae5681c6ffd0d5e00"}, + {file = "pysam-0.21.0-cp311-cp311-manylinux_2_24_aarch64.whl", hash = "sha256:7eba09d7f711f2d3759924178950ec4c38585bce7f0c18e90cb3133bc411808e"}, + {file = "pysam-0.21.0-cp311-cp311-manylinux_2_24_i686.whl", hash = "sha256:7a4f7c34b66379b8565e4d4105a8f8b68f9b308c25adebfd357af8671d909ff5"}, + {file = "pysam-0.21.0-cp311-cp311-manylinux_2_24_x86_64.whl", hash = "sha256:6d9f453048aefb3908b9cd40edb15d3140f8dea15c24ed0df11796a4367e9718"}, + {file = "pysam-0.21.0-cp36-cp36m-manylinux_2_24_aarch64.whl", hash = "sha256:36d78fad182f3230ca8618ea1f4de4c4ed9b4437bc54aa9462d8f5e22f074c22"}, + {file = "pysam-0.21.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:246ce42bf6321516ee8e26763907eaa5aa36a959af6927efba0d0f97127618e9"}, + {file = "pysam-0.21.0-cp37-cp37m-manylinux_2_24_aarch64.whl", hash = "sha256:d7555782b2de082551e008fae1236dbe10365df0551776b4ec203953f6bda381"}, + {file = "pysam-0.21.0-cp37-cp37m-manylinux_2_24_i686.whl", hash = "sha256:962136c9b2d456a4c7e7bad1cd1b680ed27334dbd6ee28a6381dfc698d84951d"}, + {file = "pysam-0.21.0-cp37-cp37m-manylinux_2_24_x86_64.whl", hash = "sha256:f773e1b02ec47b665ad1e4813ea34a4eb1a17d137200f313ad00525aef08a94e"}, + {file = "pysam-0.21.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cff4afedb902def2fb464e7023c0a95fcecbc3f9a1437b31b29cd805735014e5"}, + {file = "pysam-0.21.0-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:6260ff6a0965d33141af753f0b81495d4579299ea906ce3a3f172294fc6454d7"}, + {file = "pysam-0.21.0-cp38-cp38-manylinux_2_24_i686.whl", hash = "sha256:feaa464620ae221fb4878ef1b4f5370c319291522cf52ef497e8beba64c269a0"}, + {file = "pysam-0.21.0-cp38-cp38-manylinux_2_24_x86_64.whl", hash = "sha256:eb283c0de1d197737b53f70d3ea04a2f42c66134803862c43669b9cbef6e45c0"}, + {file = "pysam-0.21.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2087be3b6e754aed3a9178b9f9dd8dc4688c16df8197cdca5a3145098732f786"}, + {file = "pysam-0.21.0-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:4daa4a9927c6c3bf6da6aa8ffce3242b989fe995733c790913dc2c586bc67d4a"}, + {file = "pysam-0.21.0-cp39-cp39-manylinux_2_24_i686.whl", hash = "sha256:f58c86d76cf4e87b879a9aa8f860426a074a2d9f55d3a0078f5cf00e2b32f34c"}, + {file = "pysam-0.21.0-cp39-cp39-manylinux_2_24_x86_64.whl", hash = "sha256:d6babe094fa79af483fe935fe1ea2ebc8e956bbe79dfb9b0a5ce4b2240d00389"}, + {file = "pysam-0.21.0.tar.gz", hash = "sha256:5c9645ddd87668e36ff0a1966391e26f9c403bf85b1bc06c53fe2fcd592da2ce"}, ] pytest = [ {file = "pytest-7.2.0-py3-none-any.whl", hash = "sha256:892f933d339f068883b6fd5a459f03d85bfcb355e4981e146d2c7616c21fef71"}, diff --git a/pyproject.toml b/pyproject.toml index 2791ff88..dc2b5ad0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ click = ">=8.0.3" pysam = ">=0.19.0" cyvcf2 = ">=0.30.14" matplotlib 
= ">=3.5.1" -Pgenlib = {version = ">=0.81.3", optional = true, extras = ["files"]} +Pgenlib = {version = ">=0.90.1", optional = true, extras = ["files"]} # docs # these belong in dev-dependencies, but RTD doesn't support that yet -- see From 5b24aa9236a42f24a17b277ac260ddf8b476fc38 Mon Sep 17 00:00:00 2001 From: Gonzalo <138239951+gonzalogc1@users.noreply.github.com> Date: Mon, 24 Jul 2023 17:09:52 -0600 Subject: [PATCH 02/40] Here it's the test_write_multiallelic() --- tests/test_data.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/tests/test_data.py b/tests/test_data.py index 2932d146..3c8c8608 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -5,7 +5,7 @@ import pytest import numpy as np import numpy.lib.recfunctions as rfn - +import pgenlib from haptools.sim_phenotype import Haplotype as HaptoolsHaplotype from haptools.sim_phenotype import Repeat as HaptoolsRepeat from haptools.data import ( @@ -344,7 +344,7 @@ def test_load_genotypes(self): for col in ("chrom", "pos", "id", "alleles"): assert gts.variants[col][i] == expected.variants[col][i] - @pytest.mark.xfail(reason="not implemented yet") + def test_load_genotypes_multiallelic(self): expected = self._get_fake_genotypes_multiallelic() @@ -531,6 +531,33 @@ def test_write_genotypes_unphased(self): fname.unlink() + def test_write_multiallelic(self): + # Create fake multi-allelic genotype data to write to the PLINK file + gts_multiallelic = self._get_fake_genotypes_multiallelic() + # Name of the PLINK file where the data will be written + fname = DATADIR / "test_write_multiallelic.pgen" + + # Write multiallelic genotype data to the PLINK file + gts_multiallelic.fname = fname + gts_multiallelic.write() + + #Read the data from the newly created PLINK file + new_gts = GenotypesPLINK(fname) + new_gts.read() + new_gts.check_phase() + + # Verify tqhat the uploaded data matches the original data + np.testing.assert_allclose(gts_multiallelic.data, new_gts.data) + assert gts_multiallelic.samples == new_gts.samples + for i in range(len(new_gts.variants)): + for col in ("chrom", "pos", "id", "alleles"): + assert gts_multiallelic.variants[col][i] == new_gts.variants[col][i] + + #clean the files created after the test + fname.with_suffix(".psam").unlink() + fname.with_suffix(".pvar").unlink() + fname.unlink() + class TestPhenotypes: def _get_expected_phenotypes(self): # create a phen matrix with shape: samples x phenotypes From d0fb1060be1b35b99fc7b8c690115540965b7841 Mon Sep 17 00:00:00 2001 From: Gonzalo <138239951+gonzalogc1@users.noreply.github.com> Date: Wed, 26 Jul 2023 10:50:04 -0600 Subject: [PATCH 03/40] I add multiallelic variants support for genotypePLINK --- haptools/data/genotypes.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index f8ea318d..97764e66 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1074,12 +1074,16 @@ def _variant_arr( npt.NDArray A row from the :py:attr:`~.GenotypesPLINK.variants` array """ + # Parse the REF and ALT alleles from the PVAR record + ref_allele = record[cid["REF"]] + alt_alleles = record[cid["ALT"]].split(",") + alleles = [ref_allele] + alt_alleles return np.array( ( record[cid["ID"]], record[cid["CHROM"]], record[cid["POS"]], - (record[cid["REF"]], record[cid["ALT"]]), + tuple(alleles), ), dtype=self.variants.dtype, ) @@ -1240,10 +1244,12 @@ def read( """ super(Genotypes, self).read() import pgenlib - + sample_idxs = 
+        pv = pgenlib.PvarReader(bytes(str(self.fname.with_suffix(".pvar")), "utf8"))
+
         with pgenlib.PgenReader(
-            bytes(str(self.fname), "utf8"), sample_subset=sample_idxs
+            bytes(str(self.fname), "utf8"), sample_subset=sample_idxs, pvar=pv
         ) as pgen:
             # how many variants to load?
             if variants is not None:
@@ -1465,10 +1471,13 @@ def write(self):
             chunks = len(self.variants)

         # write the pgen file
+        pv = pgenlib.PvarReader(bytes(str(self.fname.with_suffix(".pvar")), "utf8"))
         with pgenlib.PgenWriter(
             filename=bytes(str(self.fname), "utf8"),
             sample_ct=len(self.samples),
             variant_ct=len(self.variants),
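+            # pgenlib only reserves space for two alleles per variant by
+            # default, so raise the limit to the largest count in the .pvar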
+            allele_ct_limit=pv.get_max_allele_ct(),
             nonref_flags=False,
             hardcall_phase_present=True,
         ) as pgen:

From b8cbffca70fcb2738e78b96be756d208fd2676b9 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Wed, 26 Jul 2023 09:55:14 -0700
Subject: [PATCH 04/40] create skeleton for GenotypesPLINKTR class

---
 haptools/data/__init__.py  | 2 +-
 haptools/data/genotypes.py | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/haptools/data/__init__.py b/haptools/data/__init__.py
index a0561484..65a78c6a 100644
--- a/haptools/data/__init__.py
+++ b/haptools/data/__init__.py
@@ -3,4 +3,4 @@
 from .covariates import Covariates
 from .breakpoints import Breakpoints, HapBlock
 from .haplotypes import Extra, Repeat, Variant, Haplotype, Haplotypes
-from .genotypes import Genotypes, GenotypesVCF, GenotypesTR, GenotypesPLINK
+from .genotypes import Genotypes, GenotypesVCF, GenotypesTR, GenotypesPLINK, GenotypesPLINKTR
diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index 97764e66..2f937fe1 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -908,7 +908,6 @@ def _return_data(self, variant: trh.TRRecord):
             self.log.warning(
                 "The current variant in the VCF only has one allele per sample."
             )
-            data = []
             # Only one GT so phase will always be 0
             zeros = np.zeros((gts.shape[0], 1))
             missing = -1 * np.ones((gts.shape[0],))
@@ -1515,3 +1514,8 @@ def write(self):
         del subset_data
         del missing
         gc.collect()
+
+
+class GenotypesPLINKTR(GenotypesPLINK):
+    # TODO: implement this class
+    pass

From a4a9584d83e6c94e38a2eefe0a53ba38ea8438e6 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Wed, 26 Jul 2023 09:56:00 -0700
Subject: [PATCH 05/40] add plink2 TR test files

---
 tests/data/simple_tr.pgen | Bin 0 -> 53 bytes
 tests/data/simple_tr.psam |   6 ++++++
 tests/data/simple_tr.pvar |  16 ++++++++++++++++
 3 files changed, 22 insertions(+)
 create mode 100644 tests/data/simple_tr.pgen
 create mode 100644 tests/data/simple_tr.psam
 create mode 100644 tests/data/simple_tr.pvar

diff --git a/tests/data/simple_tr.pgen b/tests/data/simple_tr.pgen
new file mode 100644
index 0000000000000000000000000000000000000000..317aa19f140151892387be304e15acaef19a2965
GIT binary patch
literal 53
zcmd0i7GPyyU;tqkIUoT75&|p|44ll2984>j`1uu`Q_C5a0)^)>GcYnT`!k(m1^{(1
B1=Rom

literal 0
HcmV?d00001

diff --git a/tests/data/simple_tr.psam b/tests/data/simple_tr.psam
new file mode 100644
index 00000000..62f619e0
--- /dev/null
+++ b/tests/data/simple_tr.psam
@@ -0,0 +1,6 @@
+#IID	SEX
+HG00096	NA
+HG00097	NA
+HG00099	NA
+HG00100	NA
+HG00101	NA
diff --git a/tests/data/simple_tr.pvar b/tests/data/simple_tr.pvar
new file mode 100644
index 00000000..416b02e7
--- /dev/null
+++ b/tests/data/simple_tr.pvar
@@ -0,0 +1,16 @@
+##fileformat=VCFv4.3
+##fileDate=20230724
+##source=PLINKv2.00
+##command=HipSTR-v0.7 --test
+##filedate=20180225
+##INFO=<ID=START,Number=1,Type=Integer,Description="Start of the repeat region">
+##INFO=<ID=END,Number=1,Type=Integer,Description="End of the repeat region">
+##INFO=<ID=PERIOD,Number=1,Type=Integer,Description="Length of the repeat unit">
+##contig=<ID=1>
+##contig=<ID=X>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+1	10114	1:10114:GTT	GTT	GTTGTT,GTTGTTGTT,GTTGTTGTTGTT,GTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTTGTTGTT	.	.	START=10114;END=10999;PERIOD=3
+1	11000	1:10116:ACACAC	ACACAC	AC	.	.	START=11000;END=11016;PERIOD=2
+1	12117	1:10117:AAA	AAA	AAAA	.	.	START=12117;END=12143;PERIOD=1
+1	13122	1:10122:GTGT	GTGT	GTGTGT,GTGTGTGT,GTGTGTGTGTGTGT,GTGTGTGTGTGTGTGTGTGTGT	.	.	START=13122;END=13150;PERIOD=2
+X	20122	1:20122:GGG	GGG	GGGGG	.	.	
START=20122;END=20150;PERIOD=1 From 5f7ab286eb7c191daddb986e0bbd45a87454255d Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 26 Jul 2023 09:57:32 -0700 Subject: [PATCH 06/40] clean up tests in test_data.py --- tests/test_data.py | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/tests/test_data.py b/tests/test_data.py index 3c8c8608..352668c4 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -530,7 +530,6 @@ def test_write_genotypes_unphased(self): fname.with_suffix(".pvar").unlink() fname.unlink() - def test_write_multiallelic(self): # Create fake multi-allelic genotype data to write to the PLINK file gts_multiallelic = self._get_fake_genotypes_multiallelic() @@ -1655,14 +1654,13 @@ def test_read_tr(self): [[5, 255, 0], [255, 255, 0], [3, 255, 0], [255, 255, 0], [255, 255, 0]], ], dtype=np.uint8, - ) + ).transpose((1, 0, 2)) + gts = GenotypesTR(DATADIR / "simple_tr.vcf") gts.read() # check genotypes - for i, variants in enumerate(expected_alleles): - for j, sample_var in enumerate(variants): - assert tuple(sample_var) == tuple(gts.data[j, i]) + np.testing.assert_allclose(expected_alleles, gts.data) class TestBreakpoints: @@ -1822,25 +1820,6 @@ def test_breakpoints_to_pop_array_chrom_no_match(self): [("chr1", 59423086), ("1", 59423090), ("1", 239403770), ("2", 229668150)], dtype=[("chrom", "U10"), ("pos", np.uint32)], ) - expected_pop_arr = np.array( - [ - [ - [0, 0], - [1, 0], - [1, 0], - [0, 1], - ], - [ - [1, 1], - [0, 1], - [0, 1], - [1, 0], - ], - ], - dtype=np.uint8, - ) - labels = {"YRI": 0, "CEU": 1} - labels_map = np.vectorize({v: k for k, v in labels.items()}.get) expected = self._get_expected_breakpoints() From f07751ac70891c7aca4ebfbfb6098728d1358aec Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 26 Jul 2023 09:58:16 -0700 Subject: [PATCH 07/40] add TestGenotypesPLINKTR._fake_genotypes_multiallelic() dummy method --- tests/test_data.py | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/tests/test_data.py b/tests/test_data.py index 352668c4..9ece725a 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -21,8 +21,9 @@ Breakpoints, GenotypesTR, GenotypesVCF, - GenotypesPLINK, GenotypesTR, + GenotypesPLINK, + GenotypesPLINKTR, ) @@ -557,6 +558,43 @@ def test_write_multiallelic(self): fname.with_suffix(".pvar").unlink() fname.unlink() + +class TestGenotypesPLINKTR: + def _get_fake_genotypes_multiallelic(self): + # define the code for an empty allele + empty = np.iinfo(np.uint8).max + + # fill out the class with the same variants and samples as GenotypesPLINK + gts_plink = TestGenotypesPLINK()._get_fake_genotypes_multiallelic() + gts = GenotypesPLINKTR(gts_plink.fname) + gts.variants = gts_plink.variants + gts.samples = gts_plink.samples + + # fill out the data array with the repeat unit counts + data = np.array( + [ + [[1, 2], [3, 4], [5, 6], [7, 8], [9, 9]], + [[3, 1], [3, 1], [1, 1], [1, 1], [3, 3]], + [[3, 3], [3, 3], [3, 3], [3, 3], [3, 3]], + [[11, 11], [empty, empty], [7, 2], [3, 4], [11, empty]], + [[5, empty], [empty, empty], [3, empty], [empty, empty], [empty, empty]], + ], + dtype=np.uint8, + ) + + # append the phasing info, transpose, and clean up + phasing = np.ones(data.shape[:2] + (1,)).astype(np.uint8) + gts.data = np.concatenate((data, phasing), axis=-1).transpose((1, 0, 2)) + + # handle half-calls and chrX according to the flag: + # --vcf-half-call m 
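+        # (assumed plink2 behavior: with "--vcf-half-call m", a half-call like
+        # "4|." is stored as fully missing, and each haploid chrX genotype is
+        # stored as a homozygous diploid call)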
+        gts.data[4, 3, 0] = empty
+        gts.data[:, 4, 1] = gts.data[:, 4, 0]
+        # TODO: figure out whether this is the correct way to handle half-calls and try
+        # to see if we can change chrX representations to properly simulate from it
+
+        return gts
+
+
 class TestPhenotypes:
     def _get_expected_phenotypes(self):
         # create a phen matrix with shape: samples x phenotypes

From 449c5543435d1b413d9cdf58e1ca47456e29cf4f Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Wed, 26 Jul 2023 17:00:57 -0700
Subject: [PATCH 08/40] create new simple-tr-valid VCF and PGEN files

also, add test for reading TRs in TestGenotypesPLINK
---
 .../{simple_tr.pgen => simple-tr-valid.pgen}  | Bin
 .../{simple_tr.psam => simple-tr-valid.psam}  |   0
 .../{simple_tr.pvar => simple-tr-valid.pvar}  |   0
 tests/data/simple-tr-valid.vcf                |  15 +++
 tests/test_data.py                            |  61 +++++++++++++++++-
 5 files changed, 75 insertions(+), 1 deletion(-)
 rename tests/data/{simple_tr.pgen => simple-tr-valid.pgen} (100%)
 rename tests/data/{simple_tr.psam => simple-tr-valid.psam} (100%)
 rename tests/data/{simple_tr.pvar => simple-tr-valid.pvar} (100%)
 create mode 100644 tests/data/simple-tr-valid.vcf

diff --git a/tests/data/simple_tr.pgen b/tests/data/simple-tr-valid.pgen
similarity index 100%
rename from tests/data/simple_tr.pgen
rename to tests/data/simple-tr-valid.pgen
diff --git a/tests/data/simple_tr.psam b/tests/data/simple-tr-valid.psam
similarity index 100%
rename from tests/data/simple_tr.psam
rename to tests/data/simple-tr-valid.psam
diff --git a/tests/data/simple_tr.pvar b/tests/data/simple-tr-valid.pvar
similarity index 100%
rename from tests/data/simple_tr.pvar
rename to tests/data/simple-tr-valid.pvar
diff --git a/tests/data/simple-tr-valid.vcf b/tests/data/simple-tr-valid.vcf
new file mode 100644
index 00000000..396aaf08
--- /dev/null
+++ b/tests/data/simple-tr-valid.vcf
@@ -0,0 +1,15 @@
+##fileformat=VCFv4.2
+##command=HipSTR-v0.7 --test
+##filedate=20180225
+##INFO=<ID=START,Number=1,Type=Integer,Description="Start of the repeat region">
+##INFO=<ID=END,Number=1,Type=Integer,Description="End of the repeat region">
+##INFO=<ID=PERIOD,Number=1,Type=Integer,Description="Length of the repeat unit">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##contig=<ID=1>
+##contig=<ID=X>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	HG00096	HG00097	HG00099	HG00100	HG00101
+1	10114	1:10114:GTT	GTT	GTTGTT,GTTGTTGTT,GTTGTTGTTGTT,GTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTTGTTGTT	.	.	START=10114;END=10999;PERIOD=3	GT	0|1	2|3	4|5	6|7	8|8
+1	11000	1:10116:ACACAC	ACACAC	AC	.	.	START=11000;END=11016;PERIOD=2	GT	0|1	0|1	1|1	1|1	0|0
+1	12117	1:10117:AAA	AAA	AAAA	.	.	START=12117;END=12143;PERIOD=1	GT	0|0	0|0	0|0	0|0	0|0
+1	13122	1:10122:GTGT	GTGT	GTGTGT,GTGTGTGT,GTGTGTGTGTGTGT,GTGTGTGTGTGTGTGTGTGTGT	.	.	START=13122;END=13150;PERIOD=2	GT	4|4	.	3|0	1|2	4|.
+X	20122	1:20122:GGG	GGG	GGGGG	.	.	START=20122;END=20150;PERIOD=1	GT	1	.	0	.	.
diff --git a/tests/test_data.py b/tests/test_data.py index 9ece725a..191d1129 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -331,6 +331,53 @@ def _get_fake_genotypes_multiallelic(self): gts.variants = gts_ref_alt.variants return gts + def _get_fake_genotypes_multiallelic_tr(self): + pgenlib = pytest.importorskip("pgenlib") + + gts_tr = GenotypesTR(DATADIR / "simple_tr.vcf") + gts_tr.read() + + gts = GenotypesPLINK(fname="") + gts.samples = ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101") + + na = np.iinfo(np.uint8).max + data = np.array( + [ + [[0, 1], [2, 3], [4, 5], [6, 7], [8, 8]], + [[0, 1], [0, 1], [1, 1], [1, 1], [0, 0]], + [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]], + [[4, 4], [na, na], [3, 0], [1, 2], [4, na]], + [[1, na], [na, na], [0, na], [na, na], [na, na]], + ], + dtype=np.uint8, + ) + # append the phasing info, transpose, and clean up + phasing = np.ones(data.shape[:2] + (1,)).astype(np.uint8) + phasing[4, [1, 3, 4]] = 0 + phasing[3, [1, 4]] = 0 + gts.data = np.concatenate((data, phasing), axis=-1).transpose((1, 0, 2)) + # handle half-calls and chrX according to the flag: + # --vcf-half-call m + gts.data[4, 3, 0] = na + gts.data[:, 4, 1] = gts.data[:, 4, 0] + # TODO: figure out whether this is the correct way to handle half-calls and try + # to see if we can change chrX representations to properly simulate from it + + # add the appropriate alleles + ref_alt = [ + tuple("GTT" * i for i in range(1, 10)), + ("ACACAC", "AC"), + ("AAA", "AAAA"), + ("GTGT", "GTGTGT", "GTGTGTGT", "GTGTGTGTGTGTGT", "GTGTGTGTGTGTGTGTGTGTGT"), + ("GGG", "GGGGG"), + ] + gts.variants = np.array( + [tuple(rec) + (ref_alt[idx],) for idx, rec in enumerate(gts_tr.variants)], + dtype=gts.variants.dtype, + ) + + return gts + def test_load_genotypes(self): expected = self._get_fake_genotypes_plink() @@ -345,7 +392,6 @@ def test_load_genotypes(self): for col in ("chrom", "pos", "id", "alleles"): assert gts.variants[col][i] == expected.variants[col][i] - def test_load_genotypes_multiallelic(self): expected = self._get_fake_genotypes_multiallelic() @@ -360,6 +406,19 @@ def test_load_genotypes_multiallelic(self): for col in ("chrom", "pos", "id", "alleles"): assert gts.variants[col][i] == expected.variants[col][i] + def test_load_genotypes_multiallelic_tr(self): + expected = self._get_fake_genotypes_multiallelic_tr() + + gts = GenotypesPLINK(DATADIR / "simple-tr-valid.pgen") + gts.read() + + # check that everything matches what we expected + np.testing.assert_array_equal(gts.data, expected.data) + assert gts.samples == expected.samples + for i, x in enumerate(expected.variants): + for col in ("chrom", "pos", "id", "alleles"): + assert gts.variants[col][i] == expected.variants[col][i] + def test_load_genotypes_chunked(self): expected = self._get_fake_genotypes_plink() From fd6b31873be2b823e19bb745008b83161e32e995 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 26 Jul 2023 17:02:19 -0700 Subject: [PATCH 09/40] leverage new dummy method in TestGenotypesPLINK --- haptools/data/__init__.py | 8 +++++++- tests/test_data.py | 13 +++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/haptools/data/__init__.py b/haptools/data/__init__.py index 65a78c6a..a5d2c2f4 100644 --- a/haptools/data/__init__.py +++ b/haptools/data/__init__.py @@ -3,4 +3,10 @@ from .covariates import Covariates from .breakpoints import Breakpoints, HapBlock from .haplotypes import Extra, Repeat, Variant, Haplotype, Haplotypes -from .genotypes import Genotypes, 
GenotypesVCF, GenotypesTR, GenotypesPLINK, GenotypesPLINKTR
+from .genotypes import (
+    Genotypes,
+    GenotypesVCF,
+    GenotypesTR,
+    GenotypesPLINK,
+    GenotypesPLINKTR,
+)
diff --git a/tests/test_data.py b/tests/test_data.py
index 191d1129..959faaec 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -621,10 +621,10 @@ def test_write_multiallelic(self):
 class TestGenotypesPLINKTR:
     def _get_fake_genotypes_multiallelic(self):
         # define the code for an empty allele
-        empty = np.iinfo(np.uint8).max
+        na = np.iinfo(np.uint8).max

         # fill out the class with the same variants and samples as GenotypesPLINK
-        gts_plink = TestGenotypesPLINK()._get_fake_genotypes_multiallelic()
+        gts_plink = TestGenotypesPLINK()._get_fake_genotypes_multiallelic_tr()
         gts = GenotypesPLINKTR(gts_plink.fname)
         gts.variants = gts_plink.variants
         gts.samples = gts_plink.samples
@@ -635,25 +635,26 @@ def _get_fake_genotypes_multiallelic(self):
        # fill out the data array with the repeat unit counts
        data = np.array(
            [
                [[1, 2], [3, 4], [5, 6], [7, 8], [9, 9]],
                [[3, 1], [3, 1], [1, 1], [1, 1], [3, 3]],
                [[3, 3], [3, 3], [3, 3], [3, 3], [3, 3]],
-                [[11, 11], [empty, empty], [7, 2], [3, 4], [11, empty]],
-                [[5, empty], [empty, empty], [3, empty], [empty, empty], [empty, empty]],
+                [[11, 11], [na, na], [7, 2], [3, 4], [11, na]],
+                [[5, na], [na, na], [3, na], [na, na], [na, na]],
            ],
            dtype=np.uint8,
        )

        # append the phasing info, transpose, and clean up
-        phasing = np.ones(data.shape[:2] + (1,)).astype(np.uint8)
+        phasing = gts_plink.data[:, :, 2]
        gts.data = np.concatenate((data, phasing), axis=-1).transpose((1, 0, 2))

        # handle half-calls and chrX according to the flag:
        # --vcf-half-call m
-        gts.data[4, 3, 0] = empty
+        gts.data[4, 3, 0] = na
        gts.data[:, 4, 1] = gts.data[:, 4, 0]
        # TODO: figure out whether this is the correct way to handle half-calls and try
        # to see if we can change chrX representations to properly simulate from it

        return gts
+

From 7f1db2d95be71e0449794929da69ed2cd93723f0 Mon Sep 17 00:00:00 2001
From: Gonzalo <138239951+gonzalogc1@users.noreply.github.com>
Date: Fri, 28 Jul 2023 17:33:20 -0600
Subject: [PATCH 10/40] Add test_iter() and test_read_plinktr() to
 TestGenotypesPLINKTR in tests/test_data.py, and add __init__() to
 GenotypesPLINKTR in haptools/data/genotypes.py

---
 haptools/data/genotypes.py |  6 ++++++
 tests/test_data.py         | 21 +++++++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index 2f937fe1..79f3db79 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -1517,5 +1517,11 @@ def write(self):


 class GenotypesPLINKTR(GenotypesPLINK):
+    def __init__(self, fname: Path | str, log: Logger = None, chunk_size: int = None, vcftype: str = "auto"):
+        super().__init__(fname, log, chunk_size)
+        self.vcftype = vcftype
+
+
+
     # TODO: implement this class
     pass
diff --git a/tests/test_data.py b/tests/test_data.py
index 959faaec..d05d8ba8 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -653,6 +653,27 @@ def _get_fake_genotypes_multiallelic(self):
         # to see if we can change chrX representations to properly simulate from it

         return gts
+
+    def test_iter(self):
+        # Get the expected data
+        expected = self._get_fake_genotypes_multiallelic()
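+        # note: iterating over a Genotypes object streams variants lazily from
+        # the file, so there is no need to call read() before looping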
From 7f1db2d95be71e0449794929da69ed2cd93723f0 Mon Sep 17 00:00:00 2001
From: Gonzalo <138239951+gonzalogc1@users.noreply.github.com>
Date: Fri, 28 Jul 2023 17:33:20 -0600
Subject: [PATCH 10/40] add test_iter and test_read_plinktr to the
 TestGenotypesPLINKTR class in tests/test_data.py and add an __init__ method
 to the GenotypesPLINKTR class in haptools/data/genotypes.py

---
 haptools/data/genotypes.py |  6 ++++++
 tests/test_data.py         | 21 +++++++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index 2f937fe1..79f3db79 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -1517,5 +1517,11 @@

 class GenotypesPLINKTR(GenotypesPLINK):
+    def __init__(self, fname: Path | str, log: Logger = None, chunk_size: int = None , vcftype: str = "auto"):
+        super().__init__(fname, log, chunk_size)
+        self.vcftype = vcftype
+
+
+
     # TODO: implement this class
     pass
diff --git a/tests/test_data.py b/tests/test_data.py
index 959faaec..d05d8ba8 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -653,6 +653,27 @@ def _get_fake_genotypes_multiallelic(self):
         # to see if we can change chrX representations to properly simulate from it

         return gts
+
+    def test_iter(self):
+        # Get the expected data
+        expected = self. _get_fake_genotypes_multiallelic()
+        gts = GenotypesPLINKTR(DATADIR / "simple-tr-valid.pgen")
+        # Check that everything matches what we expected
+        for idx, line in enumerate(gts):
+            np.testing.assert_allclose(line.data[:, :3], expected.data[:, idx])
+            for col in ("chrom", "pos", "id"):
+                assert line.variants[col] == expected.variants[col][idx]
+            assert line.variants["alleles"].tolist() == expected.variants["alleles"][idx]
+
+        # Check samples
+        assert gts.samples == expected.samples
+
+    def test_read_plinktr(self):
+        expected_alleles =self._get_fake_genotypes_multiallelic().data
+        gts = GenotypesPLINKTR(DATADIR / "simple-tr-valid.vcf")
+        gts.read()
+        # check genotypes
+        np.testing.assert_allclose(expected_alleles, gts.data)

From 5b3edfbeef1b80c1555b78463ab4aea4f89887ba Mon Sep 17 00:00:00 2001
From: Gonzalo <138239951+gonzalogc1@users.noreply.github.com>
Date: Thu, 3 Aug 2023 11:09:11 -0600
Subject: [PATCH 11/40] Added the load() method in the GenotypesPLINKTR class

---
 haptools/data/genotypes.py | 40 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index 79f3db79..22dc5192 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -1520,7 +1520,45 @@ class GenotypesPLINKTR(GenotypesPLINK):
     def __init__(self, fname: Path | str, log: Logger = None, chunk_size: int = None , vcftype: str = "auto"):
         super().__init__(fname, log, chunk_size)
         self.vcftype = vcftype
-
+
+    @classmethod
+    def load(
+        cls: GenotypesPLINKTR,
+        fname: Path | str,
+        region: str = None,
+        samples: list[str] = None,
+        variants: set[str] = None,
+        vcftype: str = "auto",
+    ) -> Genotypes:
+        """
+        Load STR genotypes from a PGEN file
+
+        Read the file contents, check the genotype phase, and create the MAC matrix
+
+        Parameters
+        ----------
+        fname
+            See documentation for :py:attr:`~.Data.fname`
+        region : str, optional
+            See documentation for :py:meth:`~.Genotypes.read`
+        samples : list[str], optional
+            See documentation for :py:meth:`~.Genotypes.read`
+        variants : set[str], optional
+            See documentation for :py:meth:`~.Genotypes.read`
+        vcftype : str, optional
+            The type of TR VCF currently being read. Options are:
+            {'auto', 'gangstr', 'advntr', 'hipstr', 'eh', 'popstr'}
+
+
+        Returns
+        -------
+        Genotypes
+            A Genotypes object with the data loaded into its properties
+        """
+        genotypes = cls(fname, log=None, chunk_size=None, vcftype=vcftype)
+        genotypes.read(region, samples, variants)
+        genotypes.check_phase()
+        return genotypes

     # TODO: implement this class
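The load() classmethod above is a thin convenience wrapper; the sketch below shows the intended call pattern next to its manual equivalent. The path and vcftype value are illustrative, borrowed from the test data used elsewhere in this series.

```python
from haptools.data import GenotypesPLINKTR

# one-step convenience call, as defined in the patch above
gts = GenotypesPLINKTR.load("simple-tr-valid.pgen", vcftype="hipstr")

# ...which is equivalent to the manual three-step sequence
gts = GenotypesPLINKTR("simple-tr-valid.pgen", vcftype="hipstr")
gts.read()         # fills gts.data, gts.samples, and gts.variants
gts.check_phase()  # verifies phasing and drops the phase column
```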
From 65d8bcfd3a2b2ca9a26361fee522b56a5c6d663f Mon Sep 17 00:00:00 2001
From: Gonzalo <138239951+gonzalogc1@users.noreply.github.com>
Date: Mon, 7 Aug 2023 16:13:14 -0600
Subject: [PATCH 12/40] Add the following methods: read(), _iterate(), write(),
 write_variants(), check_biallelic(), and check_maf() for GenotypesPLINKTR

---
 haptools/data/genotypes.py | 103 ++++++++++++++++++++++++++++++++++++-
 tests/test_data.py         |   5 +-
 2 files changed, 104 insertions(+), 4 deletions(-)

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index 22dc5192..c69c9a79 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 import re
 import gc
+import importlib.util
 from csv import reader
 from pathlib import Path
 from typing import Iterator
@@ -1520,6 +1521,22 @@ class GenotypesPLINKTR(GenotypesPLINK):
     def __init__(self, fname: Path | str, log: Logger = None, chunk_size: int = None , vcftype: str = "auto"):
         super().__init__(fname, log, chunk_size)
         self.vcftype = vcftype
+        self._import_copy_trh_module()
+
+    def _import_copy_trh_module(self):
+        """
+        Import a copy of the tr_harmonizer module into the "trh" property of this class
+        Subsequent edits to the module will remain scoped to the class, only
+
+        This code is adapted from https://stackoverflow.com/a/11285504/16815703
+        """
+        try:
+            SPEC = importlib.util.find_spec("trtools.utils.tr_harmonizer")
+        except ModuleNotFoundError:
+            SPEC = trh.__spec__
+        self.trh = importlib.util.module_from_spec(SPEC)
+        SPEC.loader.exec_module(self.trh)
+
     @classmethod
     def load(
@@ -1559,7 +1576,89 @@ def load(
         genotypes = cls(fname, log=None, chunk_size=None, vcftype=vcftype)
         genotypes.read(region, samples, variants)
         genotypes.check_phase()
         return genotypes
+
+
+    def _iter_TRRecords(self,region: str = None):
+        gts_obj = self
+        def GetGenotypeIndicies(self):
+            n = np.argmax(self.vcfrecord.ID == gts_obj.variants["id"])
+            return gts_obj.data[:, n]
+
+        self.trh.TRRecord.GetGenotypeIndicies = GetGenotypeIndicies
+        vcf = VCF(self.fname.with_suffix(".pvar"))
+        yield from self.trh.TRRecordHarmonizer(
+            vcffile = vcf, vcfiter= vcf(region), region =region , vcftype =self.vcftype
+        )
+
+
+    def read(
+        self,
+        region: str = None,
+        samples: list[str] = None,
+        variants: set[str] = None,
+        max_variants: int = None,
+    ):
+        """
+        Read genotypes from a PGEN file into a numpy matrix stored in
+        :py:attr:`~.GenotypesPLINK.data`
+
+        Parameters
+        ----------
+        region : str, optional
+            See documentation for :py:attr:`~.GenotypesVCF.read`
+        samples : list[str], optional
+            See documentation for :py:attr:`~.GenotypesVCF.read`
+        variants : set[str], optional
+            See documentation for :py:attr:`~.GenotypesVCF.read`
+        max_variants : int, optional
+            See documentation for :py:attr:`~.GenotypesVCF.read`
+        """
+        super().read(region,samples,variants,max_variants)
+        for idx, record in enumerate(self._iter_TRRecords(region)):
+            self.data[:, idx] = record.GetLengthGenotypes()
+
+    def _iterate(
+        self,
+        pgen: pgenlib.PgenReader,
+        region: str = None,
+        variants: set[str] = None,
+    ):
+        """
+        A generator over the lines of a PGEN-PVAR file pair
+
+        This is a helper function for :py:meth:`~.GenotypesPLINK.__iter__`
+
+        Parameters
+        ----------
pgen: pgenlib.PgenReader + The pgenlib.PgenReader object from which to fetch variant records + region : str, optional + See documentation for :py:meth:`~.Genotypes.read` + variants : set[str], optional + See documentation for :py:meth:`~.Genotypes.read` + + Yields + ------ + Iterator[namedtuple] + An iterator over each line in the file, where each line is encoded as a + namedtuple containing each of the class properties + """ + variants = super()._iterate(pgen,region,variants) + for variant, record in zip(variants, self._iter_TRRecords(region)): + variant.data = record.GetLengthGenotypes() + yield variant + + def write(self): + raise NotImplementedError + + def write_variants(self): + raise NotImplementedError + + def check_biallelic(self): + raise NotImplementedError + + def check_maf(self): + raise NotImplementedError - # TODO: implement this class - pass diff --git a/tests/test_data.py b/tests/test_data.py index d05d8ba8..66c1c436 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -642,7 +642,7 @@ def _get_fake_genotypes_multiallelic(self): ) # append the phasing info, transpose, and clean up - phasing = gts_plink.data[:, :, 2] + phasing = gts_plink.data[:, :, 2][:, :, np.newaxis] gts.data = np.concatenate((data, phasing), axis=-1).transpose((1, 0, 2)) # handle half-calls and chrX according to the flag: @@ -670,8 +670,9 @@ def test_iter(self): def test_read_plinktr(self): expected_alleles =self._get_fake_genotypes_multiallelic().data - gts = GenotypesPLINKTR(DATADIR / "simple-tr-valid.vcf") + gts = GenotypesPLINKTR(DATADIR / "simple-tr-valid.pgen") gts.read() + breakpoint() # check genotypes np.testing.assert_allclose(expected_alleles, gts.data) From f444ab231bcbc3d34a59ce76c11432d7b84a750d Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 7 Aug 2023 17:28:48 -0700 Subject: [PATCH 13/40] handle missing and unphased genotypes in plinkgenotypestr --- haptools/data/genotypes.py | 5 +++-- tests/test_data.py | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index c69c9a79..fb863920 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1583,8 +1583,9 @@ def _iter_TRRecords(self,region: str = None): gts_obj = self def GetGenotypeIndicies(self): n = np.argmax(self.vcfrecord.ID == gts_obj.variants["id"]) - return gts_obj.data[:, n] - + gts_data = gts_obj.data[:, n].astype(np.int8) + gts_data[gts_data == np.iinfo(np.uint8).max] = -1 + return gts_data self.trh.TRRecord.GetGenotypeIndicies = GetGenotypeIndicies vcf = VCF(self.fname.with_suffix(".pvar")) yield from self.trh.TRRecordHarmonizer( diff --git a/tests/test_data.py b/tests/test_data.py index 66c1c436..ecf8e368 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -643,6 +643,8 @@ def _get_fake_genotypes_multiallelic(self): # append the phasing info, transpose, and clean up phasing = gts_plink.data[:, :, 2][:, :, np.newaxis] + phasing[[3,4], 1] = 0 + phasing[1, [3,4]] = 1 gts.data = np.concatenate((data, phasing), axis=-1).transpose((1, 0, 2)) # handle half-calls and chrX according to the flag: From 6b2c0c1f31750f12a939fe0be83b753c25b515b4 Mon Sep 17 00:00:00 2001 From: Gonzalo <138239951+gonzalogc1@users.noreply.github.com> Date: Wed, 9 Aug 2023 13:01:03 -0600 Subject: [PATCH 14/40] Start benchmarking of GenotypesPLINKTR --- tests/bench_genotypes_TR.py | 441 ++++++++++++++++++++++++++++++++++++ tests/test_data.py | 1 - 2 files changed, 441 insertions(+), 1 deletion(-) create mode 
100755 tests/bench_genotypes_TR.py diff --git a/tests/bench_genotypes_TR.py b/tests/bench_genotypes_TR.py new file mode 100755 index 00000000..fc943530 --- /dev/null +++ b/tests/bench_genotypes_TR.py @@ -0,0 +1,441 @@ +#!/usr/bin/env python + +import sys +import pickle +import shutil +from pathlib import Path +from time import process_time + +import click +import matplotlib +import numpy as np +import matplotlib.pyplot as plt + +# Force matplotlib to not use any Xwindows backend. +matplotlib.use("Agg") + +from haptools import logging +from haptools.data import GenotypesVCF, GenotypesPLINK, GenotypesTR, GenotypesPLINKTR +from pysam import VariantFile + +# COMMAND FOR GENERATING UKB PLOT: +# tests/bench_genotypes.py \ +# --default-variants 18472 --default-samples 487409 --intervals-variants 1 80 4 \ +# --intervals-samples 1 80 4 -o plot.png -a results.pickle + +DATADIR = Path(__file__).parent.joinpath("data") + + +def create_variant_files(gts, intervals, num_samps): + samples = gts.samples[:num_samps] + variant_dir = gts.fname / "variant" + variant_dir.mkdir(parents=True, exist_ok=True) + for val in intervals: + variants = tuple(gts.variants["id"][:val]) + sub = gts.subset(samples=samples, variants=variants) + # write PLINK2 files + sub.fname = variant_dir / f"{val}.pgen" + # write VCF files + vcf = GenotypesTR(sub.fname.with_suffix(".vcf")) + vcf.data = sub.data + vcf.samples = sub.samples + vcf.variants = sub.variants + write_TR_VCF(vcf) + return variant_dir + +def write_TR_VCF(gts): + """ + Write the variants in this class to a VCF at :py:attr:`~.GenotypesVCF.fname` + """ + vcf = VariantFile(str(gts.fname), mode="w") + # make sure the header is properly structured + for contig in set(gts.variants["chrom"]): + vcf.header.contigs.add(contig) + vcf.header.add_meta( + "INFO", + items=[ + ("ID", "START"), + ("Number", 1), + ("Type", "Integer"), + ("Description", "Inclusive start coodinate for the repetitive portion of the reference allele"), + ], + ) + vcf.header.add_meta( + "INFO", + items=[ + ("ID", "END"), + ("Number", 1), + ("Type", "Integer"), + ("Description", "Inclusive end coordinate for the repetitive portion of the reference allele"), + ], + ) + vcf.header.add_meta( + "INFO", + items=[ + ("ID", "PERIOD"), + ("Number", 1), + ("Type", "Integer"), + ("Description", "Length of STR motif"), + ], + ) + vcf.header.add_meta( + "FORMAT", + items=[ + ("ID", "GT"), + ("Number", 1), + ("Type", "String"), + ("Description", "Genotype"), + ], + ) + try: + vcf.header.add_samples(gts.samples) + except AttributeError: + gts.log.warning( + "Upgrade to pysam >=0.19.1 to reduce the time required to create " + "VCFs. 
See https://github.com/pysam-developers/pysam/issues/1104" + ) + for sample in gts.samples: + vcf.header.add_sample(sample) + gts.log.info("Writing VCF records") + phased = gts._prephased or (gts.data.shape[2] < 3) + missing_val = np.iinfo(np.uint8).max + for var_idx, var in enumerate(gts.variants): + rec = { + "contig": var["chrom"], + "start": var["pos"], + "stop": var["pos"] + len(var["alleles"][0]) - 1, + "qual": None, + "alleles": var["alleles"], + "id": var["id"], + "filter": None, + "info":{"START":var["pos"] - 1,"END":var["pos"] + len(var["alleles"][0]) - 1,"PERIOD":len(var["alleles"][0])}, + } + # handle pysam increasing the start site by 1 + rec["start"] -= 1 + # parse the record into a pysam.VariantRecord + record = vcf.new_record(**rec) + for samp_idx, sample in enumerate(gts.samples): + record.samples[sample]["GT"] = tuple( + None if val == missing_val else val + for val in gts.data[samp_idx, var_idx, :2] + ) + # add proper phasing info + if phased: + record.samples[sample].phased = True + else: + record.samples[sample].phased = gts.data[samp_idx, var_idx, 2] + # write the record to a file + vcf.write(record) + vcf.close() + + + +def create_sample_files(gts, intervals, num_vars): + variants = tuple(gts.variants["id"][:num_vars]) + sample_dir = gts.fname / "sample" + sample_dir.mkdir(parents=True, exist_ok=True) + for val in intervals: + samples = gts.samples[:val] + sub = gts.subset(samples=samples, variants=variants) + # write PLINK2 files + sub.fname = sample_dir / f"{val}.pgen" + # write VCF files + vcf = GenotypesTR(sub.fname.with_suffix(".vcf")) + vcf.data = sub.data + vcf.samples = sub.samples + vcf.variants = sub.variants + write_TR_VCF(vcf) + return sample_dir + + +def time_vcf(vcf, max_variants, chunk_size=500): + GenotypesVCF(vcf).read(max_variants=max_variants) + + +def time_plink(pgen, max_variants, chunk_size=500): + GenotypesPLINK(pgen).read(max_variants=max_variants) + + +def time_plink_chunk(pgen, max_variants, chunk_size=500): + GenotypesPLINK(pgen, chunk_size=chunk_size).read(max_variants=max_variants) + + +def progressbar(it, prefix="", size=60, out=sys.stdout): # Python3.6+ + count = len(it) + + def show(j): + x = int(size * j / count) + print( + f"{prefix}[{u'â–ˆ'*x}{('.'*(size-x))}] {j}/{count}", + end="\r", + file=out, + flush=True, + ) + + show(0) + for i, item in enumerate(it): + yield item + show(i + 1) + print("\n", flush=True, file=out) + + +@click.command() +@click.option( + "-t", + "--temp", + type=click.Path(path_type=Path, file_okay=False, dir_okay=True), + default=Path("bench_temp_dir"), + show_default=True, + help="A temp directory into which to place generated temporary files", +) +@click.option( + "--default-variants", + type=int, + default=4, + show_default=True, + help="The number of variants to use when we vary the number of samples", +) +@click.option( + "--default-samples", + type=int, + default=5, + show_default=True, + help="The number of samples to use when we vary the number of variants", +) +@click.option( + "--intervals-variants", + type=int, + nargs=3, + default=(1, 4, 1), + show_default=True, + help="The start, end, and step values for the x-axis of the variants plot", +) +@click.option( + "--intervals-samples", + type=int, + nargs=3, + default=(1, 5, 1), + show_default=True, + help="The start, end, and step values for the x-axis of the samples plot", +) +@click.option( + "--reps", + type=int, + default=3, + show_default=True, + help="For each benchmark value, we take the mean of --reps X replicates", +) +@click.option( + "-o", 
+ "--output", + type=click.Path(path_type=Path), + default=Path("bench-plink.png"), + show_default=True, + help="A PNG file containing the results of the benchmark", +) +@click.option( + "-p", + "--progress", + is_flag=True, + default=False, + show_default=True, + help="Whether to display a progress bar. Useful for large benchmarks", +) +@click.option( + "-s", + "--silent", + is_flag=True, + default=False, + show_default=True, + help="Whether to be entirely silent", +) +@click.option( + "-a", + "--archive", + type=click.Path(path_type=Path), + default=None, + show_default="do not save generated results", + help="A python pickle file into which to store results", +) +def main( + temp, + default_variants, + default_samples, + intervals_variants, + intervals_samples, + reps, + output, + progress=False, + silent=False, + archive=None, +): + """ + Benchmarks classes in the data.genotypes module + """ + DEFAULT_VARIANTS, DEFAULT_SAMPLES, INTERVALS_VARIANTS, INTERVALS_SAMPLES, REPS = ( + default_variants, + default_samples, + range(*intervals_variants), + range(*intervals_samples), + reps, + ) + LOG = logging.getLogger("run", 0 if silent else "DEBUG") + gts = GenotypesPLINK(None, log=LOG) + num_variants = max(DEFAULT_VARIANTS, INTERVALS_VARIANTS.stop) + sample_size = max(DEFAULT_SAMPLES, INTERVALS_SAMPLES.stop) + gts.samples = tuple(f"sample{i}" for i in range(sample_size)) + gts.variants = np.array( + [(f"id{i}", "chr0", i, ("A", "T")) for i in range(1, num_variants + 1)], + dtype=gts.variants.dtype, + ) + np.random.seed(12345) + if not temp.exists(): + LOG.info("Generating fake genotypes") + gts.data = np.random.choice( + [True, False], size=sample_size * num_variants * 2 + ).reshape((sample_size, num_variants, 2)) + gts.fname = temp + # set initial variables + SAMPLES = gts.samples + VARIANTS = gts.variants["id"] + DEFAULT_SAMPLES = min(DEFAULT_SAMPLES, len(SAMPLES)) + DEFAULT_VARIANTS = min(DEFAULT_VARIANTS, len(VARIANTS)) + if INTERVALS_VARIANTS.stop > len(VARIANTS): + INTERVALS_VARIANTS = range( + INTERVALS_VARIANTS.start, len(VARIANTS), INTERVALS_VARIANTS.step + ) + INTERVALS_VARIANTS = list(INTERVALS_VARIANTS) + if INTERVALS_SAMPLES.stop > len(SAMPLES): + INTERVALS_SAMPLES = range( + INTERVALS_SAMPLES.start, len(SAMPLES), INTERVALS_SAMPLES.step + ) + INTERVALS_SAMPLES = list(INTERVALS_SAMPLES) + FILE_TYPES = { + "vcf": "VCF", + "pgen": "PLINK2", + "chunked200": "PLINK2 chunk_size: 200", + "chunked400": "PLINK2 chunk_size: 400", + "chunked600": "PLINK2 chunk_size: 600", + "chunked800": "PLINK2 chunk_size: 800", + } + + # create the files we will try to load if they haven't been created already + if not gts.fname.exists(): + LOG.info("Creating VCF and PGEN files that we can load") + variant_dir = create_variant_files(gts, INTERVALS_VARIANTS, DEFAULT_SAMPLES) + sample_dir = create_sample_files(gts, INTERVALS_SAMPLES, DEFAULT_VARIANTS) + else: + variant_dir = gts.fname / "variant" + sample_dir = gts.fname / "sample" + LOG.info( + "Temp directory already exists. 
Assuming files have already been created", + ) + + LOG.setLevel(level="ERROR") + + # run each test + LOG.info("Benchmarking the loading of each file") + results = {} + len_variants = len(VARIANTS) + for arg in ("samples", "variants"): + genotype_dir = variant_dir + if arg == "samples": + genotype_dir = sample_dir + results[arg] = {} + intervals = INTERVALS_SAMPLES if arg == "samples" else INTERVALS_VARIANTS + item_iter = ( + progressbar(intervals, prefix=f"{arg}, {file_type}: ", out=sys.stderr) + if progress + else intervals + ) + for file_type in FILE_TYPES.keys(): + results[arg][file_type] = [] + for val in item_iter: + chunk_size = 500 + if file_type == "vcf": + func = time_vcf + file = genotype_dir / f"{val}.vcf" + elif file_type == "pgen" or file_type.startswith("chunked"): + func = time_plink if file_type == "pgen" else time_plink_chunk + file = genotype_dir / f"{val}.pgen" + if file_type.startswith("chunked"): + chunk_size = int(file_type[len("chunked") :]) + else: + continue + times = np.empty(REPS, dtype=np.float64) + for rep in range(REPS): + start = process_time() + func(file, max_variants=len_variants, chunk_size=chunk_size) + end = process_time() + times[rep] = end - start + results[arg][file_type].append(times.mean()) + + # plot the results + LOG.info("Generating plot of results") + fig, (ax_samples, ax_variants) = plt.subplots(1, 2, figsize=(10, 5)) + lab_font = {"fontsize": "xx-small"} + for file_type in FILE_TYPES.keys(): + x_vals = INTERVALS_SAMPLES + y_vals = results["samples"][file_type] + # fit a line to each so that we can report the slope + slope = np.polyfit(x_vals, y_vals, 1)[0] + ax_samples.plot( + x_vals, + y_vals, + marker="o", + label=FILE_TYPES[file_type], + ) + ax_samples.text( + x_vals[-1], + y_vals[-1] + (y_vals[-1] / 16), + f"m = {slope:.3E}", + fontdict=lab_font, + ) + ax_samples.set_xlabel(f"Number of samples\nnum_variants = {DEFAULT_VARIANTS}") + ax_samples.set_ylim(ymin=0) + for file_type in FILE_TYPES.keys(): + x_vals = INTERVALS_VARIANTS + y_vals = results["variants"][file_type] + # fit a line to each so that we can report the slope + slope = np.polyfit(x_vals, y_vals, 1)[0] + ax_variants.plot( + x_vals, + y_vals, + marker="o", + ) + ax_variants.text( + x_vals[-1], + y_vals[-1] + (y_vals[-1] / 16), + f"m = {slope:.3E}", + fontdict=lab_font, + ) + ax_variants.set_xlabel(f"Number of variants\nnum_samples = {DEFAULT_SAMPLES}") + ax_variants.set_ylim(ymin=0) + fig.supylabel("CPU Time (s)") + fig.legend(loc="lower left", fontsize="xx-small") + fig.tight_layout() + fig.savefig(str(output), bbox_inches="tight", dpi=400) + + # saving results + if archive is not None: + with open(archive, "wb") as fh: + pickle.dump(results, fh) + + +def test_bench_genotypes(): + tmp_dir = Path("bench_temp_dir") + tmp_plot = Path("bench-plink.png") + + try: + main(["-t", tmp_dir, "-o", tmp_plot, "--reps", 1, "-s"]) + except SystemExit as err: + # re-raise unless main() finished without an error + if err.code: + raise + + shutil.rmtree(tmp_dir) + tmp_plot.unlink() + + +if __name__ == "__main__": + main() diff --git a/tests/test_data.py b/tests/test_data.py index ecf8e368..326e5791 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -674,7 +674,6 @@ def test_read_plinktr(self): expected_alleles =self._get_fake_genotypes_multiallelic().data gts = GenotypesPLINKTR(DATADIR / "simple-tr-valid.pgen") gts.read() - breakpoint() # check genotypes np.testing.assert_allclose(expected_alleles, gts.data) From 3d98d089ddc7396f31e9433be381fb3271942a29 Mon Sep 17 00:00:00 2001 From: 
Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 10 Aug 2023 14:10:00 -0700 Subject: [PATCH 15/40] add benchmarking of TRs to bench_genotypes.py --- tests/bench_genotypes.py | 220 ++++++++++++++++-- tests/bench_genotypes_TR.py | 441 ------------------------------------ 2 files changed, 200 insertions(+), 461 deletions(-) delete mode 100755 tests/bench_genotypes_TR.py diff --git a/tests/bench_genotypes.py b/tests/bench_genotypes.py index 58dd39b7..1a7e7c07 100755 --- a/tests/bench_genotypes.py +++ b/tests/bench_genotypes.py @@ -3,19 +3,26 @@ import sys import pickle import shutil +import subprocess from pathlib import Path from time import process_time import click import matplotlib import numpy as np +from pysam import VariantFile import matplotlib.pyplot as plt # Force matplotlib to not use any Xwindows backend. matplotlib.use("Agg") from haptools import logging -from haptools.data import GenotypesVCF, GenotypesPLINK +from haptools.data import ( + GenotypesTR, + GenotypesVCF, + GenotypesPLINK, + GenotypesPLINKTR, +) # COMMAND FOR GENERATING UKB PLOT: @@ -26,41 +33,168 @@ DATADIR = Path(__file__).parent.joinpath("data") -def create_variant_files(gts, intervals, num_samps): +class GenotypesVCFTR(GenotypesVCF): + def write(self): + """ + Write the variants in this class to a VCF at :py:attr:`~.GenotypesTR.fname` + + Output the VCF in HipSTR format + """ + vcf = VariantFile(str(self.fname), mode="w") + # make sure the header is properly structured + for contig in set(self.variants["chrom"]): + vcf.header.contigs.add(contig) + d = "Inclusive {} coodinate for the repetitive portion of the reference allele" + vcf.header.add_meta( + "INFO", + items=[ + ("ID", "START"), + ("Number", 1), + ("Type", "Integer"), + ("Description", d.format("start")), + ], + ) + vcf.header.add_meta( + "INFO", + items=[ + ("ID", "END"), + ("Number", 1), + ("Type", "Integer"), + ("Description", d.format("end")), + ], + ) + vcf.header.add_meta( + "INFO", + items=[ + ("ID", "PERIOD"), + ("Number", 1), + ("Type", "Integer"), + ("Description", "Length of STR motif"), + ], + ) + vcf.header.add_meta( + "FORMAT", + items=[ + ("ID", "GT"), + ("Number", 1), + ("Type", "String"), + ("Description", "Genotype"), + ], + ) + try: + vcf.header.add_samples(self.samples) + except AttributeError: + self.log.warning( + "Upgrade to pysam >=0.19.1 to reduce the time required to create " + "VCFs. 
See https://github.com/pysam-developers/pysam/issues/1104" + ) + for sample in self.samples: + vcf.header.add_sample(sample) + self.log.info("Writing VCF records") + phased = self._prephased or (self.data.shape[2] < 3) + missing_val = np.iinfo(np.uint8).max + for var_idx, var in enumerate(self.variants): + rec = { + "contig": var["chrom"], + "start": var["pos"], + "stop": var["pos"] + len(var["alleles"][0]), + "qual": None, + "alleles": var["alleles"], + "id": var["id"], + "filter": None, + } + # handle pysam increasing the start site by 1 + rec["start"] -= 1 + # parse the record into a pysam.VariantRecord + record = vcf.new_record(**rec) + # add INFO flags expected of HipSTR + # Note: this is only possible because we guarantee that the REF allele + # is a single copy of a dinucleotide repeat + record.info["START"] = int(rec["start"]) + record.stop = int(rec["stop"]) + record.info["PERIOD"] = len(var["alleles"][0]) + for samp_idx, sample in enumerate(self.samples): + record.samples[sample]["GT"] = tuple( + None if val == missing_val else val + for val in self.data[samp_idx, var_idx, :2] + ) + # add proper phasing info + if phased: + record.samples[sample].phased = True + else: + record.samples[sample].phased = self.data[samp_idx, var_idx, 2] + # write the record to a file + vcf.write(record) + vcf.close() + + +def write_tr_files(plink2: Path, vcf: Path, pgen: Path): + # first, add "HipSTR" to the header so that we can trick TRTools + subprocess.call(["sed", "-i", "2i##command=HipSTR-v0.7 --test/", vcf]) + subprocess.call( + [ + plink2, + "--vcf-half-call", + "m", + "--make-pgen", + "pvar-cols=vcfheader,qual,filter,info", + "--vcf", + vcf, + "--out", + pgen.with_suffix(""), + ], + stderr=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + ) + + +def create_variant_files(gts, intervals, num_samps, plink2: Path = None): samples = gts.samples[:num_samps] variant_dir = gts.fname / "variant" variant_dir.mkdir(parents=True, exist_ok=True) for val in intervals: variants = tuple(gts.variants["id"][:val]) sub = gts.subset(samples=samples, variants=variants) - # write PLINK2 files sub.fname = variant_dir / f"{val}.pgen" - sub.write() # write VCF files - vcf = GenotypesVCF(sub.fname.with_suffix(".vcf")) + if plink2: + vcf = GenotypesVCFTR(sub.fname.with_suffix(".vcf")) + else: + vcf = GenotypesVCF(sub.fname.with_suffix(".vcf")) vcf.data = sub.data vcf.samples = sub.samples vcf.variants = sub.variants vcf.write() + # write PLINK2 files + if plink2: + write_tr_files(plink2, vcf.fname, sub.fname) + else: + sub.write() return variant_dir -def create_sample_files(gts, intervals, num_vars): +def create_sample_files(gts, intervals, num_vars, plink2: Path = None): variants = tuple(gts.variants["id"][:num_vars]) sample_dir = gts.fname / "sample" sample_dir.mkdir(parents=True, exist_ok=True) for val in intervals: samples = gts.samples[:val] sub = gts.subset(samples=samples, variants=variants) - # write PLINK2 files sub.fname = sample_dir / f"{val}.pgen" - sub.write() # write VCF files - vcf = GenotypesVCF(sub.fname.with_suffix(".vcf")) + if plink2: + vcf = GenotypesVCFTR(sub.fname.with_suffix(".vcf")) + else: + vcf = GenotypesVCF(sub.fname.with_suffix(".vcf")) vcf.data = sub.data vcf.samples = sub.samples vcf.variants = sub.variants vcf.write() + # write PLINK2 files + if plink2: + write_tr_files(plink2, vcf.fname, sub.fname) + else: + sub.write() return sample_dir @@ -68,14 +202,28 @@ def time_vcf(vcf, max_variants, chunk_size=500): GenotypesVCF(vcf).read(max_variants=max_variants) +def time_vcf_tr(vcf, 
max_variants, chunk_size=500): + GenotypesTR(vcf, vcftype="hipstr").read(max_variants=max_variants) + + def time_plink(pgen, max_variants, chunk_size=500): GenotypesPLINK(pgen).read(max_variants=max_variants) +def time_plink_tr(pgen, max_variants, chunk_size=500): + GenotypesPLINKTR(pgen, vcftype="hipstr").read(max_variants=max_variants) + + def time_plink_chunk(pgen, max_variants, chunk_size=500): GenotypesPLINK(pgen, chunk_size=chunk_size).read(max_variants=max_variants) +def time_plink_chunk_tr(pgen, max_variants, chunk_size=500): + GenotypesPLINKTR(pgen, chunk_size=chunk_size, vcftype="hipstr").read( + max_variants=max_variants + ) + + def progressbar(it, prefix="", size=60, out=sys.stdout): # Python3.6+ count = len(it) @@ -95,6 +243,10 @@ def show(j): print("\n", flush=True, file=out) +def create_tr_alleles(): + return tuple("AT" * j for j in range(1, np.random.randint(2, 11) + 1)) + + @click.command() @click.option( "-t", @@ -173,6 +325,13 @@ def show(j): show_default="do not save generated results", help="A python pickle file into which to store results", ) +@click.option( + "--plink2", + type=click.Path(path_type=Path, exists=True), + default=None, + show_default="do not benchmark repeats", + help="Benchmark repeats in addition to SNPs. Create PGENs with this plink2 binary", +) def main( temp, default_variants, @@ -184,6 +343,7 @@ def main( progress=False, silent=False, archive=None, + plink2=None, ): """ Benchmarks classes in the data.genotypes module @@ -200,16 +360,29 @@ def main( num_variants = max(DEFAULT_VARIANTS, INTERVALS_VARIANTS.stop) sample_size = max(DEFAULT_SAMPLES, INTERVALS_SAMPLES.stop) gts.samples = tuple(f"sample{i}" for i in range(sample_size)) - gts.variants = np.array( - [(f"id{i}", "chr0", i, ("A", "T")) for i in range(1, num_variants + 1)], - dtype=gts.variants.dtype, - ) + if plink2: + gts.variants = np.array( + [ + (f"id{i}", "chr0", i, create_tr_alleles()) + for i in range(1, num_variants + 1) + ], + dtype=gts.variants.dtype, + ) + else: + gts.variants = np.array( + [(f"id{i}", "chr0", i, ("A", "T")) for i in range(1, num_variants + 1)], + dtype=gts.variants.dtype, + ) np.random.seed(12345) if not temp.exists(): LOG.info("Generating fake genotypes") - gts.data = np.random.choice( - [True, False], size=sample_size * num_variants * 2 - ).reshape((sample_size, num_variants, 2)) + gts.data = np.empty( + (sample_size, num_variants, 2), dtype=(np.uint8 if plink2 else np.bool_) + ) + for i in range(num_variants): + gts.data[:, i] = np.random.choice( + range(len(gts.variants[i]["alleles"])), size=sample_size * 2 + ).reshape((sample_size, 2)) gts.fname = temp # set initial variables SAMPLES = gts.samples @@ -238,8 +411,12 @@ def main( # create the files we will try to load if they haven't been created already if not gts.fname.exists(): LOG.info("Creating VCF and PGEN files that we can load") - variant_dir = create_variant_files(gts, INTERVALS_VARIANTS, DEFAULT_SAMPLES) - sample_dir = create_sample_files(gts, INTERVALS_SAMPLES, DEFAULT_VARIANTS) + variant_dir = create_variant_files( + gts, INTERVALS_VARIANTS, DEFAULT_SAMPLES, plink2 + ) + sample_dir = create_sample_files( + gts, INTERVALS_SAMPLES, DEFAULT_VARIANTS, plink2 + ) else: variant_dir = gts.fname / "variant" sample_dir = gts.fname / "sample" @@ -269,10 +446,13 @@ def main( for val in item_iter: chunk_size = 500 if file_type == "vcf": - func = time_vcf + func = time_vcf_tr if plink2 else time_vcf file = genotype_dir / f"{val}.vcf" elif file_type == "pgen" or file_type.startswith("chunked"): - func = 
time_plink if file_type == "pgen" else time_plink_chunk + if file_type == "pgen": + func = time_plink_tr if plink2 else time_plink + else: + funct = time_plink_chunk_tr if plink2 else time_plink_chunk file = genotype_dir / f"{val}.pgen" if file_type.startswith("chunked"): chunk_size = int(file_type[len("chunked") :]) diff --git a/tests/bench_genotypes_TR.py b/tests/bench_genotypes_TR.py deleted file mode 100755 index fc943530..00000000 --- a/tests/bench_genotypes_TR.py +++ /dev/null @@ -1,441 +0,0 @@ -#!/usr/bin/env python - -import sys -import pickle -import shutil -from pathlib import Path -from time import process_time - -import click -import matplotlib -import numpy as np -import matplotlib.pyplot as plt - -# Force matplotlib to not use any Xwindows backend. -matplotlib.use("Agg") - -from haptools import logging -from haptools.data import GenotypesVCF, GenotypesPLINK, GenotypesTR, GenotypesPLINKTR -from pysam import VariantFile - -# COMMAND FOR GENERATING UKB PLOT: -# tests/bench_genotypes.py \ -# --default-variants 18472 --default-samples 487409 --intervals-variants 1 80 4 \ -# --intervals-samples 1 80 4 -o plot.png -a results.pickle - -DATADIR = Path(__file__).parent.joinpath("data") - - -def create_variant_files(gts, intervals, num_samps): - samples = gts.samples[:num_samps] - variant_dir = gts.fname / "variant" - variant_dir.mkdir(parents=True, exist_ok=True) - for val in intervals: - variants = tuple(gts.variants["id"][:val]) - sub = gts.subset(samples=samples, variants=variants) - # write PLINK2 files - sub.fname = variant_dir / f"{val}.pgen" - # write VCF files - vcf = GenotypesTR(sub.fname.with_suffix(".vcf")) - vcf.data = sub.data - vcf.samples = sub.samples - vcf.variants = sub.variants - write_TR_VCF(vcf) - return variant_dir - -def write_TR_VCF(gts): - """ - Write the variants in this class to a VCF at :py:attr:`~.GenotypesVCF.fname` - """ - vcf = VariantFile(str(gts.fname), mode="w") - # make sure the header is properly structured - for contig in set(gts.variants["chrom"]): - vcf.header.contigs.add(contig) - vcf.header.add_meta( - "INFO", - items=[ - ("ID", "START"), - ("Number", 1), - ("Type", "Integer"), - ("Description", "Inclusive start coodinate for the repetitive portion of the reference allele"), - ], - ) - vcf.header.add_meta( - "INFO", - items=[ - ("ID", "END"), - ("Number", 1), - ("Type", "Integer"), - ("Description", "Inclusive end coordinate for the repetitive portion of the reference allele"), - ], - ) - vcf.header.add_meta( - "INFO", - items=[ - ("ID", "PERIOD"), - ("Number", 1), - ("Type", "Integer"), - ("Description", "Length of STR motif"), - ], - ) - vcf.header.add_meta( - "FORMAT", - items=[ - ("ID", "GT"), - ("Number", 1), - ("Type", "String"), - ("Description", "Genotype"), - ], - ) - try: - vcf.header.add_samples(gts.samples) - except AttributeError: - gts.log.warning( - "Upgrade to pysam >=0.19.1 to reduce the time required to create " - "VCFs. 
See https://github.com/pysam-developers/pysam/issues/1104" - ) - for sample in gts.samples: - vcf.header.add_sample(sample) - gts.log.info("Writing VCF records") - phased = gts._prephased or (gts.data.shape[2] < 3) - missing_val = np.iinfo(np.uint8).max - for var_idx, var in enumerate(gts.variants): - rec = { - "contig": var["chrom"], - "start": var["pos"], - "stop": var["pos"] + len(var["alleles"][0]) - 1, - "qual": None, - "alleles": var["alleles"], - "id": var["id"], - "filter": None, - "info":{"START":var["pos"] - 1,"END":var["pos"] + len(var["alleles"][0]) - 1,"PERIOD":len(var["alleles"][0])}, - } - # handle pysam increasing the start site by 1 - rec["start"] -= 1 - # parse the record into a pysam.VariantRecord - record = vcf.new_record(**rec) - for samp_idx, sample in enumerate(gts.samples): - record.samples[sample]["GT"] = tuple( - None if val == missing_val else val - for val in gts.data[samp_idx, var_idx, :2] - ) - # add proper phasing info - if phased: - record.samples[sample].phased = True - else: - record.samples[sample].phased = gts.data[samp_idx, var_idx, 2] - # write the record to a file - vcf.write(record) - vcf.close() - - - -def create_sample_files(gts, intervals, num_vars): - variants = tuple(gts.variants["id"][:num_vars]) - sample_dir = gts.fname / "sample" - sample_dir.mkdir(parents=True, exist_ok=True) - for val in intervals: - samples = gts.samples[:val] - sub = gts.subset(samples=samples, variants=variants) - # write PLINK2 files - sub.fname = sample_dir / f"{val}.pgen" - # write VCF files - vcf = GenotypesTR(sub.fname.with_suffix(".vcf")) - vcf.data = sub.data - vcf.samples = sub.samples - vcf.variants = sub.variants - write_TR_VCF(vcf) - return sample_dir - - -def time_vcf(vcf, max_variants, chunk_size=500): - GenotypesVCF(vcf).read(max_variants=max_variants) - - -def time_plink(pgen, max_variants, chunk_size=500): - GenotypesPLINK(pgen).read(max_variants=max_variants) - - -def time_plink_chunk(pgen, max_variants, chunk_size=500): - GenotypesPLINK(pgen, chunk_size=chunk_size).read(max_variants=max_variants) - - -def progressbar(it, prefix="", size=60, out=sys.stdout): # Python3.6+ - count = len(it) - - def show(j): - x = int(size * j / count) - print( - f"{prefix}[{u'â–ˆ'*x}{('.'*(size-x))}] {j}/{count}", - end="\r", - file=out, - flush=True, - ) - - show(0) - for i, item in enumerate(it): - yield item - show(i + 1) - print("\n", flush=True, file=out) - - -@click.command() -@click.option( - "-t", - "--temp", - type=click.Path(path_type=Path, file_okay=False, dir_okay=True), - default=Path("bench_temp_dir"), - show_default=True, - help="A temp directory into which to place generated temporary files", -) -@click.option( - "--default-variants", - type=int, - default=4, - show_default=True, - help="The number of variants to use when we vary the number of samples", -) -@click.option( - "--default-samples", - type=int, - default=5, - show_default=True, - help="The number of samples to use when we vary the number of variants", -) -@click.option( - "--intervals-variants", - type=int, - nargs=3, - default=(1, 4, 1), - show_default=True, - help="The start, end, and step values for the x-axis of the variants plot", -) -@click.option( - "--intervals-samples", - type=int, - nargs=3, - default=(1, 5, 1), - show_default=True, - help="The start, end, and step values for the x-axis of the samples plot", -) -@click.option( - "--reps", - type=int, - default=3, - show_default=True, - help="For each benchmark value, we take the mean of --reps X replicates", -) -@click.option( - "-o", 
- "--output", - type=click.Path(path_type=Path), - default=Path("bench-plink.png"), - show_default=True, - help="A PNG file containing the results of the benchmark", -) -@click.option( - "-p", - "--progress", - is_flag=True, - default=False, - show_default=True, - help="Whether to display a progress bar. Useful for large benchmarks", -) -@click.option( - "-s", - "--silent", - is_flag=True, - default=False, - show_default=True, - help="Whether to be entirely silent", -) -@click.option( - "-a", - "--archive", - type=click.Path(path_type=Path), - default=None, - show_default="do not save generated results", - help="A python pickle file into which to store results", -) -def main( - temp, - default_variants, - default_samples, - intervals_variants, - intervals_samples, - reps, - output, - progress=False, - silent=False, - archive=None, -): - """ - Benchmarks classes in the data.genotypes module - """ - DEFAULT_VARIANTS, DEFAULT_SAMPLES, INTERVALS_VARIANTS, INTERVALS_SAMPLES, REPS = ( - default_variants, - default_samples, - range(*intervals_variants), - range(*intervals_samples), - reps, - ) - LOG = logging.getLogger("run", 0 if silent else "DEBUG") - gts = GenotypesPLINK(None, log=LOG) - num_variants = max(DEFAULT_VARIANTS, INTERVALS_VARIANTS.stop) - sample_size = max(DEFAULT_SAMPLES, INTERVALS_SAMPLES.stop) - gts.samples = tuple(f"sample{i}" for i in range(sample_size)) - gts.variants = np.array( - [(f"id{i}", "chr0", i, ("A", "T")) for i in range(1, num_variants + 1)], - dtype=gts.variants.dtype, - ) - np.random.seed(12345) - if not temp.exists(): - LOG.info("Generating fake genotypes") - gts.data = np.random.choice( - [True, False], size=sample_size * num_variants * 2 - ).reshape((sample_size, num_variants, 2)) - gts.fname = temp - # set initial variables - SAMPLES = gts.samples - VARIANTS = gts.variants["id"] - DEFAULT_SAMPLES = min(DEFAULT_SAMPLES, len(SAMPLES)) - DEFAULT_VARIANTS = min(DEFAULT_VARIANTS, len(VARIANTS)) - if INTERVALS_VARIANTS.stop > len(VARIANTS): - INTERVALS_VARIANTS = range( - INTERVALS_VARIANTS.start, len(VARIANTS), INTERVALS_VARIANTS.step - ) - INTERVALS_VARIANTS = list(INTERVALS_VARIANTS) - if INTERVALS_SAMPLES.stop > len(SAMPLES): - INTERVALS_SAMPLES = range( - INTERVALS_SAMPLES.start, len(SAMPLES), INTERVALS_SAMPLES.step - ) - INTERVALS_SAMPLES = list(INTERVALS_SAMPLES) - FILE_TYPES = { - "vcf": "VCF", - "pgen": "PLINK2", - "chunked200": "PLINK2 chunk_size: 200", - "chunked400": "PLINK2 chunk_size: 400", - "chunked600": "PLINK2 chunk_size: 600", - "chunked800": "PLINK2 chunk_size: 800", - } - - # create the files we will try to load if they haven't been created already - if not gts.fname.exists(): - LOG.info("Creating VCF and PGEN files that we can load") - variant_dir = create_variant_files(gts, INTERVALS_VARIANTS, DEFAULT_SAMPLES) - sample_dir = create_sample_files(gts, INTERVALS_SAMPLES, DEFAULT_VARIANTS) - else: - variant_dir = gts.fname / "variant" - sample_dir = gts.fname / "sample" - LOG.info( - "Temp directory already exists. 
Assuming files have already been created", - ) - - LOG.setLevel(level="ERROR") - - # run each test - LOG.info("Benchmarking the loading of each file") - results = {} - len_variants = len(VARIANTS) - for arg in ("samples", "variants"): - genotype_dir = variant_dir - if arg == "samples": - genotype_dir = sample_dir - results[arg] = {} - intervals = INTERVALS_SAMPLES if arg == "samples" else INTERVALS_VARIANTS - item_iter = ( - progressbar(intervals, prefix=f"{arg}, {file_type}: ", out=sys.stderr) - if progress - else intervals - ) - for file_type in FILE_TYPES.keys(): - results[arg][file_type] = [] - for val in item_iter: - chunk_size = 500 - if file_type == "vcf": - func = time_vcf - file = genotype_dir / f"{val}.vcf" - elif file_type == "pgen" or file_type.startswith("chunked"): - func = time_plink if file_type == "pgen" else time_plink_chunk - file = genotype_dir / f"{val}.pgen" - if file_type.startswith("chunked"): - chunk_size = int(file_type[len("chunked") :]) - else: - continue - times = np.empty(REPS, dtype=np.float64) - for rep in range(REPS): - start = process_time() - func(file, max_variants=len_variants, chunk_size=chunk_size) - end = process_time() - times[rep] = end - start - results[arg][file_type].append(times.mean()) - - # plot the results - LOG.info("Generating plot of results") - fig, (ax_samples, ax_variants) = plt.subplots(1, 2, figsize=(10, 5)) - lab_font = {"fontsize": "xx-small"} - for file_type in FILE_TYPES.keys(): - x_vals = INTERVALS_SAMPLES - y_vals = results["samples"][file_type] - # fit a line to each so that we can report the slope - slope = np.polyfit(x_vals, y_vals, 1)[0] - ax_samples.plot( - x_vals, - y_vals, - marker="o", - label=FILE_TYPES[file_type], - ) - ax_samples.text( - x_vals[-1], - y_vals[-1] + (y_vals[-1] / 16), - f"m = {slope:.3E}", - fontdict=lab_font, - ) - ax_samples.set_xlabel(f"Number of samples\nnum_variants = {DEFAULT_VARIANTS}") - ax_samples.set_ylim(ymin=0) - for file_type in FILE_TYPES.keys(): - x_vals = INTERVALS_VARIANTS - y_vals = results["variants"][file_type] - # fit a line to each so that we can report the slope - slope = np.polyfit(x_vals, y_vals, 1)[0] - ax_variants.plot( - x_vals, - y_vals, - marker="o", - ) - ax_variants.text( - x_vals[-1], - y_vals[-1] + (y_vals[-1] / 16), - f"m = {slope:.3E}", - fontdict=lab_font, - ) - ax_variants.set_xlabel(f"Number of variants\nnum_samples = {DEFAULT_SAMPLES}") - ax_variants.set_ylim(ymin=0) - fig.supylabel("CPU Time (s)") - fig.legend(loc="lower left", fontsize="xx-small") - fig.tight_layout() - fig.savefig(str(output), bbox_inches="tight", dpi=400) - - # saving results - if archive is not None: - with open(archive, "wb") as fh: - pickle.dump(results, fh) - - -def test_bench_genotypes(): - tmp_dir = Path("bench_temp_dir") - tmp_plot = Path("bench-plink.png") - - try: - main(["-t", tmp_dir, "-o", tmp_plot, "--reps", 1, "-s"]) - except SystemExit as err: - # re-raise unless main() finished without an error - if err.code: - raise - - shutil.rmtree(tmp_dir) - tmp_plot.unlink() - - -if __name__ == "__main__": - main() From 47cdf1317ab2a149ca7f21173e676f3746734463 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 10 Aug 2023 21:26:18 -0700 Subject: [PATCH 16/40] fix progress bar in bench_genotypes.py --- tests/bench_genotypes.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/bench_genotypes.py b/tests/bench_genotypes.py index 1a7e7c07..c378757b 100755 --- a/tests/bench_genotypes.py +++ 
b/tests/bench_genotypes.py @@ -436,12 +436,12 @@ def main( genotype_dir = sample_dir results[arg] = {} intervals = INTERVALS_SAMPLES if arg == "samples" else INTERVALS_VARIANTS - item_iter = ( - progressbar(intervals, prefix=f"{arg}, {file_type}: ", out=sys.stderr) - if progress - else intervals - ) for file_type in FILE_TYPES.keys(): + item_iter = ( + progressbar(intervals, prefix=f"{arg}, {file_type}: ", out=sys.stderr) + if progress + else intervals + ) results[arg][file_type] = [] for val in item_iter: chunk_size = 500 From 939353eadb228e239e2e9aacbaa4e53f44d09c0c Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 11 Aug 2023 09:57:52 -0700 Subject: [PATCH 17/40] avoid using GetLengthGenotypes for speedup --- haptools/data/genotypes.py | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index fb863920..b5105d41 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1616,8 +1616,39 @@ def read( See documentation for :py:attr:`~.GenotypesVCF.read` """ super().read(region,samples,variants,max_variants) - for idx, record in enumerate(self._iter_TRRecords(region)): - self.data[:, idx] = record.GetLengthGenotypes() + num_variants = len(self.variants) + vcf = VCF(self.fname.with_suffix(".pvar")) + tr_records = self.trh.TRRecordHarmonizer( + vcffile=vcf, vcfiter=vcf(region), region=region, vcftype=self.vcftype, + ) + # filter out TRs that we didn't want + if variants is not None: + tr_records = filter(lambda rec: rec.record_id in variants, tr_records) + # initialize a jagged array of allele lengths + max_num_alleles = max(map(len, self.variants["alleles"])) + allele_lens = np.empty((len(self.variants), max_num_alleles), dtype=self.data.dtype) + # iterate through each TR and extract the REF and ALT allele lengths + for idx, record in enumerate(tr_records): + if idx > num_variants: + # exit early if we've already found all the variants + break + # extract allele lengths from TRRecord object + allele_lens[idx, 0] = record.ref_allele_length + num_alleles = len(record.alt_allele_lengths) + 1 + allele_lens[idx, 1:num_alleles] = record.alt_allele_lengths + # record missing entries and then set them all to REF + missing = self.data[:, :, :2] == np.iinfo(np.uint8).max + self.data[:, :, :2][missing] = 0 + # TODO: use broadcasting instead of this for loop + for idx in range(num_variants): + # convert from genotype indices to allele lengths + self.data[:, idx, :2] = allele_lens[idx, self.data[:, idx, :2]] + # restore missing entries + self.data[:, :, :2][missing] = np.iinfo(np.uint8).max + # clean up memory + del missing + del allele_lens + gc.collect() def _iterate( self, From f42acff18b92b0751fb5b78afe8fa66b9e23b47e Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 11 Aug 2023 10:52:27 -0700 Subject: [PATCH 18/40] extract TRRecords iteration to a separate method --- haptools/data/genotypes.py | 67 +++++++++++++++----------------------- 1 file changed, 27 insertions(+), 40 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index b5105d41..a66d3945 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1521,22 +1521,6 @@ class GenotypesPLINKTR(GenotypesPLINK): def __init__(self, fname: Path | str, log: Logger = None, chunk_size: int = None , vcftype: str = "auto"): super().__init__(fname, log, chunk_size) self.vcftype = vcftype - 
self._import_copy_trh_module() - - def _import_copy_trh_module(self): - """ - Import a copy of the tr_harmonizer module into the "trh" property of this class - Subsequent edits to the module will remain scoped to the class, only - - This code is adapted from https://stackoverflow.com/a/11285504/16815703 - """ - try: - SPEC = importlib.util.find_spec("trtools.utils.tr_harmonizer") - except ModuleNotFoundError: - SPEC = trh.__spec__ - self.trh = importlib.util.module_from_spec(SPEC) - SPEC.loader.exec_module(self.trh) - @classmethod def load( @@ -1577,21 +1561,30 @@ def load( genotypes.check_phase() return genotypes - - - def _iter_TRRecords(self,region: str = None): - gts_obj = self - def GetGenotypeIndicies(self): - n = np.argmax(self.vcfrecord.ID == gts_obj.variants["id"]) - gts_data = gts_obj.data[:, n].astype(np.int8) - gts_data[gts_data == np.iinfo(np.uint8).max] = -1 - return gts_data - self.trh.TRRecord.GetGenotypeIndicies = GetGenotypeIndicies - vcf = VCF(self.fname.with_suffix(".pvar")) - yield from self.trh.TRRecordHarmonizer( - vcffile = vcf, vcfiter= vcf(region), region =region , vcftype =self.vcftype - ) + def _iter_TRRecords(self, region: str = None, variants: set[str] = None): + """ + Yield TRRecord objects from the PVAR file + Parameters + ---------- + region : str, optional + See documentation for :py:attr:`~.GenotypesVCF.read` + variants : set[str], optional + See documentation for :py:attr:`~.GenotypesVCF.read` + + Yields + ------ + Iterator[trh.TRRecord] + An iterator over each line of the PVAR file + """ + vcf = VCF(self.fname.with_suffix(".pvar")) + tr_records = trh.TRRecordHarmonizer( + vcffile=vcf, vcfiter=vcf(region), region=region, vcftype=self.vcftype, + ) + # filter out TRs that we didn't want + if variants is not None: + tr_records = filter(lambda rec: rec.record_id in variants, tr_records) + yield from tr_records def read( self, @@ -1617,18 +1610,11 @@ def read( """ super().read(region,samples,variants,max_variants) num_variants = len(self.variants) - vcf = VCF(self.fname.with_suffix(".pvar")) - tr_records = self.trh.TRRecordHarmonizer( - vcffile=vcf, vcfiter=vcf(region), region=region, vcftype=self.vcftype, - ) - # filter out TRs that we didn't want - if variants is not None: - tr_records = filter(lambda rec: rec.record_id in variants, tr_records) # initialize a jagged array of allele lengths max_num_alleles = max(map(len, self.variants["alleles"])) allele_lens = np.empty((len(self.variants), max_num_alleles), dtype=self.data.dtype) # iterate through each TR and extract the REF and ALT allele lengths - for idx, record in enumerate(tr_records): + for idx, record in enumerate(self._iter_TRRecords(region, variants)): if idx > num_variants: # exit early if we've already found all the variants break @@ -1676,8 +1662,9 @@ def _iterate( An iterator over each line in the file, where each line is encoded as a namedtuple containing each of the class properties """ - variants = super()._iterate(pgen,region,variants) - for variant, record in zip(variants, self._iter_TRRecords(region)): + tr_records = self._iter_TRRecords(region, variants) + variants = super()._iterate(pgen, region, variants) + for variant, record in zip(variants, tr_records): variant.data = record.GetLengthGenotypes() yield variant From ef440d6446e6c9f8f9eb012264ec961278fcd7fa Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 11 Aug 2023 10:59:17 -0700 Subject: [PATCH 19/40] fix some small syntax things --- haptools/data/genotypes.py | 10 ++++++---- 
tests/test_data.py | 5 ++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index a66d3945..41d9d944 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1249,7 +1249,7 @@ def read( pv = pgenlib.PvarReader(bytes(str(self.fname.with_suffix(".pvar")), "utf8")) with pgenlib.PgenReader( - bytes(str(self.fname), "utf8"), sample_subset=sample_idxs, pvar= pv + bytes(str(self.fname), "utf8"), sample_subset=sample_idxs, pvar = pv ) as pgen: # how many variants to load? if variants is not None: @@ -1402,9 +1402,11 @@ def __iter__( super(Genotypes, self).read() import pgenlib + pv = pgenlib.PvarReader(bytes(str(self.fname.with_suffix(".pvar")), "utf8")) + sample_idxs = self.read_samples(samples) pgen = pgenlib.PgenReader( - bytes(str(self.fname), "utf8"), sample_subset=sample_idxs + bytes(str(self.fname), "utf8"), sample_subset=sample_idxs, pvar = pv ) # call another function to force the lines above to be run immediately # see https://stackoverflow.com/a/36726497 @@ -1595,7 +1597,7 @@ def read( ): """ Read genotypes from a PGEN file into a numpy matrix stored in - :py:attr:`~.GenotypesPLINK.data` + :py:attr:`~.GenotypesPLINKTR.data` Parameters ---------- @@ -1645,7 +1647,7 @@ def _iterate( """ A generator over the lines of a PGEN-PVAR file pair - This is a helper function for :py:meth:`~.GenotypesPLINK.__iter__` + This is a helper function for :py:meth:`~.GenotypesPLINKTR.__iter__` Parameters ---------- diff --git a/tests/test_data.py b/tests/test_data.py index 326e5791..c6de81d7 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -5,7 +5,6 @@ import pytest import numpy as np import numpy.lib.recfunctions as rfn -import pgenlib from haptools.sim_phenotype import Haplotype as HaptoolsHaplotype from haptools.sim_phenotype import Repeat as HaptoolsRepeat from haptools.data import ( @@ -658,7 +657,7 @@ def _get_fake_genotypes_multiallelic(self): def test_iter(self): # Get the expected data - expected = self. 
_get_fake_genotypes_multiallelic() + expected = self._get_fake_genotypes_multiallelic() gts = GenotypesPLINKTR(DATADIR / "simple-tr-valid.pgen") # Check that everything matches what we expected for idx, line in enumerate(gts): @@ -671,7 +670,7 @@ def test_iter(self): assert gts.samples == expected.samples def test_read_plinktr(self): - expected_alleles =self._get_fake_genotypes_multiallelic().data + expected_alleles = self._get_fake_genotypes_multiallelic().data gts = GenotypesPLINKTR(DATADIR / "simple-tr-valid.pgen") gts.read() # check genotypes From 8b8fcffe4d2c5e7a1f0d1abb5d99f83618de5296 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 11 Aug 2023 15:05:15 -0700 Subject: [PATCH 20/40] improve scaling of GenotypesPLINKTR as we scale the number of variants by using broadcasting across the number of variants --- haptools/data/genotypes.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 41d9d944..614e74ad 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1627,10 +1627,9 @@ def read( # record missing entries and then set them all to REF missing = self.data[:, :, :2] == np.iinfo(np.uint8).max self.data[:, :, :2][missing] = 0 - # TODO: use broadcasting instead of this for loop - for idx in range(num_variants): - # convert from genotype indices to allele lengths - self.data[:, idx, :2] = allele_lens[idx, self.data[:, idx, :2]] + # convert from genotype indices to allele lengths + variant_coords = np.arange(num_variants)[:, np.newaxis] + self.data[:, :, :2] = allele_lens[variant_coords, self.data[:, :, :2]] # restore missing entries self.data[:, :, :2][missing] = np.iinfo(np.uint8).max # clean up memory From e9137f5c125009b89e04ae3730f911a4b7427990 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 11 Aug 2023 15:26:07 -0700 Subject: [PATCH 21/40] fix failing test for GenotypesPLINKTR._iterate --- haptools/data/genotypes.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 614e74ad..06210dc5 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1666,7 +1666,15 @@ def _iterate( tr_records = self._iter_TRRecords(region, variants) variants = super()._iterate(pgen, region, variants) for variant, record in zip(variants, tr_records): - variant.data = record.GetLengthGenotypes() + # extract the REF and ALT allele lengths + allele_lens = np.array([record.ref_allele_length, *record.alt_allele_lengths], dtype=variant.data.dtype) + # record missing entries and then set them all to REF + missing = variant.data[:, :2] == np.iinfo(np.uint8).max + variant.data[:, :2][missing] = 0 + # convert from genotype indices to allele lengths + variant.data[:, :2] = allele_lens[variant.data[:, :2]] + # restore missing entries + variant.data[:, :2][missing] = np.iinfo(np.uint8).max yield variant def write(self): From b7973db148b1aa18f02ed843a1bda2bbaadc085f Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sun, 13 Aug 2023 11:59:05 -0700 Subject: [PATCH 22/40] use numpy array method of cyvcf2 for slight speedup --- haptools/data/genotypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 06210dc5..9d0073cf 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ 
-259,7 +259,7 @@ def _return_data(self, variant: Variant): data: npt.NDArray[np.uint8] Numpy array storing all genotypes """ - return np.array(variant.genotypes, dtype=np.uint8) + return variant.genotype.array().astype(np.uint8) def _iterate(self, vcf: VCF, region: str = None, variants: set[str] = None): """ From 8bdd4c7f37944feb6ee037ec6e3bd478bf0cbc1d Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sun, 13 Aug 2023 12:00:36 -0700 Subject: [PATCH 23/40] ensure all imports are relative within haptools/ --- haptools/index.py | 3 +-- haptools/ld.py | 2 +- haptools/transform.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/haptools/index.py b/haptools/index.py index 3a71149f..305e6c72 100644 --- a/haptools/index.py +++ b/haptools/index.py @@ -6,9 +6,8 @@ from pysam import tabix_index -from haptools import data +from . import data from .logging import getLogger -from haptools.data.haplotypes import Haplotypes def append_suffix( diff --git a/haptools/ld.py b/haptools/ld.py index 748a8e3b..be28ca14 100644 --- a/haptools/ld.py +++ b/haptools/ld.py @@ -5,7 +5,7 @@ import numpy as np import numpy.typing as npt -from haptools import data +from . import data from .logging import getLogger from .data import Haplotype as HaplotypeBase diff --git a/haptools/transform.py b/haptools/transform.py index b806cc37..df70f6cb 100644 --- a/haptools/transform.py +++ b/haptools/transform.py @@ -9,7 +9,7 @@ from cyvcf2 import VCF, Variant from pysam import VariantFile -from haptools import data +from . import data from .logging import getLogger From 14be007d5c2c02c9d4f6e97a4c3d03391f1b3113 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sun, 13 Aug 2023 13:35:33 -0700 Subject: [PATCH 24/40] finish writing __iter__() and write() TR tests --- tests/test_data.py | 63 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 5 deletions(-) diff --git a/tests/test_data.py b/tests/test_data.py index c6de81d7..a9bb00f0 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -333,7 +333,7 @@ def _get_fake_genotypes_multiallelic(self): def _get_fake_genotypes_multiallelic_tr(self): pgenlib = pytest.importorskip("pgenlib") - gts_tr = GenotypesTR(DATADIR / "simple_tr.vcf") + gts_tr = GenotypesTR(DATADIR / "simple-tr-valid.vcf") gts_tr.read() gts = GenotypesPLINK(fname="") @@ -461,6 +461,21 @@ def test_load_genotypes_iterate(self): ) assert gts.samples == expected.samples + def test_iter_multiallelic_tr(self): + expected = self._get_fake_genotypes_multiallelic_tr() + + gts = GenotypesPLINK(DATADIR / "simple-tr-valid.pgen") + + # Check that everything matches what we expected + for idx, line in enumerate(gts): + np.testing.assert_allclose(line.data[:, :3], expected.data[:, idx]) + for col in ("chrom", "pos", "id"): + assert line.variants[col] == expected.variants[col][idx] + assert line.variants["alleles"].tolist() == expected.variants["alleles"][idx] + + # Check samples + assert gts.samples == expected.samples + def test_load_genotypes_subset(self): expected = self._get_fake_genotypes_plink() @@ -604,7 +619,33 @@ def test_write_multiallelic(self): new_gts.read() new_gts.check_phase() - # Verify tqhat the uploaded data matches the original data + # Verify that the uploaded data matches the original data + np.testing.assert_allclose(gts_multiallelic.data, new_gts.data) + assert gts_multiallelic.samples == new_gts.samples + for i in range(len(new_gts.variants)): + for col in ("chrom", 
"pos", "id", "alleles"): + assert gts_multiallelic.variants[col][i] == new_gts.variants[col][i] + + #clean the files created after the test + fname.with_suffix(".psam").unlink() + fname.with_suffix(".pvar").unlink() + fname.unlink() + + def test_write_multiallelic_tr(self): + # Create fake multi-allelic genotype data to write to the PLINK file + gts_multiallelic = self._get_fake_genotypes_multiallelic_tr() + # Name of the PLINK file where the data will be written + fname = DATADIR / "test_write_multiallelic_tr.pgen" + + # Write multiallelic genotype data to the PLINK file + gts_multiallelic.fname = fname + gts_multiallelic.write() + + #Read the data from the newly created PLINK file + new_gts = GenotypesPLINK(fname) + new_gts.read() + + # Verify that the uploaded data matches the original data np.testing.assert_allclose(gts_multiallelic.data, new_gts.data) assert gts_multiallelic.samples == new_gts.samples for i in range(len(new_gts.variants)): @@ -669,7 +710,7 @@ def test_iter(self): # Check samples assert gts.samples == expected.samples - def test_read_plinktr(self): + def test_read(self): expected_alleles = self._get_fake_genotypes_multiallelic().data gts = GenotypesPLINKTR(DATADIR / "simple-tr-valid.pgen") gts.read() @@ -1764,8 +1805,8 @@ def test_merge_variants_vcf(self): class TestGenotypesTR: - def test_read_tr(self): - expected_alleles = np.array( + def _get_fake_tr_alleles(self): + return np.array( [ [[1, 2, 1], [3, 4, 1], [5, 6, 1], [7, 8, 1], [9, 0, 1]], [[3, 1, 1], [3, 1, 1], [1, 1, 1], [1, 1, 1], [3, 3, 1]], @@ -1776,12 +1817,24 @@ def test_read_tr(self): dtype=np.uint8, ).transpose((1, 0, 2)) + def test_read(self): + expected_alleles = self._get_fake_tr_alleles() + gts = GenotypesTR(DATADIR / "simple_tr.vcf") gts.read() # check genotypes np.testing.assert_allclose(expected_alleles, gts.data) + def test_iter(self): + # Get the expected data + expected = self._get_fake_tr_alleles() + + gts = GenotypesTR(DATADIR / "simple_tr.vcf") + # Check that everything matches what we expected + for idx, line in enumerate(gts): + np.testing.assert_allclose(line.data[:, :3], expected[:, idx]) + class TestBreakpoints: def _get_expected_breakpoints(self): From 9253bb06ce8670447f4e3d7714cf1b80503c9ec1 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sun, 13 Aug 2023 22:55:42 -0700 Subject: [PATCH 25/40] add edge case from plink2-users discussion where there is an extra ALT allele that isn't used within any of the genotypes --- tests/data/simple-tr-valid.pgen | Bin 53 -> 54 bytes tests/data/simple-tr-valid.pvar | 4 ++-- tests/data/simple-tr-valid.vcf | 2 +- tests/test_data.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/data/simple-tr-valid.pgen b/tests/data/simple-tr-valid.pgen index 317aa19f140151892387be304e15acaef19a2965..b852ca9b9cebe6b0dbbf9d8f85a263f156012b66 100644 GIT binary patch delta 31 mcmXptn;;{_$+VJ*pI^~AwVYuo0|Uc6W(Gz^W@7=SbIbr>$pxnX delta 30 lcmXproggF0!L*WzpI^~AwVYuo0|Uc6W(Gz^W`Cx0%m7>i1+f4C diff --git a/tests/data/simple-tr-valid.pvar b/tests/data/simple-tr-valid.pvar index 416b02e7..f7c2b3c1 100644 --- a/tests/data/simple-tr-valid.pvar +++ b/tests/data/simple-tr-valid.pvar @@ -1,5 +1,5 @@ ##fileformat=VCFv4.3 -##fileDate=20230724 +##fileDate=20230813 ##source=PLINKv2.00 ##command=HipSTR-v0.7 --test ##filedate=20180225 @@ -12,5 +12,5 @@ 1 10114 1:10114:GTT GTT GTTGTT,GTTGTTGTT,GTTGTTGTTGTT,GTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTTGTTGTT . . 
START=10114;END=10999;PERIOD=3 1 11000 1:10116:ACACAC ACACAC AC . . START=11000;END=11016;PERIOD=2 1 12117 1:10117:AAA AAA AAAA . . START=12117;END=12143;PERIOD=1 -1 13122 1:10122:GTGT GTGT GTGTGT,GTGTGTGT,GTGTGTGTGTGTGT,GTGTGTGTGTGTGTGTGTGTGT . . START=13122;END=13150;PERIOD=2 +1 13122 1:10122:GTGT GTGT GTGTGT,GTGTGTGT,GTGTGTGTGTGTGT,GTGTGTGTGTGTGTGTGTGTGT,GT . . START=13122;END=13150;PERIOD=2 X 20122 1:20122:GGG GGG GGGGG . . START=20122;END=20150;PERIOD=1 diff --git a/tests/data/simple-tr-valid.vcf b/tests/data/simple-tr-valid.vcf index 396aaf08..cf4ece04 100644 --- a/tests/data/simple-tr-valid.vcf +++ b/tests/data/simple-tr-valid.vcf @@ -11,5 +11,5 @@ 1 10114 1:10114:GTT GTT GTTGTT,GTTGTTGTT,GTTGTTGTTGTT,GTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTTGTTGTT . . START=10114;END=10999;PERIOD=3 GT 0|1 2|3 4|5 6|7 8|8 1 11000 1:10116:ACACAC ACACAC AC . . START=11000;END=11016;PERIOD=2 GT 0|1 0|1 1|1 1|1 0|0 1 12117 1:10117:AAA AAA AAAA . . START=12117;END=12143;PERIOD=1 GT 0|0 0|0 0|0 0|0 0|0 -1 13122 1:10122:GTGT GTGT GTGTGT,GTGTGTGT,GTGTGTGTGTGTGT,GTGTGTGTGTGTGTGTGTGTGT . . START=13122;END=13150;PERIOD=2 GT 4|4 . 3|0 1|2 4|. +1 13122 1:10122:GTGT GTGT GTGTGT,GTGTGTGT,GTGTGTGTGTGTGT,GTGTGTGTGTGTGTGTGTGTGT,GT . . START=13122;END=13150;PERIOD=2 GT 4|4 . 3|0 1|2 4|. X 20122 1:20122:GGG GGG GGGGG . . START=20122;END=20150;PERIOD=1 GT 1 . 0 . . diff --git a/tests/test_data.py b/tests/test_data.py index a9bb00f0..d9c90c31 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -367,7 +367,7 @@ def _get_fake_genotypes_multiallelic_tr(self): tuple("GTT" * i for i in range(1, 10)), ("ACACAC", "AC"), ("AAA", "AAAA"), - ("GTGT", "GTGTGT", "GTGTGTGT", "GTGTGTGTGTGTGT", "GTGTGTGTGTGTGTGTGTGTGT"), + ("GTGT", "GTGTGT", "GTGTGTGT", "GTGTGTGTGTGTGT", "GTGTGTGTGTGTGTGTGTGTGT", "GT"), ("GGG", "GGGGG"), ] gts.variants = np.array( From 436878f4b36d168df347998963a23053121cbf0d Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sun, 13 Aug 2023 22:57:06 -0700 Subject: [PATCH 26/40] transpose phasing before concatenating dummy data --- tests/test_data.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_data.py b/tests/test_data.py index d9c90c31..6cbbc161 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -679,13 +679,11 @@ def _get_fake_genotypes_multiallelic(self): [[5, na], [na, na], [3, na], [na, na], [na, na]], ], dtype=np.uint8, - ) + ).transpose((1, 0, 2)) # append the phasing info, transpose, and clean up phasing = gts_plink.data[:, :, 2][:, :, np.newaxis] - phasing[[3,4], 1] = 0 - phasing[1, [3,4]] = 1 - gts.data = np.concatenate((data, phasing), axis=-1).transpose((1, 0, 2)) + gts.data = np.concatenate((data, phasing), axis=-1) # handle half-calls and chrX according to the flag: # --vcf-half-call m From 8a8a11020158624b42301a5a5c632e63343217d8 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 14 Aug 2023 14:57:53 -0700 Subject: [PATCH 27/40] rename simple-tr-valid test files to simple-tr --- tests/data/{simple-tr-valid.pgen => simple-tr.pgen} | Bin tests/data/{simple-tr-valid.psam => simple-tr.psam} | 0 tests/data/{simple-tr-valid.pvar => simple-tr.pvar} | 0 tests/data/{simple-tr-valid.vcf => simple-tr.vcf} | 0 tests/test_data.py | 10 +++++----- 5 files changed, 5 insertions(+), 5 deletions(-) rename tests/data/{simple-tr-valid.pgen => simple-tr.pgen} (100%) rename tests/data/{simple-tr-valid.psam => 
simple-tr.psam} (100%) rename tests/data/{simple-tr-valid.pvar => simple-tr.pvar} (100%) rename tests/data/{simple-tr-valid.vcf => simple-tr.vcf} (100%) diff --git a/tests/data/simple-tr-valid.pgen b/tests/data/simple-tr.pgen similarity index 100% rename from tests/data/simple-tr-valid.pgen rename to tests/data/simple-tr.pgen diff --git a/tests/data/simple-tr-valid.psam b/tests/data/simple-tr.psam similarity index 100% rename from tests/data/simple-tr-valid.psam rename to tests/data/simple-tr.psam diff --git a/tests/data/simple-tr-valid.pvar b/tests/data/simple-tr.pvar similarity index 100% rename from tests/data/simple-tr-valid.pvar rename to tests/data/simple-tr.pvar diff --git a/tests/data/simple-tr-valid.vcf b/tests/data/simple-tr.vcf similarity index 100% rename from tests/data/simple-tr-valid.vcf rename to tests/data/simple-tr.vcf diff --git a/tests/test_data.py b/tests/test_data.py index 6cbbc161..4b7394fc 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -333,7 +333,7 @@ def _get_fake_genotypes_multiallelic(self): def _get_fake_genotypes_multiallelic_tr(self): pgenlib = pytest.importorskip("pgenlib") - gts_tr = GenotypesTR(DATADIR / "simple-tr-valid.vcf") + gts_tr = GenotypesTR(DATADIR / "simple-tr.vcf") gts_tr.read() gts = GenotypesPLINK(fname="") @@ -408,7 +408,7 @@ def test_load_genotypes_multiallelic(self): def test_load_genotypes_multiallelic_tr(self): expected = self._get_fake_genotypes_multiallelic_tr() - gts = GenotypesPLINK(DATADIR / "simple-tr-valid.pgen") + gts = GenotypesPLINK(DATADIR / "simple-tr.pgen") gts.read() # check that everything matches what we expected @@ -464,7 +464,7 @@ def test_load_genotypes_iterate(self): def test_iter_multiallelic_tr(self): expected = self._get_fake_genotypes_multiallelic_tr() - gts = GenotypesPLINK(DATADIR / "simple-tr-valid.pgen") + gts = GenotypesPLINK(DATADIR / "simple-tr.pgen") # Check that everything matches what we expected for idx, line in enumerate(gts): @@ -697,7 +697,7 @@ def _get_fake_genotypes_multiallelic(self): def test_iter(self): # Get the expected data expected = self._get_fake_genotypes_multiallelic() - gts = GenotypesPLINKTR(DATADIR / "simple-tr-valid.pgen") + gts = GenotypesPLINKTR(DATADIR / "simple-tr.pgen") # Check that everything matches what we expected for idx, line in enumerate(gts): np.testing.assert_allclose(line.data[:, :3], expected.data[:, idx]) @@ -710,7 +710,7 @@ def test_iter(self): def test_read(self): expected_alleles = self._get_fake_genotypes_multiallelic().data - gts = GenotypesPLINKTR(DATADIR / "simple-tr-valid.pgen") + gts = GenotypesPLINKTR(DATADIR / "simple-tr.pgen") gts.read() # check genotypes np.testing.assert_allclose(expected_alleles, gts.data) From 61e7163e9b249f73809822dda0d157ec3252e313 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 15 Aug 2023 08:13:37 -0700 Subject: [PATCH 28/40] handle allele_cts parameter when writing PGEN files --- haptools/data/genotypes.py | 43 ++++++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 9d0073cf..3ee442f2 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1436,7 +1436,7 @@ def write_variants(self): for contig in set(self.variants["chrom"]): vcf.header.contigs.add(contig) self.log.info("Writing PVAR records") - for var_idx, var in enumerate(self.variants): + for var in self.variants: rec = { "contig": var["chrom"], "start": var["pos"], @@ -1453,6 +1453,34 @@ 
def write_variants(self): # write the record to a file vcf.write(record) + def _num_unique_alleles(self, arr: npt.NDArray): + """ + Obtain the number of unique elements in each row of a genotype matrix + Needed by https://groups.google.com/g/plink2-users/c/Sn5qVCyDlDw/m/GOWScY6tAQAJ + + Adapted from https://stackoverflow.com/a/46575580 + + Parameters + ---------- + arr : npt.NDArray + A genotype matrix of shape (num_variants, num_samples, 2) and type np.uint8 + + Returns + ------- + npt.NDArray + An array of shape (num_variants,) and type np.uint32 containing allele + counts for each variant + """ + nrows = arr.shape[0] + row_coords = np.arange(nrows)[:, np.newaxis, np.newaxis] + allele_cts = np.zeros((nrows, np.iinfo(np.uint8).max+1, 2), dtype=np.bool_) + # mark whichever allele indices appear in arr then sum them to obtain counts + allele_cts[row_coords, arr, np.arange(2)] = 1 + allele_cts = allele_cts.any(axis=2).sum(axis=1, dtype=np.uint32) + # there are always at least two alleles + allele_cts[allele_cts < 2] = 2 + return allele_cts + def write(self): """ Write the variants in this class to PLINK2 files at @@ -1475,7 +1503,6 @@ def write(self): # write the pgen file pv = pgenlib.PvarReader(bytes(str(self.fname.with_suffix(".pvar")), "utf8")) with pgenlib.PgenWriter( - filename=bytes(str(self.fname), "utf8"), sample_ct=len(self.samples), variant_ct=len(self.variants), @@ -1497,6 +1524,9 @@ def write(self): missing = np.ascontiguousarray( data[start:end] == np.iinfo(np.uint8).max ) + # obtain the number of unique alleles for each variant + # https://stackoverflow.com/a/46575580 + allele_cts = self._num_unique_alleles(data[start:end]) subset_data = np.ascontiguousarray(data[start:end], dtype=np.int32) except np.core._exceptions._ArrayMemoryError as e: raise ValueError( @@ -1507,12 +1537,17 @@ def write(self): missing.resize((len(self.variants), len(self.samples) * 2)) # convert any missing genotypes to -9 subset_data[missing] = -9 + # finally, append the genotypes to the PGEN file if self._prephased or self.data.shape[2] < 3: - pgen.append_alleles_batch(subset_data, all_phased=True) + pgen.append_alleles_batch( + subset_data, all_phased=True, allele_cts=allele_cts, + ) else: # TODO: figure out why this sometimes leads to a corrupted file? subset_phase = self.data[:, start:end, 2].T.copy(order="C") - pgen.append_partially_phased_batch(subset_data, subset_phase) + pgen.append_partially_phased_batch( + subset_data, subset_phase, allele_cts=allele_cts, + ) del subset_phase del subset_data del missing From a32e1c3d7293ce3fc30d25b9a630a8a453b66463 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 15 Aug 2023 08:52:23 -0700 Subject: [PATCH 29/40] docs: the new GenotypesPLINKTR class also explain how to convert TR VCFs to PGEN --- docs/api/data.rst | 25 ++++++++++++++++++++++- docs/formats/genotypes.rst | 19 +++++++++++------ haptools/data/genotypes.py | 42 ++++++++++++++++++++++++++------------ 3 files changed, 66 insertions(+), 20 deletions(-) diff --git a/docs/api/data.rst b/docs/api/data.rst index e5a98e62..6be633c0 100644 --- a/docs/api/data.rst +++ b/docs/api/data.rst @@ -168,7 +168,7 @@ All of the other methods in the :class:`Genotypes` class are inherited, but the GenotypesTR ++++++++++++ -The :class:`GenotypesTR` class *extends* :class:`Genotypes` class. The :class:`GenotypesTR` class follows the same structure of :class:`GenotypesVCF`, but can now load repeat count of tandem repeats as the alleles. 
+The :class:`GenotypesTR` class *extends* the :class:`Genotypes` class. The :class:`GenotypesTR` class follows the same structure as :class:`GenotypesVCF`, but can now load repeat counts of tandem repeats as the alleles. All of the other methods in the :class:`Genotypes` class are inherited, but the :class:`GenotypesTR` class' ``load()`` function is unique to loading tandem repeat variants. @@ -178,6 +178,11 @@ All of the other methods in the :class:`Genotypes` class are inherited, but the # make the first sample have 4 and 7 repeats for the alleles of the fourth variant genotypes.data[0, 3] = (4, 7) +The following methods from the :class:`Genotypes` class are disabled, however. + +1. ``check_biallelic`` +2. ``check_maf`` + .. _api-data-genotypestr: GenotypesPLINK @@ -239,6 +244,24 @@ A large ``chunk_size`` is more likely to result in memory over-use while a small genotypes = data.GenotypesPLINK('tests/data/simple.pgen', chunk_size=500) genotypes.read() +GenotypesPLINKTR +++++++++++++++++ +The :class:`GenotypesPLINKTR` class extends the :class:`GenotypesPLINK` class to support loading tandem repeat variants. +The :class:`GenotypesPLINKTR` class works similarly to :class:`GenotypesTR` by filling the ``data`` property with repeat counts for each allele. + +The following methods from the :class:`GenotypesPLINK` class are disabled, however. + +1. ``write`` +2. ``check_maf`` +3. ``write_variants`` +4. ``check_biallelic`` + +The :class:`GenotypesPLINKTR` class uses INFO fields from the PVAR file to determine the repeat unit and the number of repeats for each allele. To ensure your PVAR file contains the necessary information, use the following command when converting from VCF. + +.. code-block:: bash + + plink2 --vcf-half-call m --make-pgen 'pvar-cols=vcfheader,qual,filter,info' --vcf input.vcf --out output + haplotypes.py ~~~~~~~~~~~~~ Overview diff --git a/docs/formats/genotypes.rst b/docs/formats/genotypes.rst index 2657c5f0..5828926e 100644 --- a/docs/formats/genotypes.rst +++ b/docs/formats/genotypes.rst @@ -9,14 +9,14 @@ Genotypes The time required to load various genotype file formats. VCF/BCF -------- +~~~~~~~ Genotype files must be specified as VCF or BCF files. They can be bgzip-compressed. .. _formats-genotypesplink: PLINK2 PGEN ------------ +~~~~~~~~~~~ There is also experimental support for `PLINK2 PGEN `_ files in some commands. These files can be loaded and created much more quickly than VCFs, so we highly recommend using them if you're working with large datasets. See the documentation for the :class:`GenotypesPLINK` class in :ref:`the API docs ` for more information. @@ -29,9 +29,16 @@ If you run out memory when using PGEN files, consider reading/writing variants f pip install haptools[files] -.. warning:: - At the moment, only biallelic SNPs can be encoded in PGEN files because of limitations in the ``Pgenlib`` python library. It doesn't properly support multiallelic variants yet (`source `_). To ensure your PGEN files only contain SNPs, we recommend use the following command to convert from VCF to PGEN. +Converting from VCF to PGEN +--------------------------- +To convert a VCF containing only biallelic SNPs to PGEN, use the following command. - .. code-block:: bash +.. code-block:: bash + + plink2 --snps-only 'just-acgt' --max-alleles 2 --vcf input.vcf --make-pgen --out output + +To convert a VCF containing tandem repeats to PGEN, use this command, instead. + +.. code-block:: bash - plink2 --snps-only 'just-acgt' --max-alleles 2 --vcf input.vcf --make-pgen --out output + plink2 --vcf-half-call m --make-pgen 'pvar-cols=vcfheader,qual,filter,info' --vcf input.vcf --out output
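As a quick sketch of the intended usage (assuming ``output.pgen`` is the file produced by the conversion command above; the class and its ``load()`` method are the ones added in this patch series):

.. code-block:: python

    from haptools.data import GenotypesPLINKTR

    # load TR genotypes from the converted .pgen/.pvar/.psam trio; the
    # data attribute then stores repeat counts rather than allele indices
    genotypes = GenotypesPLINKTR.load("output.pgen")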
diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 3ee442f2..fb39109d 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1,7 +1,6 @@ from __future__ import annotations import re import gc -import importlib.util from csv import reader from pathlib import Path from typing import Iterator @@ -819,7 +818,7 @@ class GenotypesTR(Genotypes): log: Logger See documentation for :py:attr:`~.Genotypes.log` vcftype: str - TR vcf type currently being read. + TR vcf type currently being read. Options are {'auto', 'gangstr', 'advntr', 'hipstr', 'eh', 'popstr'} """ @@ -939,19 +938,17 @@ class GenotypesPLINK(GenotypesVCF): data : npt.NDArray See documentation for :py:attr:`~.GenotypesVCF.data` samples : tuple - See documentation for :py:attr:`~.GenotypesVCF.data` + See documentation for :py:attr:`~.GenotypesVCF.samples` variants : np.array - See documentation for :py:attr:`~.GenotypesVCF.data` + See documentation for :py:attr:`~.GenotypesVCF.variants` log: Logger - See documentation for :py:attr:`~.GenotypesVCF.data` + See documentation for :py:attr:`~.GenotypesVCF.log` chunk_size: int, optional The max number of variants to fetch from and write to the PGEN file at any given time If this value is provided, variants from the PGEN file will be loaded in chunks so as to use less memory - _prephased: bool - See documentation for :py:attr:`~.GenotypesVCF.data` Examples -------- @@ -1555,6 +1552,29 @@ def write(self): class GenotypesPLINKTR(GenotypesPLINK): + """ + A class for processing repeat genotypes from a PLINK ``.pgen`` file + + Attributes + ---------- + data : npt.NDArray + See documentation for :py:attr:`~.GenotypesPLINK.data` + samples : tuple + See documentation for :py:attr:`~.GenotypesPLINK.samples` + variants : np.array + See documentation for :py:attr:`~.GenotypesPLINK.variants` + log: Logger + See documentation for :py:attr:`~.GenotypesPLINK.log` + chunk_size: int, optional + See documentation for :py:attr:`~.GenotypesPLINK.chunk_size` + vcftype: str, optional + See documentation for :py:attr:`~.GenotypesTR.vcftype` + + Examples + -------- + >>> genotypes = GenotypesPLINKTR.load('tests/data/simple-tr.pgen') + """ + def __init__(self, fname: Path | str, log: Logger = None, chunk_size: int = None , vcftype: str = "auto"): super().__init__(fname, log, chunk_size) self.vcftype = vcftype @@ -1582,11 +1602,9 @@ def load( samples : list[str], optional See documentation for :py:meth:`~.Genotypes.read` variants : set[str], optional - See documentation for :py:meth:`~.Genotypes.read + See documentation for :py:meth:`~.Genotypes.read` vcftype : str, optional - The type of TR VCF currently being read.
Options are: - {'auto', 'gangstr', 'advntr', 'hipstr', 'eh', 'popstr'} -` + See documentation for :py:meth:`~.GenotypesTR.vcftype` Returns ------- @@ -1723,5 +1741,3 @@ def check_biallelic(self): def check_maf(self): raise NotImplementedError - - From 834968993d9abcc1f35cd58dc6c079e26310bfc3 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 15 Aug 2023 08:55:46 -0700 Subject: [PATCH 30/40] add support for TR PGENs in clump and sim_phenotype --- haptools/clump.py | 11 ++++++----- haptools/sim_phenotype.py | 8 +++++++- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/haptools/clump.py b/haptools/clump.py index bf5d5d48..55fe74ce 100644 --- a/haptools/clump.py +++ b/haptools/clump.py @@ -1,13 +1,12 @@ #!/usr/bin/env python # To test: ./clumpSTR.py --summstats-snps tests/eur_gwas_pvalue_chr19.LDL.glm.linear --clump-snp-field ID --clump-field p-value --clump-chrom-field CHROM --clump-pos-field position --clump-p1 0.2 --out test.clump -import sys import math from logging import Logger, getLogger import numpy as np -from .data import Genotypes, GenotypesVCF, GenotypesTR +from .data import Genotypes, GenotypesVCF, GenotypesTR, GenotypesPLINKTR class Variant: @@ -557,15 +556,17 @@ def clumpstr( strgts = None gts = None if gts_snps: + log.debug("Loading SNP Genotypes.") if str(gts_snps).endswith("pgen"): - log.debug("Loading SNP Genotypes.") snpgts = GenotypesPLINK.load(gts_snps) else: - log.debug("Loading SNP Genotypes.") snpgts = GenotypesVCF.load(gts_snps) if gts_strs: log.debug("Loading STR Genotypes.") - strgts = GenotypesTR.load(gts_strs) + if str(gts_strs).endswith("pgen"): + strgts = GenotypesPLINKTR.load(gts_strs) + else: + strgts = GenotypesTR.load(gts_strs) if gts_snps and gts_strs: log.debug("Calculating set of overlapping samples between STRs and SNPs.") diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py index 1ed53ab8..df6909ec 100644 --- a/haptools/sim_phenotype.py +++ b/haptools/sim_phenotype.py @@ -14,6 +14,7 @@ Haplotypes, GenotypesTR, GenotypesPLINK, + GenotypesPLINKTR, Repeat as RepeatBase, Haplotype as HaplotypeBase, ) @@ -435,7 +436,12 @@ def simulate_pt( if repeats: log.info("Merging with TR genotypes") - tr_gt = GenotypesTR(fname=repeats, log=log) + if repeats.suffix == ".pgen": + log.info("Loading repeat genotypes from PGEN file") + tr_gt = GenotypesPLINKTR(fname=repeats, log=log, chunk_size=chunk_size) + else: + log.info("Loading repeat genotypes from VCF/BCF file") + tr_gt = GenotypesTR(fname=repeats, log=log) tr_gt.read(region=region, samples=samples, variants=haplotype_ids) tr_gt.check_missing() gt = Genotypes.merge_variants((gt, tr_gt), fname=None) From ec294bc94b1b17bc3265626477a0e32270f8e853 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 15 Aug 2023 08:56:52 -0700 Subject: [PATCH 31/40] add support for multiallelic PGENs in sim_genotype --- haptools/sim_genotype.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/haptools/sim_genotype.py b/haptools/sim_genotype.py index d17f6dfd..dcac3c59 100644 --- a/haptools/sim_genotype.py +++ b/haptools/sim_genotype.py @@ -111,10 +111,6 @@ def output_vcf( else: vcf.read(region=f"{region['chr']}:{region['start']}-{region['end']}") - if variant_file.endswith(".pgen"): - # the pgenlib library collapses multiallelic variants into a single allele - vcf.check_biallelic() - log.debug(f"Read in variants from {variant_file}") sample_dict = {} From 02cbc727235a37dc4efa9c247b7d7952c65b0b46 Mon Sep 17 00:00:00 
2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 31 Aug 2023 10:09:07 -0700 Subject: [PATCH 32/40] try and fail to speed up GenotypesPLINKTR.read() --- haptools/data/genotypes.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index fb39109d..cee1189e 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1666,8 +1666,8 @@ def read( super().read(region,samples,variants,max_variants) num_variants = len(self.variants) # initialize a jagged array of allele lengths - max_num_alleles = max(map(len, self.variants["alleles"])) - allele_lens = np.empty((len(self.variants), max_num_alleles), dtype=self.data.dtype) + num_alleles = np.fromiter(map(len, self.variants["alleles"]), dtype=np.uint8, count=num_variants) + allele_lens = np.empty((num_variants, num_alleles.max()+1), dtype=self.data.dtype) # iterate through each TR and extract the REF and ALT allele lengths for idx, record in enumerate(self._iter_TRRecords(region, variants)): if idx > num_variants: @@ -1675,16 +1675,16 @@ def read( break # extract allele lengths from TRRecord object allele_lens[idx, 0] = record.ref_allele_length - num_alleles = len(record.alt_allele_lengths) + 1 - allele_lens[idx, 1:num_alleles] = record.alt_allele_lengths + num_alleles_rec = num_alleles[idx] + allele_lens[idx, 1:num_alleles_rec] = record.alt_allele_lengths + allele_lens[idx, num_alleles_rec] = np.iinfo(np.uint8).max # record missing entries and then set them all to REF missing = self.data[:, :, :2] == np.iinfo(np.uint8).max - self.data[:, :, :2][missing] = 0 + breakpoint() + np.putmask(self.data[:, :, :2], missing, num_alleles) # convert from genotype indices to allele lengths variant_coords = np.arange(num_variants)[:, np.newaxis] self.data[:, :, :2] = allele_lens[variant_coords, self.data[:, :, :2]] - # restore missing entries - self.data[:, :, :2][missing] = np.iinfo(np.uint8).max # clean up memory del missing del allele_lens From b8f2c4577e338077c306e23300214188b2c8028a Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 31 Aug 2023 10:09:17 -0700 Subject: [PATCH 33/40] Revert "try and fail to speed up GenotypesPLINKTR.read()" This reverts commit 02cbc727235a37dc4efa9c247b7d7952c65b0b46. 
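For reference, the conversion logic that this revert restores boils down to a single fancy-indexing step. A minimal, self-contained sketch of that step, using toy arrays in place of haptools' real data (``np.iinfo(np.uint8).max`` is the missing-genotype sentinel, as in the patches above):

.. code-block:: python

    import numpy as np

    sentinel = np.iinfo(np.uint8).max  # 255 marks a missing genotype
    # row i holds the allele lengths for variant i: [REF, ALT1, ALT2]
    allele_lens = np.array([[3, 6, 9], [2, 4, 6]], dtype=np.uint8)
    # genotype indices, shaped (samples, variants, 2)
    data = np.array(
        [[[0, 1], [1, 0]], [[2, sentinel], [0, 2]]], dtype=np.uint8
    )

    # record missing entries, then point them at REF so indexing stays in bounds
    missing = data == sentinel
    data[missing] = 0
    # broadcast variant indices so each variant looks up its own row of lengths
    variant_coords = np.arange(allele_lens.shape[0])[:, np.newaxis]
    data = allele_lens[variant_coords, data]
    # restore the missing sentinel
    data[missing] = sentinel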
--- haptools/data/genotypes.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index cee1189e..fb39109d 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1666,8 +1666,8 @@ def read( super().read(region,samples,variants,max_variants) num_variants = len(self.variants) # initialize a jagged array of allele lengths - num_alleles = np.fromiter(map(len, self.variants["alleles"]), dtype=np.uint8, count=num_variants) - allele_lens = np.empty((num_variants, num_alleles.max()+1), dtype=self.data.dtype) + max_num_alleles = max(map(len, self.variants["alleles"])) + allele_lens = np.empty((len(self.variants), max_num_alleles), dtype=self.data.dtype) # iterate through each TR and extract the REF and ALT allele lengths for idx, record in enumerate(self._iter_TRRecords(region, variants)): if idx > num_variants: @@ -1675,16 +1675,16 @@ def read( break # extract allele lengths from TRRecord object allele_lens[idx, 0] = record.ref_allele_length - num_alleles_rec = num_alleles[idx] - allele_lens[idx, 1:num_alleles_rec] = record.alt_allele_lengths - allele_lens[idx, num_alleles_rec] = np.iinfo(np.uint8).max + num_alleles = len(record.alt_allele_lengths) + 1 + allele_lens[idx, 1:num_alleles] = record.alt_allele_lengths # record missing entries and then set them all to REF missing = self.data[:, :, :2] == np.iinfo(np.uint8).max - breakpoint() - np.putmask(self.data[:, :, :2], missing, num_alleles) + self.data[:, :, :2][missing] = 0 # convert from genotype indices to allele lengths variant_coords = np.arange(num_variants)[:, np.newaxis] self.data[:, :, :2] = allele_lens[variant_coords, self.data[:, :, :2]] + # restore missing entries + self.data[:, :, :2][missing] = np.iinfo(np.uint8).max # clean up memory del missing del allele_lens From 394f5ceb83761a8fa8efe622bfda88cf3a45bd69 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 15 Sep 2023 05:33:10 -0700 Subject: [PATCH 34/40] remove gxx requirement now that pgenlib is published with wheels --- docs/project_info/contributing.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/project_info/contributing.rst b/docs/project_info/contributing.rst index 5b413a41..1b278815 100644 --- a/docs/project_info/contributing.rst +++ b/docs/project_info/contributing.rst @@ -70,7 +70,7 @@ Follow these steps to set up a development environment. .. code-block:: bash - conda create -n haptools-dev -c conda-forge 'poetry>=1.1.15' 'python=3.7' 'gxx_linux-64' + conda create -n haptools-dev -c conda-forge 'poetry>=1.1.15' 'python=3.7' 2. Activate the environment .. 
code-block:: bash From d97c7575cd92b7c8627e6bae21845b7c973f1e28 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 15 Sep 2023 05:33:13 -0700 Subject: [PATCH 35/40] refmt with black --- haptools/data/genotypes.py | 46 ++++++++++++++++++++++++++------------ tests/test_data.py | 27 +++++++++++++++------- 2 files changed, 51 insertions(+), 22 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index fb39109d..cff52ec8 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1241,12 +1241,12 @@ def read( """ super(Genotypes, self).read() import pgenlib - + sample_idxs = self.read_samples(samples) pv = pgenlib.PvarReader(bytes(str(self.fname.with_suffix(".pvar")), "utf8")) - + with pgenlib.PgenReader( - bytes(str(self.fname), "utf8"), sample_subset=sample_idxs, pvar = pv + bytes(str(self.fname), "utf8"), sample_subset=sample_idxs, pvar=pv ) as pgen: # how many variants to load? if variants is not None: @@ -1403,7 +1403,7 @@ def __iter__( sample_idxs = self.read_samples(samples) pgen = pgenlib.PgenReader( - bytes(str(self.fname), "utf8"), sample_subset=sample_idxs, pvar = pv + bytes(str(self.fname), "utf8"), sample_subset=sample_idxs, pvar=pv ) # call another function to force the lines above to be run immediately # see https://stackoverflow.com/a/36726497 @@ -1470,7 +1470,7 @@ def _num_unique_alleles(self, arr: npt.NDArray): """ nrows = arr.shape[0] row_coords = np.arange(nrows)[:, np.newaxis, np.newaxis] - allele_cts = np.zeros((nrows, np.iinfo(np.uint8).max+1, 2), dtype=np.bool_) + allele_cts = np.zeros((nrows, np.iinfo(np.uint8).max + 1, 2), dtype=np.bool_) # mark whichever allele indices appear in arr then sum them to obtain counts allele_cts[row_coords, arr, np.arange(2)] = 1 allele_cts = allele_cts.any(axis=2).sum(axis=1, dtype=np.uint32) @@ -1503,7 +1503,7 @@ def write(self): filename=bytes(str(self.fname), "utf8"), sample_ct=len(self.samples), variant_ct=len(self.variants), - allele_ct_limit = pv.get_max_allele_ct(), + allele_ct_limit=pv.get_max_allele_ct(), nonref_flags=False, hardcall_phase_present=True, ) as pgen: @@ -1537,13 +1537,17 @@ def write(self): # finally, append the genotypes to the PGEN file if self._prephased or self.data.shape[2] < 3: pgen.append_alleles_batch( - subset_data, all_phased=True, allele_cts=allele_cts, + subset_data, + all_phased=True, + allele_cts=allele_cts, ) else: # TODO: figure out why this sometimes leads to a corrupted file? 
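# note for the block below: subset_phase pulls the per-call phase flags
# (column 2 of self.data) and transposes them to variant-major order; the
# C-contiguous copy appears to be required because pgenlib consumes one
# contiguous row of phase flags per variant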
subset_phase = self.data[:, start:end, 2].T.copy(order="C") pgen.append_partially_phased_batch( - subset_data, subset_phase, allele_cts=allele_cts, + subset_data, + subset_phase, + allele_cts=allele_cts, ) del subset_phase del subset_data @@ -1575,7 +1579,13 @@ class GenotypesPLINKTR(GenotypesPLINK): >>> genotypes = GenotypesPLINK.load('tests/data/simple.pgen') """ - def __init__(self, fname: Path | str, log: Logger = None, chunk_size: int = None , vcftype: str = "auto"): + def __init__( + self, + fname: Path | str, + log: Logger = None, + chunk_size: int = None, + vcftype: str = "auto", + ): super().__init__(fname, log, chunk_size) self.vcftype = vcftype @@ -1615,7 +1625,7 @@ def load( genotypes.read(region, samples, variants) genotypes.check_phase() return genotypes - + def _iter_TRRecords(self, region: str = None, variants: set[str] = None): """ Yield TRRecord objects from the PVAR file @@ -1634,7 +1644,10 @@ def _iter_TRRecords(self, region: str = None, variants: set[str] = None): """ vcf = VCF(self.fname.with_suffix(".pvar")) tr_records = trh.TRRecordHarmonizer( - vcffile=vcf, vcfiter=vcf(region), region=region, vcftype=self.vcftype, + vcffile=vcf, + vcfiter=vcf(region), + region=region, + vcftype=self.vcftype, ) # filter out TRs that we didn't want if variants is not None: @@ -1663,11 +1676,13 @@ def read( max_variants : int, optional See documentation for :py:attr:`~.GenotypesVCF.read` """ - super().read(region,samples,variants,max_variants) + super().read(region, samples, variants, max_variants) num_variants = len(self.variants) # initialize a jagged array of allele lengths max_num_alleles = max(map(len, self.variants["alleles"])) - allele_lens = np.empty((len(self.variants), max_num_alleles), dtype=self.data.dtype) + allele_lens = np.empty( + (len(self.variants), max_num_alleles), dtype=self.data.dtype + ) # iterate through each TR and extract the REF and ALT allele lengths for idx, record in enumerate(self._iter_TRRecords(region, variants)): if idx > num_variants: @@ -1720,7 +1735,10 @@ def _iterate( variants = super()._iterate(pgen, region, variants) for variant, record in zip(variants, tr_records): # extract the REF and ALT allele lengths - allele_lens = np.array([record.ref_allele_length, *record.alt_allele_lengths], dtype=variant.data.dtype) + allele_lens = np.array( + [record.ref_allele_length, *record.alt_allele_lengths], + dtype=variant.data.dtype, + ) # record missing entries and then set them all to REF missing = variant.data[:, :2] == np.iinfo(np.uint8).max variant.data[:, :2][missing] = 0 diff --git a/tests/test_data.py b/tests/test_data.py index 4b7394fc..02184949 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -367,7 +367,14 @@ def _get_fake_genotypes_multiallelic_tr(self): tuple("GTT" * i for i in range(1, 10)), ("ACACAC", "AC"), ("AAA", "AAAA"), - ("GTGT", "GTGTGT", "GTGTGTGT", "GTGTGTGTGTGTGT", "GTGTGTGTGTGTGTGTGTGTGT", "GT"), + ( + "GTGT", + "GTGTGT", + "GTGTGTGT", + "GTGTGTGTGTGTGT", + "GTGTGTGTGTGTGTGTGTGTGT", + "GT", + ), ("GGG", "GGGGG"), ] gts.variants = np.array( @@ -471,7 +478,9 @@ def test_iter_multiallelic_tr(self): np.testing.assert_allclose(line.data[:, :3], expected.data[:, idx]) for col in ("chrom", "pos", "id"): assert line.variants[col] == expected.variants[col][idx] - assert line.variants["alleles"].tolist() == expected.variants["alleles"][idx] + assert ( + line.variants["alleles"].tolist() == expected.variants["alleles"][idx] + ) # Check samples assert gts.samples == expected.samples @@ -614,7 +623,7 @@ def 
test_write_multiallelic(self): gts_multiallelic.fname = fname gts_multiallelic.write() - #Read the data from the newly created PLINK file + # Read the data from the newly created PLINK file new_gts = GenotypesPLINK(fname) new_gts.read() new_gts.check_phase() @@ -626,7 +635,7 @@ def test_write_multiallelic(self): for col in ("chrom", "pos", "id", "alleles"): assert gts_multiallelic.variants[col][i] == new_gts.variants[col][i] - #clean the files created after the test + # clean the files created after the test fname.with_suffix(".psam").unlink() fname.with_suffix(".pvar").unlink() fname.unlink() @@ -641,7 +650,7 @@ def test_write_multiallelic_tr(self): gts_multiallelic.fname = fname gts_multiallelic.write() - #Read the data from the newly created PLINK file + # Read the data from the newly created PLINK file new_gts = GenotypesPLINK(fname) new_gts.read() @@ -652,7 +661,7 @@ def test_write_multiallelic_tr(self): for col in ("chrom", "pos", "id", "alleles"): assert gts_multiallelic.variants[col][i] == new_gts.variants[col][i] - #clean the files created after the test + # clean the files created after the test fname.with_suffix(".psam").unlink() fname.with_suffix(".pvar").unlink() fname.unlink() @@ -693,7 +702,7 @@ def _get_fake_genotypes_multiallelic(self): # to see if we can change chrX representations to properly simulate from it return gts - + def test_iter(self): # Get the expected data expected = self._get_fake_genotypes_multiallelic() @@ -703,7 +712,9 @@ def test_iter(self): np.testing.assert_allclose(line.data[:, :3], expected.data[:, idx]) for col in ("chrom", "pos", "id"): assert line.variants[col] == expected.variants[col][idx] - assert line.variants["alleles"].tolist() == expected.variants["alleles"][idx] + assert ( + line.variants["alleles"].tolist() == expected.variants["alleles"][idx] + ) # Check samples assert gts.samples == expected.samples From b6bf7ecfe96dca4d494546c877b6e047e6af80ab Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 15 Sep 2023 13:09:41 -0700 Subject: [PATCH 36/40] do not copy alleles from list to tuple --- haptools/data/genotypes.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index cff52ec8..7f327c66 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1072,15 +1072,13 @@ def _variant_arr( A row from the :py:attr:`~.GenotypesPLINK.variants` array """ # Parse the REF and ALT alleles from the PVAR record - ref_allele = record[cid["REF"]] - alt_alleles = record[cid["ALT"]].split(",") - alleles = [ref_allele] + alt_alleles + alleles = (record[cid["REF"]], *record[cid["ALT"]].split(",")) return np.array( ( record[cid["ID"]], record[cid["CHROM"]], record[cid["POS"]], - tuple(alleles), + alleles, ), dtype=self.variants.dtype, ) From 1edea76c0363bc44f218c7be6e5560176da6675a Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 15 Sep 2023 14:27:16 -0700 Subject: [PATCH 37/40] handle gts data arrays with dtype np.bool_ when counting uniq vals --- haptools/data/genotypes.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 7f327c66..a72b3bab 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1469,8 +1469,11 @@ def _num_unique_alleles(self, arr: npt.NDArray): nrows = arr.shape[0] row_coords = np.arange(nrows)[:, np.newaxis, np.newaxis] allele_cts = np.zeros((nrows, 
np.iinfo(np.uint8).max + 1, 2), dtype=np.bool_) + # ensure the arr values are interpreted as uint8's + if arr.dtype != np.uint8: + arr = arr.view("uint8") # mark whichever allele indices appear in arr then sum them to obtain counts - allele_cts[row_coords, arr, np.arange(2)] = 1 + allele_cts[row_coords, arr] = 1 allele_cts = allele_cts.any(axis=2).sum(axis=1, dtype=np.uint32) # there are always at least two alleles allele_cts[allele_cts < 2] = 2 From e13d6f24320bb65c499a8a9a0604dc016eff2dfd Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 16 Sep 2023 08:51:27 -0700 Subject: [PATCH 38/40] test multiallelic pgen reading/writing --- tests/data/simple-tr.pgen | Bin 54 -> 56 bytes tests/data/simple-tr.pvar | 2 +- tests/data/simple-tr.vcf | 4 ++-- tests/test_data.py | 2 ++ 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/data/simple-tr.pgen b/tests/data/simple-tr.pgen index b852ca9b9cebe6b0dbbf9d8f85a263f156012b66..da6d0141b9e938e6c16e5be01c3bb9d533ef5afe 100644 GIT binary patch delta 36 rcmXrBm>?<7#lpzRw33ORU(q?WT$o`g12Y4|JZ1()MrLCHrgO{yZ-NEU delta 34 pcmcC8n; ##contig= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00096 HG00097 HG00099 HG00100 HG00101 -1 10114 1:10114:GTT GTT GTTGTT,GTTGTTGTT,GTTGTTGTTGTT,GTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTTGTTGTT . . START=10114;END=10999;PERIOD=3 GT 0|1 2|3 4|5 6|7 8|8 -1 11000 1:10116:ACACAC ACACAC AC . . START=11000;END=11016;PERIOD=2 GT 0|1 0|1 1|1 1|1 0|0 +1 10114 1:10114:GTT GTT GTTGTT,GTTGTTGTT,GTTGTTGTTGTT,GTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTTGTTGTT . . START=10114;END=10999;PERIOD=3 GT 0|1 2/3 4/5 6|7 8|8 +1 11000 1:10116:ACACAC ACACAC AC . . START=11000;END=11016;PERIOD=2 GT 0|1 0/1 1/1 1|1 0|0 1 12117 1:10117:AAA AAA AAAA . . START=12117;END=12143;PERIOD=1 GT 0|0 0|0 0|0 0|0 0|0 1 13122 1:10122:GTGT GTGT GTGTGT,GTGTGTGT,GTGTGTGTGTGTGT,GTGTGTGTGTGTGTGTGTGTGT,GT . . START=13122;END=13150;PERIOD=2 GT 4|4 . 3|0 1|2 4|. X 20122 1:20122:GGG GGG GGGGG . . START=20122;END=20150;PERIOD=1 GT 1 . 0 . . 
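Since ``_num_unique_alleles`` has now been touched by two patches, a simplified standalone sketch of its presence-mask trick may help. This collapses the per-haplotype axis that the real method keeps, and uses toy values:

.. code-block:: python

    import numpy as np

    # genotype indices shaped (variants, samples * 2), dtype uint8
    arr = np.array([[0, 1, 0, 0], [2, 2, 2, 2]], dtype=np.uint8)
    nrows = arr.shape[0]
    row_coords = np.arange(nrows)[:, np.newaxis]
    # mark which allele indices occur anywhere in each variant's genotypes
    present = np.zeros((nrows, np.iinfo(np.uint8).max + 1), dtype=np.bool_)
    present[row_coords, arr] = True
    allele_cts = present.sum(axis=1, dtype=np.uint32)
    allele_cts[allele_cts < 2] = 2  # a PGEN variant always has >= 2 alleles
    print(allele_cts)  # [2 2]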
diff --git a/tests/test_data.py b/tests/test_data.py index 02184949..e35dccc6 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -354,6 +354,8 @@ def _get_fake_genotypes_multiallelic_tr(self): phasing = np.ones(data.shape[:2] + (1,)).astype(np.uint8) phasing[4, [1, 3, 4]] = 0 phasing[3, [1, 4]] = 0 + phasing[:2, 1:3] = 0 + phasing[1, 2] = 1 # since homozygous unphased variants are marked as phased gts.data = np.concatenate((data, phasing), axis=-1).transpose((1, 0, 2)) # handle half-calls and chrX according to the flag: # --vcf-half-call m From 7aab3c04bcca2bc88cbfe6810bef538284be1f2c Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 16 Sep 2023 08:51:49 -0700 Subject: [PATCH 39/40] make sure we can write bool gts too --- tests/test_data.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/test_data.py b/tests/test_data.py index e35dccc6..f88cf09e 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -615,6 +615,32 @@ def test_write_genotypes_unphased(self): fname.with_suffix(".pvar").unlink() fname.unlink() + def test_write_genotypes_biallelic(self): + gts = self._get_fake_genotypes_plink() + + # convert to bool - we should be able to handle both + gts.data = gts.data.astype(np.bool_) + + fname = DATADIR / "test_write.pgen" + gts.fname = fname + gts.write() + + new_gts = GenotypesPLINK(fname) + new_gts.read() + new_gts.check_phase() + + # check that everything matches what we expected + np.testing.assert_allclose(gts.data, new_gts.data) + assert gts.samples == new_gts.samples + for i in range(len(new_gts.variants)): + for col in ("chrom", "pos", "id", "alleles"): + assert gts.variants[col][i] == new_gts.variants[col][i] + + # clean up afterwards: delete the files we created + fname.with_suffix(".psam").unlink() + fname.with_suffix(".pvar").unlink() + fname.unlink() + def test_write_multiallelic(self): # Create fake multi-allelic genotype data to write to the PLINK file gts_multiallelic = self._get_fake_genotypes_multiallelic() From f399d8e59d1cee119bbbca615090de48274733bd Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 16 Sep 2023 08:53:10 -0700 Subject: [PATCH 40/40] refmt with black --- tests/test_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_data.py b/tests/test_data.py index f88cf09e..07ec5b02 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -355,7 +355,7 @@ def _get_fake_genotypes_multiallelic_tr(self): phasing[4, [1, 3, 4]] = 0 phasing[3, [1, 4]] = 0 phasing[:2, 1:3] = 0 - phasing[1, 2] = 1 # since homozygous unphased variants are marked as phased + phasing[1, 2] = 1 # since homozygous unphased variants are marked as phased gts.data = np.concatenate((data, phasing), axis=-1).transpose((1, 0, 2)) # handle half-calls and chrX according to the flag: # --vcf-half-call m
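As a closing note on the phasing fixes in the last few patches: the third column of ``data`` is a per-call phase flag, and homozygous calls keep a flag of 1 even when written unphased, since phase only carries information for heterozygous genotypes. A toy illustration of the layout these tests construct (hypothetical values):

.. code-block:: python

    import numpy as np

    # one sample, three variants: a phased het, an unphased het, and an
    # unphased hom; columns 0-1 are alleles, column 2 is the phase flag
    data = np.array([[[0, 1], [2, 3], [1, 1]]], dtype=np.uint8)
    phasing = np.array([[[1], [0], [1]]], dtype=np.uint8)  # hom 1/1 -> 1
    gts = np.concatenate((data, phasing), axis=-1)
    assert gts.shape == (1, 3, 3)  # (samples, variants, alleles + phase)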