From 296e75da20f336a77e9698f2f4788db5499a46ee Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 19 Jul 2023 12:58:00 -0700 Subject: [PATCH 01/40] update Pgenlib to >=0.90.1 --- poetry.lock | 249 +++++++++++++++++++++++++++++++++---------------- pyproject.toml | 2 +- 2 files changed, 171 insertions(+), 80 deletions(-) diff --git a/poetry.lock b/poetry.lock index d5b353f5..37053a06 100644 --- a/poetry.lock +++ b/poetry.lock @@ -153,7 +153,7 @@ development = ["black", "flake8", "mypy", "pytest", "types-colorama"] [[package]] name = "coverage" -version = "7.1.0" +version = "7.2.7" description = "Code coverage measurement for Python" category = "dev" optional = false @@ -173,6 +173,14 @@ category = "main" optional = false python-versions = ">=3.6" +[[package]] +name = "cython" +version = "3.0.0" +description = "The Cython compiler for writing C extensions in the Python language." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + [[package]] name = "cyvcf2" version = "0.30.18" @@ -526,7 +534,7 @@ ptyprocess = ">=0.5" [[package]] name = "pgenlib" -version = "0.81.3" +version = "0.90.1" description = "Python wrapper for pgenlib's basic reader and writer." category = "main" optional = true @@ -646,11 +654,14 @@ python-versions = "*" [[package]] name = "pysam" -version = "0.19.0" -description = "pysam" +version = "0.21.0" +description = "pysam - a python module for reading, manipulating and writing genomic data sets." category = "main" optional = false -python-versions = "*" +python-versions = ">=3.6" + +[package.dependencies] +cython = "*" [[package]] name = "pytest" @@ -992,7 +1003,7 @@ tests = [] [metadata] lock-version = "1.1" python-versions = ">=3.7,<3.11" -content-hash = "c06b769c137c78eec0fa14f907979cf92c1f4c7d2473e9f2a12f2a5d8eb31225" +content-hash = "30ca6f38b3c8fc1f4f13fff84d6923bba6b00acdd037498c81d0183eebee50d4" [metadata.files] alabaster = [ @@ -1067,62 +1078,131 @@ colorlog = [ {file = "colorlog-6.7.0.tar.gz", hash = "sha256:bd94bd21c1e13fac7bd3153f4bc3a7dc0eb0974b8bc2fdf1a989e474f6e582e5"}, ] coverage = [ - {file = "coverage-7.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3b946bbcd5a8231383450b195cfb58cb01cbe7f8949f5758566b881df4b33baf"}, - {file = "coverage-7.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ec8e767f13be637d056f7e07e61d089e555f719b387a7070154ad80a0ff31801"}, - {file = "coverage-7.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d4a5a5879a939cb84959d86869132b00176197ca561c664fc21478c1eee60d75"}, - {file = "coverage-7.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b643cb30821e7570c0aaf54feaf0bfb630b79059f85741843e9dc23f33aaca2c"}, - {file = "coverage-7.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32df215215f3af2c1617a55dbdfb403b772d463d54d219985ac7cd3bf124cada"}, - {file = "coverage-7.1.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:33d1ae9d4079e05ac4cc1ef9e20c648f5afabf1a92adfaf2ccf509c50b85717f"}, - {file = "coverage-7.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:29571503c37f2ef2138a306d23e7270687c0efb9cab4bd8038d609b5c2393a3a"}, - {file = "coverage-7.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:63ffd21aa133ff48c4dff7adcc46b7ec8b565491bfc371212122dd999812ea1c"}, - {file = "coverage-7.1.0-cp310-cp310-win32.whl", hash = 
"sha256:4b14d5e09c656de5038a3f9bfe5228f53439282abcab87317c9f7f1acb280352"}, - {file = "coverage-7.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:8361be1c2c073919500b6601220a6f2f98ea0b6d2fec5014c1d9cfa23dd07038"}, - {file = "coverage-7.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:da9b41d4539eefd408c46725fb76ecba3a50a3367cafb7dea5f250d0653c1040"}, - {file = "coverage-7.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c5b15ed7644ae4bee0ecf74fee95808dcc34ba6ace87e8dfbf5cb0dc20eab45a"}, - {file = "coverage-7.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d12d076582507ea460ea2a89a8c85cb558f83406c8a41dd641d7be9a32e1274f"}, - {file = "coverage-7.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2617759031dae1bf183c16cef8fcfb3de7617f394c813fa5e8e46e9b82d4222"}, - {file = "coverage-7.1.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c4e4881fa9e9667afcc742f0c244d9364d197490fbc91d12ac3b5de0bf2df146"}, - {file = "coverage-7.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:9d58885215094ab4a86a6aef044e42994a2bd76a446dc59b352622655ba6621b"}, - {file = "coverage-7.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:ffeeb38ee4a80a30a6877c5c4c359e5498eec095878f1581453202bfacc8fbc2"}, - {file = "coverage-7.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3baf5f126f30781b5e93dbefcc8271cb2491647f8283f20ac54d12161dff080e"}, - {file = "coverage-7.1.0-cp311-cp311-win32.whl", hash = "sha256:ded59300d6330be27bc6cf0b74b89ada58069ced87c48eaf9344e5e84b0072f7"}, - {file = "coverage-7.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:6a43c7823cd7427b4ed763aa7fb63901ca8288591323b58c9cd6ec31ad910f3c"}, - {file = "coverage-7.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:7a726d742816cb3a8973c8c9a97539c734b3a309345236cd533c4883dda05b8d"}, - {file = "coverage-7.1.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc7c85a150501286f8b56bd8ed3aa4093f4b88fb68c0843d21ff9656f0009d6a"}, - {file = "coverage-7.1.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f5b4198d85a3755d27e64c52f8c95d6333119e49fd001ae5798dac872c95e0f8"}, - {file = "coverage-7.1.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddb726cb861c3117a553f940372a495fe1078249ff5f8a5478c0576c7be12050"}, - {file = "coverage-7.1.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:51b236e764840a6df0661b67e50697aaa0e7d4124ca95e5058fa3d7cbc240b7c"}, - {file = "coverage-7.1.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:7ee5c9bb51695f80878faaa5598040dd6c9e172ddcf490382e8aedb8ec3fec8d"}, - {file = "coverage-7.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:c31b75ae466c053a98bf26843563b3b3517b8f37da4d47b1c582fdc703112bc3"}, - {file = "coverage-7.1.0-cp37-cp37m-win32.whl", hash = "sha256:3b155caf3760408d1cb903b21e6a97ad4e2bdad43cbc265e3ce0afb8e0057e73"}, - {file = "coverage-7.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:2a60d6513781e87047c3e630b33b4d1e89f39836dac6e069ffee28c4786715f5"}, - {file = "coverage-7.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f2cba5c6db29ce991029b5e4ac51eb36774458f0a3b8d3137241b32d1bb91f06"}, - {file = "coverage-7.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:beeb129cacea34490ffd4d6153af70509aa3cda20fdda2ea1a2be870dfec8d52"}, - {file = 
"coverage-7.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0c45948f613d5d18c9ec5eaa203ce06a653334cf1bd47c783a12d0dd4fd9c851"}, - {file = "coverage-7.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ef382417db92ba23dfb5864a3fc9be27ea4894e86620d342a116b243ade5d35d"}, - {file = "coverage-7.1.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c7c0d0827e853315c9bbd43c1162c006dd808dbbe297db7ae66cd17b07830f0"}, - {file = "coverage-7.1.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e5cdbb5cafcedea04924568d990e20ce7f1945a1dd54b560f879ee2d57226912"}, - {file = "coverage-7.1.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:9817733f0d3ea91bea80de0f79ef971ae94f81ca52f9b66500c6a2fea8e4b4f8"}, - {file = "coverage-7.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:218fe982371ac7387304153ecd51205f14e9d731b34fb0568181abaf7b443ba0"}, - {file = "coverage-7.1.0-cp38-cp38-win32.whl", hash = "sha256:04481245ef966fbd24ae9b9e537ce899ae584d521dfbe78f89cad003c38ca2ab"}, - {file = "coverage-7.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:8ae125d1134bf236acba8b83e74c603d1b30e207266121e76484562bc816344c"}, - {file = "coverage-7.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2bf1d5f2084c3932b56b962a683074a3692bce7cabd3aa023c987a2a8e7612f6"}, - {file = "coverage-7.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:98b85dd86514d889a2e3dd22ab3c18c9d0019e696478391d86708b805f4ea0fa"}, - {file = "coverage-7.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38da2db80cc505a611938d8624801158e409928b136c8916cd2e203970dde4dc"}, - {file = "coverage-7.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3164d31078fa9efe406e198aecd2a02d32a62fecbdef74f76dad6a46c7e48311"}, - {file = "coverage-7.1.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db61a79c07331e88b9a9974815c075fbd812bc9dbc4dc44b366b5368a2936063"}, - {file = "coverage-7.1.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9ccb092c9ede70b2517a57382a601619d20981f56f440eae7e4d7eaafd1d1d09"}, - {file = "coverage-7.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:33ff26d0f6cc3ca8de13d14fde1ff8efe1456b53e3f0273e63cc8b3c84a063d8"}, - {file = "coverage-7.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d47dd659a4ee952e90dc56c97d78132573dc5c7b09d61b416a9deef4ebe01a0c"}, - {file = "coverage-7.1.0-cp39-cp39-win32.whl", hash = "sha256:d248cd4a92065a4d4543b8331660121b31c4148dd00a691bfb7a5cdc7483cfa4"}, - {file = "coverage-7.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:7ed681b0f8e8bcbbffa58ba26fcf5dbc8f79e7997595bf071ed5430d8c08d6f3"}, - {file = "coverage-7.1.0-pp37.pp38.pp39-none-any.whl", hash = "sha256:755e89e32376c850f826c425ece2c35a4fc266c081490eb0a841e7c1cb0d3bda"}, - {file = "coverage-7.1.0.tar.gz", hash = "sha256:10188fe543560ec4874f974b5305cd1a8bdcfa885ee00ea3a03733464c4ca265"}, + {file = "coverage-7.2.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d39b5b4f2a66ccae8b7263ac3c8170994b65266797fb96cbbfd3fb5b23921db8"}, + {file = "coverage-7.2.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6d040ef7c9859bb11dfeb056ff5b3872436e3b5e401817d87a31e1750b9ae2fb"}, + {file = "coverage-7.2.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ba90a9563ba44a72fda2e85302c3abc71c5589cea608ca16c22b9804262aaeb6"}, 
+ {file = "coverage-7.2.7-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7d9405291c6928619403db1d10bd07888888ec1abcbd9748fdaa971d7d661b2"}, + {file = "coverage-7.2.7-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31563e97dae5598556600466ad9beea39fb04e0229e61c12eaa206e0aa202063"}, + {file = "coverage-7.2.7-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:ebba1cd308ef115925421d3e6a586e655ca5a77b5bf41e02eb0e4562a111f2d1"}, + {file = "coverage-7.2.7-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:cb017fd1b2603ef59e374ba2063f593abe0fc45f2ad9abdde5b4d83bd922a353"}, + {file = "coverage-7.2.7-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d62a5c7dad11015c66fbb9d881bc4caa5b12f16292f857842d9d1871595f4495"}, + {file = "coverage-7.2.7-cp310-cp310-win32.whl", hash = "sha256:ee57190f24fba796e36bb6d3aa8a8783c643d8fa9760c89f7a98ab5455fbf818"}, + {file = "coverage-7.2.7-cp310-cp310-win_amd64.whl", hash = "sha256:f75f7168ab25dd93110c8a8117a22450c19976afbc44234cbf71481094c1b850"}, + {file = "coverage-7.2.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:06a9a2be0b5b576c3f18f1a241f0473575c4a26021b52b2a85263a00f034d51f"}, + {file = "coverage-7.2.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5baa06420f837184130752b7c5ea0808762083bf3487b5038d68b012e5937dbe"}, + {file = "coverage-7.2.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fdec9e8cbf13a5bf63290fc6013d216a4c7232efb51548594ca3631a7f13c3a3"}, + {file = "coverage-7.2.7-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:52edc1a60c0d34afa421c9c37078817b2e67a392cab17d97283b64c5833f427f"}, + {file = "coverage-7.2.7-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:63426706118b7f5cf6bb6c895dc215d8a418d5952544042c8a2d9fe87fcf09cb"}, + {file = "coverage-7.2.7-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:afb17f84d56068a7c29f5fa37bfd38d5aba69e3304af08ee94da8ed5b0865833"}, + {file = "coverage-7.2.7-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:48c19d2159d433ccc99e729ceae7d5293fbffa0bdb94952d3579983d1c8c9d97"}, + {file = "coverage-7.2.7-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0e1f928eaf5469c11e886fe0885ad2bf1ec606434e79842a879277895a50942a"}, + {file = "coverage-7.2.7-cp311-cp311-win32.whl", hash = "sha256:33d6d3ea29d5b3a1a632b3c4e4f4ecae24ef170b0b9ee493883f2df10039959a"}, + {file = "coverage-7.2.7-cp311-cp311-win_amd64.whl", hash = "sha256:5b7540161790b2f28143191f5f8ec02fb132660ff175b7747b95dcb77ac26562"}, + {file = "coverage-7.2.7-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f2f67fe12b22cd130d34d0ef79206061bfb5eda52feb6ce0dba0644e20a03cf4"}, + {file = "coverage-7.2.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a342242fe22407f3c17f4b499276a02b01e80f861f1682ad1d95b04018e0c0d4"}, + {file = "coverage-7.2.7-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:171717c7cb6b453aebac9a2ef603699da237f341b38eebfee9be75d27dc38e01"}, + {file = "coverage-7.2.7-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49969a9f7ffa086d973d91cec8d2e31080436ef0fb4a359cae927e742abfaaa6"}, + {file = "coverage-7.2.7-cp312-cp312-musllinux_1_1_aarch64.whl", hash = 
"sha256:b46517c02ccd08092f4fa99f24c3b83d8f92f739b4657b0f146246a0ca6a831d"}, + {file = "coverage-7.2.7-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:a3d33a6b3eae87ceaefa91ffdc130b5e8536182cd6dfdbfc1aa56b46ff8c86de"}, + {file = "coverage-7.2.7-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:976b9c42fb2a43ebf304fa7d4a310e5f16cc99992f33eced91ef6f908bd8f33d"}, + {file = "coverage-7.2.7-cp312-cp312-win32.whl", hash = "sha256:8de8bb0e5ad103888d65abef8bca41ab93721647590a3f740100cd65c3b00511"}, + {file = "coverage-7.2.7-cp312-cp312-win_amd64.whl", hash = "sha256:9e31cb64d7de6b6f09702bb27c02d1904b3aebfca610c12772452c4e6c21a0d3"}, + {file = "coverage-7.2.7-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:58c2ccc2f00ecb51253cbe5d8d7122a34590fac9646a960d1430d5b15321d95f"}, + {file = "coverage-7.2.7-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d22656368f0e6189e24722214ed8d66b8022db19d182927b9a248a2a8a2f67eb"}, + {file = "coverage-7.2.7-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a895fcc7b15c3fc72beb43cdcbdf0ddb7d2ebc959edac9cef390b0d14f39f8a9"}, + {file = "coverage-7.2.7-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e84606b74eb7de6ff581a7915e2dab7a28a0517fbe1c9239eb227e1354064dcd"}, + {file = "coverage-7.2.7-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:0a5f9e1dbd7fbe30196578ca36f3fba75376fb99888c395c5880b355e2875f8a"}, + {file = "coverage-7.2.7-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:419bfd2caae268623dd469eff96d510a920c90928b60f2073d79f8fe2bbc5959"}, + {file = "coverage-7.2.7-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:2aee274c46590717f38ae5e4650988d1af340fe06167546cc32fe2f58ed05b02"}, + {file = "coverage-7.2.7-cp37-cp37m-win32.whl", hash = "sha256:61b9a528fb348373c433e8966535074b802c7a5d7f23c4f421e6c6e2f1697a6f"}, + {file = "coverage-7.2.7-cp37-cp37m-win_amd64.whl", hash = "sha256:b1c546aca0ca4d028901d825015dc8e4d56aac4b541877690eb76490f1dc8ed0"}, + {file = "coverage-7.2.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:54b896376ab563bd38453cecb813c295cf347cf5906e8b41d340b0321a5433e5"}, + {file = "coverage-7.2.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3d376df58cc111dc8e21e3b6e24606b5bb5dee6024f46a5abca99124b2229ef5"}, + {file = "coverage-7.2.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e330fc79bd7207e46c7d7fd2bb4af2963f5f635703925543a70b99574b0fea9"}, + {file = "coverage-7.2.7-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e9d683426464e4a252bf70c3498756055016f99ddaec3774bf368e76bbe02b6"}, + {file = "coverage-7.2.7-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d13c64ee2d33eccf7437961b6ea7ad8673e2be040b4f7fd4fd4d4d28d9ccb1e"}, + {file = "coverage-7.2.7-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b7aa5f8a41217360e600da646004f878250a0d6738bcdc11a0a39928d7dc2050"}, + {file = "coverage-7.2.7-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:8fa03bce9bfbeeef9f3b160a8bed39a221d82308b4152b27d82d8daa7041fee5"}, + {file = "coverage-7.2.7-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:245167dd26180ab4c91d5e1496a30be4cd721a5cf2abf52974f965f10f11419f"}, + {file = "coverage-7.2.7-cp38-cp38-win32.whl", hash = "sha256:d2c2db7fd82e9b72937969bceac4d6ca89660db0a0967614ce2481e81a0b771e"}, + {file = "coverage-7.2.7-cp38-cp38-win_amd64.whl", 
hash = "sha256:2e07b54284e381531c87f785f613b833569c14ecacdcb85d56b25c4622c16c3c"}, + {file = "coverage-7.2.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:537891ae8ce59ef63d0123f7ac9e2ae0fc8b72c7ccbe5296fec45fd68967b6c9"}, + {file = "coverage-7.2.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:06fb182e69f33f6cd1d39a6c597294cff3143554b64b9825d1dc69d18cc2fff2"}, + {file = "coverage-7.2.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:201e7389591af40950a6480bd9edfa8ed04346ff80002cec1a66cac4549c1ad7"}, + {file = "coverage-7.2.7-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f6951407391b639504e3b3be51b7ba5f3528adbf1a8ac3302b687ecababf929e"}, + {file = "coverage-7.2.7-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f48351d66575f535669306aa7d6d6f71bc43372473b54a832222803eb956fd1"}, + {file = "coverage-7.2.7-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b29019c76039dc3c0fd815c41392a044ce555d9bcdd38b0fb60fb4cd8e475ba9"}, + {file = "coverage-7.2.7-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:81c13a1fc7468c40f13420732805a4c38a105d89848b7c10af65a90beff25250"}, + {file = "coverage-7.2.7-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:975d70ab7e3c80a3fe86001d8751f6778905ec723f5b110aed1e450da9d4b7f2"}, + {file = "coverage-7.2.7-cp39-cp39-win32.whl", hash = "sha256:7ee7d9d4822c8acc74a5e26c50604dff824710bc8de424904c0982e25c39c6cb"}, + {file = "coverage-7.2.7-cp39-cp39-win_amd64.whl", hash = "sha256:eb393e5ebc85245347950143969b241d08b52b88a3dc39479822e073a1a8eb27"}, + {file = "coverage-7.2.7-pp37.pp38.pp39-none-any.whl", hash = "sha256:b7b4c971f05e6ae490fef852c218b0e79d4e52f79ef0c8475566584a8fb3e01d"}, + {file = "coverage-7.2.7.tar.gz", hash = "sha256:924d94291ca674905fe9481f12294eb11f2d3d3fd1adb20314ba89e94f44ed59"}, ] cycler = [ {file = "cycler-0.11.0-py3-none-any.whl", hash = "sha256:3a27e95f763a428a739d2add979fa7494c912a32c17c4c38c4d5f082cad165a3"}, {file = "cycler-0.11.0.tar.gz", hash = "sha256:9c87405839a19696e837b3b818fed3f5f69f16f1eec1a1ad77e043dcea9c772f"}, ] +cython = [ + {file = "Cython-3.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7c7d728e1a49ad01d41181e3a9ea80b8d14e825f4679e4dd837cbf7bca7998a5"}, + {file = "Cython-3.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:626a4a6ef4b7ced87c348ea805488e4bd39dad9d0b39659aa9e1040b62bbfedf"}, + {file = "Cython-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33c900d1ca9f622b969ac7d8fc44bdae140a4a6c7d8819413b51f3ccd0586a09"}, + {file = "Cython-3.0.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a65bc50dc1bc2faeafd9425defbdef6a468974f5c4192497ff7f14adccfdcd32"}, + {file = "Cython-3.0.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:3b71b399b10b038b056ad12dce1e317a8aa7a96e99de7e4fa2fa5d1c9415cfb9"}, + {file = "Cython-3.0.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f42f304c097cc53e9eb5f1a1d150380353d5018a3191f1b77f0de353c762181e"}, + {file = "Cython-3.0.0-cp310-cp310-win32.whl", hash = "sha256:3e234e2549e808d9259fdb23ebcfd145be30c638c65118326ec33a8d29248dc2"}, + {file = "Cython-3.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:829c8333195100448a23863cf64a07e1334fae6a275aefe871458937911531b6"}, + {file = "Cython-3.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:06db81b1a01858fcc406616f8528e686ffb6cf7c3d78fb83767832bfecea8ad8"}, + {file = "Cython-3.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c93634845238645ce7abf63a56b1c5b6248189005c7caff898fd4a0dac1c5e1e"}, + {file = "Cython-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa606675c6bd23478b1d174e2a84e3c5a2c660968f97dc455afe0fae198f9d3d"}, + {file = "Cython-3.0.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d3355e6f690184f984eeb108b0f5bbc4bcf8b9444f8168933acf79603abf7baf"}, + {file = "Cython-3.0.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:93a34e1ca8afa4b7075b02ed14a7e4969256297029fb1bfd4cbe48f7290dbcff"}, + {file = "Cython-3.0.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:bb1165ca9e78823f9ad1efa5b3d83156f868eabd679a615d140a3021bb92cd65"}, + {file = "Cython-3.0.0-cp311-cp311-win32.whl", hash = "sha256:2fadde1da055944f5e1e17625055f54ddd11f451889110278ef30e07bd5e1695"}, + {file = "Cython-3.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:254ed1f03a6c237fa64f0c6e44862058de65bfa2e6a3b48ca3c205492e0653aa"}, + {file = "Cython-3.0.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:4e212237b7531759befb92699c452cd65074a78051ae4ee36ff8b237395ecf3d"}, + {file = "Cython-3.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9f29307463eba53747b31f71394ed087e3e3e264dcc433e62de1d51f5c0c966c"}, + {file = "Cython-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:53328a8af0806bebbdb48a4191883b11ee9d9dfb084d84f58fa5a8ab58baefc9"}, + {file = "Cython-3.0.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5962e70b15e863e72bed6910e8c6ffef77d36cc98e2b31c474378f3b9e49b0e3"}, + {file = "Cython-3.0.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9e69139f4e60ab14c50767a568612ea64d6907e9c8e0289590a170eb495e005f"}, + {file = "Cython-3.0.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:c40bdbcb2286f0aeeb5df9ce53d45da2d2a9b36a16b331cd0809d212d22a8fc7"}, + {file = "Cython-3.0.0-cp312-cp312-win32.whl", hash = "sha256:8abb8915eb2e57fa53d918afe641c05d1bcc6ed1913682ec1f28de71f4e3f398"}, + {file = "Cython-3.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:30a4bd2481e59bd7ab2539f835b78edc19fc455811e476916f56026b93afd28b"}, + {file = "Cython-3.0.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:0e1e4b7e4bfbf22fecfa5b852f0e499c442d4853b7ebd33ae37cdec9826ed5d8"}, + {file = "Cython-3.0.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b00df42cdd1a285a64491ba23de08ab14169d3257c840428d40eb7e8e9979af"}, + {file = "Cython-3.0.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:650d03ddddc08b051b4659778733f0f173ca7d327415755c05d265a6c1ba02fb"}, + {file = "Cython-3.0.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4965f2ebade17166f21a508d66dd60d2a0b3a3b90abe3f72003baa17ae020dd6"}, + {file = "Cython-3.0.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:4123c8d03167803df31da6b39de167cb9c04ac0aa4e35d4e5aa9d08ad511b84d"}, + {file = "Cython-3.0.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:296c53b6c0030cf82987eef163444e8d7631cc139d995f9d58679d9fd1ddbf31"}, + {file = "Cython-3.0.0-cp36-cp36m-win32.whl", hash = "sha256:0d2c1e172f1c81bafcca703093608e10dc16e3e2d24c5644c17606c7fdb1792c"}, + {file = 
"Cython-3.0.0-cp36-cp36m-win_amd64.whl", hash = "sha256:bc816d8eb3686d6f8d165f4156bac18c1147e1035dc28a76742d0b7fb5b7c032"}, + {file = "Cython-3.0.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8d86651347bbdbac1aca1824696c5e4c0a3b162946c422edcca2be12a03744d1"}, + {file = "Cython-3.0.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84176bd04ce9f3cc8799b47ec6d1959fa1ea5e71424507df7bbf0b0915bbedef"}, + {file = "Cython-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:35abcf07b8277ec95bbe49a07b5c8760a2d941942ccfe759a94c8d2fe5602e9f"}, + {file = "Cython-3.0.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a44d6b9a29b2bff38bb648577b2fcf6a68cf8b1783eee89c2eb749f69494b98d"}, + {file = "Cython-3.0.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:4dc6bbe7cf079db37f1ebb9b0f10d0d7f29e293bb8688e92d50b5ea7a91d82f3"}, + {file = "Cython-3.0.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e28763e75e380b8be62b02266a7995a781997c97c119efbdccb8fb954bcd7574"}, + {file = "Cython-3.0.0-cp37-cp37m-win32.whl", hash = "sha256:edae615cb4af51d5173e76ba9aea212424d025c57012e9cdf2f131f774c5ba71"}, + {file = "Cython-3.0.0-cp37-cp37m-win_amd64.whl", hash = "sha256:20c604e974832aaf8b7a1f5455ee7274b34df62a35ee095cd7d2ed7e818e6c53"}, + {file = "Cython-3.0.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c85fd2b1cbd9400d60ebe074795bb9a9188752f1612be3b35b0831a24879b91f"}, + {file = "Cython-3.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:090256c687106932339f87f888b95f0d69c617bc9b18801555545b695d29d8ab"}, + {file = "Cython-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cec2a67a0a7d9d4399758c0657ca03e5912e37218859cfbf046242cc532bfb3b"}, + {file = "Cython-3.0.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a1cdd01ce45333bc264a218c6e183700d6b998f029233f586a53c9b13455c2d2"}, + {file = "Cython-3.0.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ecee663d2d50ca939fc5db81f2f8a219c2417b4651ad84254c50a03a9cb1aadd"}, + {file = "Cython-3.0.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:30f10e79393b411af7677c270ea69807acb9fc30205c8ff25561f4deef780ec1"}, + {file = "Cython-3.0.0-cp38-cp38-win32.whl", hash = "sha256:609777d3a7a0a23b225e84d967af4ad2485c8bdfcacef8037cf197e87d431ca0"}, + {file = "Cython-3.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:7f4a6dfd42ae0a45797f50fc4f6add702abf46ab3e7cd61811a6c6a97a40e1a2"}, + {file = "Cython-3.0.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2d8158277c8942c0b20ff4c074fe6a51c5b89e6ac60cef606818de8c92773596"}, + {file = "Cython-3.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54e34f99b2a8c1e11478541b2822e6408c132b98b6b8f5ed89411e5e906631ea"}, + {file = "Cython-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:877d1c8745df59dd2061a0636c602729e9533ba13f13aa73a498f68662e1cbde"}, + {file = "Cython-3.0.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:204690be60f0ff32eb70b04f28ef0d1e50ffd7b3f77ba06a7dc2389ee3b848e0"}, + {file = "Cython-3.0.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:06fcb4628ccce2ba5abc8630adbeaf4016f63a359b4c6c3827b2d80e0673981c"}, + {file = "Cython-3.0.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:090e24cfa31c926d0b13d8bb2ef48175acdd061ae1413343c94a2b12a4a4fa6f"}, + {file = 
"Cython-3.0.0-cp39-cp39-win32.whl", hash = "sha256:4cd00f2158dc00f7f93a92444d0f663eda124c9c29bbbd658964f4e89c357fe8"}, + {file = "Cython-3.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:5b4cc896d49ce2bae8d6a030f9a4c64965b59c38acfbf4617685e17f7fcf1731"}, + {file = "Cython-3.0.0-py2.py3-none-any.whl", hash = "sha256:ff1aef1a03cfe293237c7a86ae9625b0411b2df30c53d1a7f29a8d381f38a1df"}, + {file = "Cython-3.0.0.tar.gz", hash = "sha256:350b18f9673e63101dbbfcf774ee2f57c20ac4636d255741d76ca79016b1bd82"}, +] cyvcf2 = [ {file = "cyvcf2-0.30.18-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:405d9c3a2d1c5c211a34222ce4b100b5bcf7fe32b0d25db7020311ae4ef9323d"}, {file = "cyvcf2-0.30.18-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f61dd4503c4d8ffc49cc1ebe4b08a990430aaa7f3963330a5a3c6d0c0ed2783c"}, @@ -1417,13 +1497,24 @@ pexpect = [ {file = "pexpect-4.8.0.tar.gz", hash = "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c"}, ] pgenlib = [ - {file = "Pgenlib-0.81.3.tar.gz", hash = "sha256:d7ab19e6b5c8befdfdf417834986c625abe6fcb86eb6d88190706271b93c465f"}, + {file = "Pgenlib-0.90.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c5823a8f3bb558c2d60243eb19ca6220cc2d5c759fc50139e5c09b179353b68"}, + {file = "Pgenlib-0.90.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:758b3927228029bb10758a28a8ded86a71844da973d6196b44913c4765cdf23d"}, + {file = "Pgenlib-0.90.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72427c7e92993859be40e7ad326a5d86025d2218c0109ac3b3fa1620fd37bec2"}, + {file = "Pgenlib-0.90.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85fb2bd56d12a338ec3605149c3f0782b04c861588ad7e4afb41f3ff13f90c00"}, + {file = "Pgenlib-0.90.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8283d5b6cb1196dc859f7c489b36d3cd2673174a19e507fa52a869ef25333127"}, + {file = "Pgenlib-0.90.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89d5baf59cb19f1f53ae1ec48fb33775e3bd84c0c4ed3d8582a6b45051f26e1a"}, + {file = "Pgenlib-0.90.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f06160710bfbf69655bb30ba8231e1329d4903f0a71f5dcb52df9d04c2f94282"}, + {file = "Pgenlib-0.90.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af42bb7fc0812a562e3c17218019650174d4bd3598f0a031b2604ea534c2e361"}, + {file = "Pgenlib-0.90.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72072417b51b9dc0189594c0b705968898b22e43f8a81e9e2a2e4ad19df9ba50"}, + {file = "Pgenlib-0.90.1.tar.gz", hash = "sha256:069db673c1cc5baffcb9b20c2e6e26cc9fdaadd0095f55a188b41502d0511cb9"}, ] pickleshare = [ {file = "pickleshare-0.7.5-py2.py3-none-any.whl", hash = "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56"}, {file = "pickleshare-0.7.5.tar.gz", hash = "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca"}, ] pillow = [ + {file = "Pillow-9.3.0-1-cp37-cp37m-win32.whl", hash = "sha256:e6ea6b856a74d560d9326c0f5895ef8050126acfdc7ca08ad703eb0081e82b74"}, + {file = "Pillow-9.3.0-1-cp37-cp37m-win_amd64.whl", hash = "sha256:32a44128c4bdca7f31de5be641187367fe2a450ad83b833ef78910397db491aa"}, {file = "Pillow-9.3.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:0b7257127d646ff8676ec8a15520013a698d1fdc48bc2a79ba4e53df792526f2"}, {file = "Pillow-9.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:b90f7616ea170e92820775ed47e136208e04c967271c9ef615b6fbd08d9af0e3"}, {file = "Pillow-9.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:68943d632f1f9e3dce98908e873b3a090f6cba1cbb1b892a9e8d97c938871fbe"}, @@ -1522,27 +1613,27 @@ pyreadline3 = [ {file = "pyreadline3-3.4.1.tar.gz", hash = "sha256:6f3d1f7b8a31ba32b73917cefc1f28cc660562f39aea8646d30bd6eff21f7bae"}, ] pysam = [ - {file = "pysam-0.19.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:44de1a3af7c7eb5f404d6337f0c9c4ee88c34c2d2fee1a7896ccd8e7d2aa475a"}, - {file = "pysam-0.19.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0ceb07c6253598ec70fef6ac0c0f7ab0d299562c1a91e737adb07239afba22d6"}, - {file = "pysam-0.19.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2057f3b8cc20562fd010e7971e83ab78978f17975563a711c94bca583ce8a2d3"}, - {file = "pysam-0.19.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a1a9ee6cd6dfa50973dcb51cd2245ea7d4d64d4e962d60e5e1a088f7b790e3e"}, - {file = "pysam-0.19.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:b831170ff810bfd1242dbce4ddf8e693e95e39a4332d5903d416233d3d1be50a"}, - {file = "pysam-0.19.0-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:fa4a5d334986d0696522246820f295cbf6c18dc1b78798f800a2d57d56207789"}, - {file = "pysam-0.19.0-cp36-cp36m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ab58d7d9b140e2e8a4918fc00661aa901ba461d9bccd4b499102b0828f2d897e"}, - {file = "pysam-0.19.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1d964c29fedf55d2a5d095227d19d915c2e0e5e42b1e0bdf7fab30cd1d2a850"}, - {file = "pysam-0.19.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8c78f9b84f7fe69530eaccf5b7f08636b3419f0017b5050aa7013f1c59d3d654"}, - {file = "pysam-0.19.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:ada92b376717ac4c9c9924a096af9186035115d29a113eaa997d1c020ce421b9"}, - {file = "pysam-0.19.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:80b8f5b929c7f660b6e1497790a13c61f386b310a31ca54ad6ba110674d11c55"}, - {file = "pysam-0.19.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:30017bed8d002d41f83fa7e10569525c811a8e3860d73a880ee653ef29fd4fbc"}, - {file = "pysam-0.19.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5e2e465522ba2c34cb96c013a28f9c9db303f8ab1350b4c11cca73677f1e9594"}, - {file = "pysam-0.19.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:2103b4b6d9b0cc0b4aaccf64e87a92bfdabb7dc92810cf84be35ffe78fafa002"}, - {file = "pysam-0.19.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:eaf342b9c71ed83a63237df2000e3bc1f0236165d48fd7240c4c78b66f28d63b"}, - {file = "pysam-0.19.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc0021b154206edfbaa13515cb67523c76c576b7a670e72a793f2da4f03139f4"}, - {file = "pysam-0.19.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3101e0fcd2f6accbfa72a554a71baf83f1c096bb0f9045059b3ead35901ce128"}, - {file = "pysam-0.19.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:5118dcabac574638df43739941f0ee20cc4e9197aee4a8f10f195542a13f18e3"}, - {file = "pysam-0.19.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:4fe1f1fae0e559d3412625dc7d4d08b477e80211b3fe5b267ba341a84f78b8be"}, - {file = "pysam-0.19.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:2911e3760dd2b709c686f075146787b8bda35629093352c28a7e52ac00ddaffc"}, - {file = "pysam-0.19.0.tar.gz", hash = "sha256:dcc052566f9509fd93b2a2664f094a13f016fd60cdd189e05fb4eafa0c89505b"}, + {file = "pysam-0.21.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7012207f6afc181a7009c70f5ab6969ac46c06090df63e798ead9efe433098a5"}, + {file = "pysam-0.21.0-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:3501296b6e8af6092f8b3c01b271d7c3e03b37b81ed5462d3c8041b00f8fa73a"}, + {file = "pysam-0.21.0-cp310-cp310-manylinux_2_24_i686.whl", hash = "sha256:ed21d9138e927eaed24e05295a9ec4619f97fbfd4db2d488cc969a4bc15a3db5"}, + {file = "pysam-0.21.0-cp310-cp310-manylinux_2_24_x86_64.whl", hash = "sha256:37e0ed17c05f7cb3b141c2c78ad23b3592479e8adc47453ae5681c6ffd0d5e00"}, + {file = "pysam-0.21.0-cp311-cp311-manylinux_2_24_aarch64.whl", hash = "sha256:7eba09d7f711f2d3759924178950ec4c38585bce7f0c18e90cb3133bc411808e"}, + {file = "pysam-0.21.0-cp311-cp311-manylinux_2_24_i686.whl", hash = "sha256:7a4f7c34b66379b8565e4d4105a8f8b68f9b308c25adebfd357af8671d909ff5"}, + {file = "pysam-0.21.0-cp311-cp311-manylinux_2_24_x86_64.whl", hash = "sha256:6d9f453048aefb3908b9cd40edb15d3140f8dea15c24ed0df11796a4367e9718"}, + {file = "pysam-0.21.0-cp36-cp36m-manylinux_2_24_aarch64.whl", hash = "sha256:36d78fad182f3230ca8618ea1f4de4c4ed9b4437bc54aa9462d8f5e22f074c22"}, + {file = "pysam-0.21.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:246ce42bf6321516ee8e26763907eaa5aa36a959af6927efba0d0f97127618e9"}, + {file = "pysam-0.21.0-cp37-cp37m-manylinux_2_24_aarch64.whl", hash = "sha256:d7555782b2de082551e008fae1236dbe10365df0551776b4ec203953f6bda381"}, + {file = "pysam-0.21.0-cp37-cp37m-manylinux_2_24_i686.whl", hash = "sha256:962136c9b2d456a4c7e7bad1cd1b680ed27334dbd6ee28a6381dfc698d84951d"}, + {file = "pysam-0.21.0-cp37-cp37m-manylinux_2_24_x86_64.whl", hash = "sha256:f773e1b02ec47b665ad1e4813ea34a4eb1a17d137200f313ad00525aef08a94e"}, + {file = "pysam-0.21.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cff4afedb902def2fb464e7023c0a95fcecbc3f9a1437b31b29cd805735014e5"}, + {file = "pysam-0.21.0-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:6260ff6a0965d33141af753f0b81495d4579299ea906ce3a3f172294fc6454d7"}, + {file = "pysam-0.21.0-cp38-cp38-manylinux_2_24_i686.whl", hash = "sha256:feaa464620ae221fb4878ef1b4f5370c319291522cf52ef497e8beba64c269a0"}, + {file = "pysam-0.21.0-cp38-cp38-manylinux_2_24_x86_64.whl", hash = "sha256:eb283c0de1d197737b53f70d3ea04a2f42c66134803862c43669b9cbef6e45c0"}, + {file = "pysam-0.21.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2087be3b6e754aed3a9178b9f9dd8dc4688c16df8197cdca5a3145098732f786"}, + {file = "pysam-0.21.0-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:4daa4a9927c6c3bf6da6aa8ffce3242b989fe995733c790913dc2c586bc67d4a"}, + {file = "pysam-0.21.0-cp39-cp39-manylinux_2_24_i686.whl", hash = "sha256:f58c86d76cf4e87b879a9aa8f860426a074a2d9f55d3a0078f5cf00e2b32f34c"}, + {file = "pysam-0.21.0-cp39-cp39-manylinux_2_24_x86_64.whl", hash = "sha256:d6babe094fa79af483fe935fe1ea2ebc8e956bbe79dfb9b0a5ce4b2240d00389"}, + {file = "pysam-0.21.0.tar.gz", hash = "sha256:5c9645ddd87668e36ff0a1966391e26f9c403bf85b1bc06c53fe2fcd592da2ce"}, ] pytest = [ {file = "pytest-7.2.0-py3-none-any.whl", hash = "sha256:892f933d339f068883b6fd5a459f03d85bfcb355e4981e146d2c7616c21fef71"}, diff --git a/pyproject.toml b/pyproject.toml index 2791ff88..dc2b5ad0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ click = ">=8.0.3" pysam = ">=0.19.0" cyvcf2 = ">=0.30.14" matplotlib 
= ">=3.5.1" -Pgenlib = {version = ">=0.81.3", optional = true, extras = ["files"]} +Pgenlib = {version = ">=0.90.1", optional = true, extras = ["files"]} # docs # these belong in dev-dependencies, but RTD doesn't support that yet -- see From 5b24aa9236a42f24a17b277ac260ddf8b476fc38 Mon Sep 17 00:00:00 2001 From: Gonzalo <138239951+gonzalogc1@users.noreply.github.com> Date: Mon, 24 Jul 2023 17:09:52 -0600 Subject: [PATCH 02/40] Here it's the test_write_multiallelic() --- tests/test_data.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/tests/test_data.py b/tests/test_data.py index 2932d146..3c8c8608 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -5,7 +5,7 @@ import pytest import numpy as np import numpy.lib.recfunctions as rfn - +import pgenlib from haptools.sim_phenotype import Haplotype as HaptoolsHaplotype from haptools.sim_phenotype import Repeat as HaptoolsRepeat from haptools.data import ( @@ -344,7 +344,7 @@ def test_load_genotypes(self): for col in ("chrom", "pos", "id", "alleles"): assert gts.variants[col][i] == expected.variants[col][i] - @pytest.mark.xfail(reason="not implemented yet") + def test_load_genotypes_multiallelic(self): expected = self._get_fake_genotypes_multiallelic() @@ -531,6 +531,33 @@ def test_write_genotypes_unphased(self): fname.unlink() + def test_write_multiallelic(self): + # Create fake multi-allelic genotype data to write to the PLINK file + gts_multiallelic = self._get_fake_genotypes_multiallelic() + # Name of the PLINK file where the data will be written + fname = DATADIR / "test_write_multiallelic.pgen" + + # Write multiallelic genotype data to the PLINK file + gts_multiallelic.fname = fname + gts_multiallelic.write() + + #Read the data from the newly created PLINK file + new_gts = GenotypesPLINK(fname) + new_gts.read() + new_gts.check_phase() + + # Verify tqhat the uploaded data matches the original data + np.testing.assert_allclose(gts_multiallelic.data, new_gts.data) + assert gts_multiallelic.samples == new_gts.samples + for i in range(len(new_gts.variants)): + for col in ("chrom", "pos", "id", "alleles"): + assert gts_multiallelic.variants[col][i] == new_gts.variants[col][i] + + #clean the files created after the test + fname.with_suffix(".psam").unlink() + fname.with_suffix(".pvar").unlink() + fname.unlink() + class TestPhenotypes: def _get_expected_phenotypes(self): # create a phen matrix with shape: samples x phenotypes From d0fb1060be1b35b99fc7b8c690115540965b7841 Mon Sep 17 00:00:00 2001 From: Gonzalo <138239951+gonzalogc1@users.noreply.github.com> Date: Wed, 26 Jul 2023 10:50:04 -0600 Subject: [PATCH 03/40] I add multiallelic variants support for genotypePLINK --- haptools/data/genotypes.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index f8ea318d..97764e66 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1074,12 +1074,16 @@ def _variant_arr( npt.NDArray A row from the :py:attr:`~.GenotypesPLINK.variants` array """ + # Parse the REF and ALT alleles from the PVAR record + ref_allele = record[cid["REF"]] + alt_alleles = record[cid["ALT"]].split(",") + alleles = [ref_allele] + alt_alleles return np.array( ( record[cid["ID"]], record[cid["CHROM"]], record[cid["POS"]], - (record[cid["REF"]], record[cid["ALT"]]), + tuple(alleles), ), dtype=self.variants.dtype, ) @@ -1240,10 +1244,12 @@ def read( """ super(Genotypes, self).read() import pgenlib - + sample_idxs = 
+        pv = pgenlib.PvarReader(bytes(str(self.fname.with_suffix(".pvar")), "utf8"))
+
         with pgenlib.PgenReader(
-            bytes(str(self.fname), "utf8"), sample_subset=sample_idxs
+            bytes(str(self.fname), "utf8"), sample_subset=sample_idxs, pvar=pv
         ) as pgen:
             # how many variants to load?
             if variants is not None:
@@ -1465,10 +1471,13 @@ def write(self):
             chunks = len(self.variants)

         # write the pgen file
+        pv = pgenlib.PvarReader(bytes(str(self.fname.with_suffix(".pvar")), "utf8"))
         with pgenlib.PgenWriter(
             filename=bytes(str(self.fname), "utf8"),
             sample_ct=len(self.samples),
             variant_ct=len(self.variants),
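+            # pgenlib only reserves space for two alleles per variant by
+            # default, so raise the limit to the largest count in the .pvar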
+            allele_ct_limit=pv.get_max_allele_ct(),
             nonref_flags=False,
             hardcall_phase_present=True,
         ) as pgen:

From b8cbffca70fcb2738e78b96be756d208fd2676b9 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Wed, 26 Jul 2023 09:55:14 -0700
Subject: [PATCH 04/40] create skeleton for GenotypesPLINKTR class

---
 haptools/data/__init__.py  | 2 +-
 haptools/data/genotypes.py | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/haptools/data/__init__.py b/haptools/data/__init__.py
index a0561484..65a78c6a 100644
--- a/haptools/data/__init__.py
+++ b/haptools/data/__init__.py
@@ -3,4 +3,4 @@
 from .covariates import Covariates
 from .breakpoints import Breakpoints, HapBlock
 from .haplotypes import Extra, Repeat, Variant, Haplotype, Haplotypes
-from .genotypes import Genotypes, GenotypesVCF, GenotypesTR, GenotypesPLINK
+from .genotypes import Genotypes, GenotypesVCF, GenotypesTR, GenotypesPLINK, GenotypesPLINKTR
diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index 97764e66..2f937fe1 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -908,7 +908,6 @@ def _return_data(self, variant: trh.TRRecord):
             self.log.warning(
                 "The current variant in the VCF only has one allele per sample."
             )
-            data = []
             # Only one GT so phase will always be 0
             zeros = np.zeros((gts.shape[0], 1))
             missing = -1 * np.ones((gts.shape[0],))
@@ -1515,3 +1514,8 @@ def write(self):
         del subset_data
         del missing
         gc.collect()
+
+
+class GenotypesPLINKTR(GenotypesPLINK):
+    # TODO: implement this class
+    pass

From a4a9584d83e6c94e38a2eefe0a53ba38ea8438e6 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Wed, 26 Jul 2023 09:56:00 -0700
Subject: [PATCH 05/40] add plink2 TR test files

---
 tests/data/simple_tr.pgen | Bin 0 -> 53 bytes
 tests/data/simple_tr.psam |   6 ++++++
 tests/data/simple_tr.pvar |  16 ++++++++++++++++
 3 files changed, 22 insertions(+)
 create mode 100644 tests/data/simple_tr.pgen
 create mode 100644 tests/data/simple_tr.psam
 create mode 100644 tests/data/simple_tr.pvar

diff --git a/tests/data/simple_tr.pgen b/tests/data/simple_tr.pgen
new file mode 100644
index 0000000000000000000000000000000000000000..317aa19f140151892387be304e15acaef19a2965
GIT binary patch
literal 53
zcmd0i7GPyyU;tqkIUoT75&|p|44ll2984>j`1uu`Q_C5a0)^)>GcYnT`!k(m1^{(1
B1=Rom

literal 0
HcmV?d00001

diff --git a/tests/data/simple_tr.psam b/tests/data/simple_tr.psam
new file mode 100644
index 00000000..62f619e0
--- /dev/null
+++ b/tests/data/simple_tr.psam
@@ -0,0 +1,6 @@
+#IID	SEX
+HG00096	NA
+HG00097	NA
+HG00099	NA
+HG00100	NA
+HG00101	NA
diff --git a/tests/data/simple_tr.pvar b/tests/data/simple_tr.pvar
new file mode 100644
index 00000000..416b02e7
--- /dev/null
+++ b/tests/data/simple_tr.pvar
@@ -0,0 +1,16 @@
+##fileformat=VCFv4.3
+##fileDate=20230724
+##source=PLINKv2.00
+##command=HipSTR-v0.7 --test
+##filedate=20180225
+##INFO=<ID=START,Number=1,Type=Integer,Description="Start of the repeat region">
+##INFO=<ID=END,Number=1,Type=Integer,Description="End of the repeat region">
+##INFO=<ID=PERIOD,Number=1,Type=Integer,Description="Length of the repeat unit">
+##contig=<ID=1>
+##contig=<ID=X>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+1	10114	1:10114:GTT	GTT	GTTGTT,GTTGTTGTT,GTTGTTGTTGTT,GTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTTGTTGTT	.	.	START=10114;END=10999;PERIOD=3
+1	11000	1:10116:ACACAC	ACACAC	AC	.	.	START=11000;END=11016;PERIOD=2
+1	12117	1:10117:AAA	AAA	AAAA	.	.	START=12117;END=12143;PERIOD=1
+1	13122	1:10122:GTGT	GTGT	GTGTGT,GTGTGTGT,GTGTGTGTGTGTGT,GTGTGTGTGTGTGTGTGTGTGT	.	.	START=13122;END=13150;PERIOD=2
+X	20122	1:20122:GGG	GGG	GGGGG	.	.	
START=20122;END=20150;PERIOD=1 From 5f7ab286eb7c191daddb986e0bbd45a87454255d Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 26 Jul 2023 09:57:32 -0700 Subject: [PATCH 06/40] clean up tests in test_data.py --- tests/test_data.py | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/tests/test_data.py b/tests/test_data.py index 3c8c8608..352668c4 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -530,7 +530,6 @@ def test_write_genotypes_unphased(self): fname.with_suffix(".pvar").unlink() fname.unlink() - def test_write_multiallelic(self): # Create fake multi-allelic genotype data to write to the PLINK file gts_multiallelic = self._get_fake_genotypes_multiallelic() @@ -1655,14 +1654,13 @@ def test_read_tr(self): [[5, 255, 0], [255, 255, 0], [3, 255, 0], [255, 255, 0], [255, 255, 0]], ], dtype=np.uint8, - ) + ).transpose((1, 0, 2)) + gts = GenotypesTR(DATADIR / "simple_tr.vcf") gts.read() # check genotypes - for i, variants in enumerate(expected_alleles): - for j, sample_var in enumerate(variants): - assert tuple(sample_var) == tuple(gts.data[j, i]) + np.testing.assert_allclose(expected_alleles, gts.data) class TestBreakpoints: @@ -1822,25 +1820,6 @@ def test_breakpoints_to_pop_array_chrom_no_match(self): [("chr1", 59423086), ("1", 59423090), ("1", 239403770), ("2", 229668150)], dtype=[("chrom", "U10"), ("pos", np.uint32)], ) - expected_pop_arr = np.array( - [ - [ - [0, 0], - [1, 0], - [1, 0], - [0, 1], - ], - [ - [1, 1], - [0, 1], - [0, 1], - [1, 0], - ], - ], - dtype=np.uint8, - ) - labels = {"YRI": 0, "CEU": 1} - labels_map = np.vectorize({v: k for k, v in labels.items()}.get) expected = self._get_expected_breakpoints() From f07751ac70891c7aca4ebfbfb6098728d1358aec Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 26 Jul 2023 09:58:16 -0700 Subject: [PATCH 07/40] add TestGenotypesPLINKTR._fake_genotypes_multiallelic() dummy method --- tests/test_data.py | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/tests/test_data.py b/tests/test_data.py index 352668c4..9ece725a 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -21,8 +21,9 @@ Breakpoints, GenotypesTR, GenotypesVCF, - GenotypesPLINK, GenotypesTR, + GenotypesPLINK, + GenotypesPLINKTR, ) @@ -557,6 +558,43 @@ def test_write_multiallelic(self): fname.with_suffix(".pvar").unlink() fname.unlink() + +class TestGenotypesPLINKTR: + def _get_fake_genotypes_multiallelic(self): + # define the code for an empty allele + empty = np.iinfo(np.uint8).max + + # fill out the class with the same variants and samples as GenotypesPLINK + gts_plink = TestGenotypesPLINK()._get_fake_genotypes_multiallelic() + gts = GenotypesPLINKTR(gts_plink.fname) + gts.variants = gts_plink.variants + gts.samples = gts_plink.samples + + # fill out the data array with the repeat unit counts + data = np.array( + [ + [[1, 2], [3, 4], [5, 6], [7, 8], [9, 9]], + [[3, 1], [3, 1], [1, 1], [1, 1], [3, 3]], + [[3, 3], [3, 3], [3, 3], [3, 3], [3, 3]], + [[11, 11], [empty, empty], [7, 2], [3, 4], [11, empty]], + [[5, empty], [empty, empty], [3, empty], [empty, empty], [empty, empty]], + ], + dtype=np.uint8, + ) + + # append the phasing info, transpose, and clean up + phasing = np.ones(data.shape[:2] + (1,)).astype(np.uint8) + gts.data = np.concatenate((data, phasing), axis=-1).transpose((1, 0, 2)) + + # handle half-calls and chrX according to the flag: + # --vcf-half-call m 
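+        # (assumed plink2 behavior: with "--vcf-half-call m", a half-call like
+        # "4|." is stored as fully missing, and each haploid chrX genotype is
+        # stored as a homozygous diploid call)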
+        gts.data[4, 3, 0] = empty
+        gts.data[:, 4, 1] = gts.data[:, 4, 0]
+        # TODO: figure out whether this is the correct way to handle half-calls and try
+        # to see if we can change chrX representations to properly simulate from it
+
+        return gts
+
+
 class TestPhenotypes:
     def _get_expected_phenotypes(self):
         # create a phen matrix with shape: samples x phenotypes

From 449c5543435d1b413d9cdf58e1ca47456e29cf4f Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Wed, 26 Jul 2023 17:00:57 -0700
Subject: [PATCH 08/40] create new simple-tr-valid VCF and PGEN files

also, add test for reading TRs in TestGenotypesPLINK
---
 .../{simple_tr.pgen => simple-tr-valid.pgen}  | Bin
 .../{simple_tr.psam => simple-tr-valid.psam}  |   0
 .../{simple_tr.pvar => simple-tr-valid.pvar}  |   0
 tests/data/simple-tr-valid.vcf                |  15 +++
 tests/test_data.py                            |  61 +++++++++++++++++-
 5 files changed, 75 insertions(+), 1 deletion(-)
 rename tests/data/{simple_tr.pgen => simple-tr-valid.pgen} (100%)
 rename tests/data/{simple_tr.psam => simple-tr-valid.psam} (100%)
 rename tests/data/{simple_tr.pvar => simple-tr-valid.pvar} (100%)
 create mode 100644 tests/data/simple-tr-valid.vcf

diff --git a/tests/data/simple_tr.pgen b/tests/data/simple-tr-valid.pgen
similarity index 100%
rename from tests/data/simple_tr.pgen
rename to tests/data/simple-tr-valid.pgen
diff --git a/tests/data/simple_tr.psam b/tests/data/simple-tr-valid.psam
similarity index 100%
rename from tests/data/simple_tr.psam
rename to tests/data/simple-tr-valid.psam
diff --git a/tests/data/simple_tr.pvar b/tests/data/simple-tr-valid.pvar
similarity index 100%
rename from tests/data/simple_tr.pvar
rename to tests/data/simple-tr-valid.pvar
diff --git a/tests/data/simple-tr-valid.vcf b/tests/data/simple-tr-valid.vcf
new file mode 100644
index 00000000..396aaf08
--- /dev/null
+++ b/tests/data/simple-tr-valid.vcf
@@ -0,0 +1,15 @@
+##fileformat=VCFv4.2
+##command=HipSTR-v0.7 --test
+##filedate=20180225
+##INFO=<ID=START,Number=1,Type=Integer,Description="Start of the repeat region">
+##INFO=<ID=END,Number=1,Type=Integer,Description="End of the repeat region">
+##INFO=<ID=PERIOD,Number=1,Type=Integer,Description="Length of the repeat unit">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##contig=<ID=1>
+##contig=<ID=X>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	HG00096	HG00097	HG00099	HG00100	HG00101
+1	10114	1:10114:GTT	GTT	GTTGTT,GTTGTTGTT,GTTGTTGTTGTT,GTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTTGTTGTT	.	.	START=10114;END=10999;PERIOD=3	GT	0|1	2|3	4|5	6|7	8|8
+1	11000	1:10116:ACACAC	ACACAC	AC	.	.	START=11000;END=11016;PERIOD=2	GT	0|1	0|1	1|1	1|1	0|0
+1	12117	1:10117:AAA	AAA	AAAA	.	.	START=12117;END=12143;PERIOD=1	GT	0|0	0|0	0|0	0|0	0|0
+1	13122	1:10122:GTGT	GTGT	GTGTGT,GTGTGTGT,GTGTGTGTGTGTGT,GTGTGTGTGTGTGTGTGTGTGT	.	.	START=13122;END=13150;PERIOD=2	GT	4|4	.	3|0	1|2	4|.
+X	20122	1:20122:GGG	GGG	GGGGG	.	.	START=20122;END=20150;PERIOD=1	GT	1	.	0	.	.
diff --git a/tests/test_data.py b/tests/test_data.py index 9ece725a..191d1129 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -331,6 +331,53 @@ def _get_fake_genotypes_multiallelic(self): gts.variants = gts_ref_alt.variants return gts + def _get_fake_genotypes_multiallelic_tr(self): + pgenlib = pytest.importorskip("pgenlib") + + gts_tr = GenotypesTR(DATADIR / "simple_tr.vcf") + gts_tr.read() + + gts = GenotypesPLINK(fname="") + gts.samples = ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101") + + na = np.iinfo(np.uint8).max + data = np.array( + [ + [[0, 1], [2, 3], [4, 5], [6, 7], [8, 8]], + [[0, 1], [0, 1], [1, 1], [1, 1], [0, 0]], + [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]], + [[4, 4], [na, na], [3, 0], [1, 2], [4, na]], + [[1, na], [na, na], [0, na], [na, na], [na, na]], + ], + dtype=np.uint8, + ) + # append the phasing info, transpose, and clean up + phasing = np.ones(data.shape[:2] + (1,)).astype(np.uint8) + phasing[4, [1, 3, 4]] = 0 + phasing[3, [1, 4]] = 0 + gts.data = np.concatenate((data, phasing), axis=-1).transpose((1, 0, 2)) + # handle half-calls and chrX according to the flag: + # --vcf-half-call m + gts.data[4, 3, 0] = na + gts.data[:, 4, 1] = gts.data[:, 4, 0] + # TODO: figure out whether this is the correct way to handle half-calls and try + # to see if we can change chrX representations to properly simulate from it + + # add the appropriate alleles + ref_alt = [ + tuple("GTT" * i for i in range(1, 10)), + ("ACACAC", "AC"), + ("AAA", "AAAA"), + ("GTGT", "GTGTGT", "GTGTGTGT", "GTGTGTGTGTGTGT", "GTGTGTGTGTGTGTGTGTGTGT"), + ("GGG", "GGGGG"), + ] + gts.variants = np.array( + [tuple(rec) + (ref_alt[idx],) for idx, rec in enumerate(gts_tr.variants)], + dtype=gts.variants.dtype, + ) + + return gts + def test_load_genotypes(self): expected = self._get_fake_genotypes_plink() @@ -345,7 +392,6 @@ def test_load_genotypes(self): for col in ("chrom", "pos", "id", "alleles"): assert gts.variants[col][i] == expected.variants[col][i] - def test_load_genotypes_multiallelic(self): expected = self._get_fake_genotypes_multiallelic() @@ -360,6 +406,19 @@ def test_load_genotypes_multiallelic(self): for col in ("chrom", "pos", "id", "alleles"): assert gts.variants[col][i] == expected.variants[col][i] + def test_load_genotypes_multiallelic_tr(self): + expected = self._get_fake_genotypes_multiallelic_tr() + + gts = GenotypesPLINK(DATADIR / "simple-tr-valid.pgen") + gts.read() + + # check that everything matches what we expected + np.testing.assert_array_equal(gts.data, expected.data) + assert gts.samples == expected.samples + for i, x in enumerate(expected.variants): + for col in ("chrom", "pos", "id", "alleles"): + assert gts.variants[col][i] == expected.variants[col][i] + def test_load_genotypes_chunked(self): expected = self._get_fake_genotypes_plink() From fd6b31873be2b823e19bb745008b83161e32e995 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 26 Jul 2023 17:02:19 -0700 Subject: [PATCH 09/40] leverage new dummy method in TestGenotypesPLINK --- haptools/data/__init__.py | 8 +++++++- tests/test_data.py | 13 +++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/haptools/data/__init__.py b/haptools/data/__init__.py index 65a78c6a..a5d2c2f4 100644 --- a/haptools/data/__init__.py +++ b/haptools/data/__init__.py @@ -3,4 +3,10 @@ from .covariates import Covariates from .breakpoints import Breakpoints, HapBlock from .haplotypes import Extra, Repeat, Variant, Haplotype, Haplotypes -from .genotypes import Genotypes, 
GenotypesVCF, GenotypesTR, GenotypesPLINK, GenotypesPLINKTR
+from .genotypes import (
+    Genotypes,
+    GenotypesVCF,
+    GenotypesTR,
+    GenotypesPLINK,
+    GenotypesPLINKTR,
+)
diff --git a/tests/test_data.py b/tests/test_data.py
index 191d1129..959faaec 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -621,10 +621,10 @@ def test_write_multiallelic(self):
 class TestGenotypesPLINKTR:
     def _get_fake_genotypes_multiallelic(self):
         # define the code for an empty allele
-        empty = np.iinfo(np.uint8).max
+        na = np.iinfo(np.uint8).max

         # fill out the class with the same variants and samples as GenotypesPLINK
-        gts_plink = TestGenotypesPLINK()._get_fake_genotypes_multiallelic()
+        gts_plink = TestGenotypesPLINK()._get_fake_genotypes_multiallelic_tr()
         gts = GenotypesPLINKTR(gts_plink.fname)
         gts.variants = gts_plink.variants
         gts.samples = gts_plink.samples
@@ -635,25 +635,26 @@ def _get_fake_genotypes_multiallelic(self):
        # fill out the data array with the repeat unit counts
        data = np.array(
            [
                [[1, 2], [3, 4], [5, 6], [7, 8], [9, 9]],
                [[3, 1], [3, 1], [1, 1], [1, 1], [3, 3]],
                [[3, 3], [3, 3], [3, 3], [3, 3], [3, 3]],
-                [[11, 11], [empty, empty], [7, 2], [3, 4], [11, empty]],
-                [[5, empty], [empty, empty], [3, empty], [empty, empty], [empty, empty]],
+                [[11, 11], [na, na], [7, 2], [3, 4], [11, na]],
+                [[5, na], [na, na], [3, na], [na, na], [na, na]],
            ],
            dtype=np.uint8,
        )

        # append the phasing info, transpose, and clean up
-        phasing = np.ones(data.shape[:2] + (1,)).astype(np.uint8)
+        phasing = gts_plink.data[:, :, 2]
        gts.data = np.concatenate((data, phasing), axis=-1).transpose((1, 0, 2))

        # handle half-calls and chrX according to the flag:
        # --vcf-half-call m
-        gts.data[4, 3, 0] = empty
+        gts.data[4, 3, 0] = na
        gts.data[:, 4, 1] = gts.data[:, 4, 0]
        # TODO: figure out whether this is the correct way to handle half-calls and try
        # to see if we can change chrX representations to properly simulate from it

        return gts
+

From 7f1db2d95be71e0449794929da69ed2cd93723f0 Mon Sep 17 00:00:00 2001
From: Gonzalo <138239951+gonzalogc1@users.noreply.github.com>
Date: Fri, 28 Jul 2023 17:33:20 -0600
Subject: [PATCH 10/40] Add test_iter() and test_read_plinktr() to
 TestGenotypesPLINKTR in tests/test_data.py, and add __init__() to
 GenotypesPLINKTR in haptools/data/genotypes.py

---
 haptools/data/genotypes.py |  6 ++++++
 tests/test_data.py         | 21 +++++++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index 2f937fe1..79f3db79 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -1517,5 +1517,11 @@ def write(self):


 class GenotypesPLINKTR(GenotypesPLINK):
+    def __init__(self, fname: Path | str, log: Logger = None, chunk_size: int = None, vcftype: str = "auto"):
+        super().__init__(fname, log, chunk_size)
+        self.vcftype = vcftype
+
+
+
     # TODO: implement this class
     pass
diff --git a/tests/test_data.py b/tests/test_data.py
index 959faaec..d05d8ba8 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -653,6 +653,27 @@ def _get_fake_genotypes_multiallelic(self):
         # to see if we can change chrX representations to properly simulate from it

         return gts
+
+    def test_iter(self):
+        # Get the expected data
+        expected = self._get_fake_genotypes_multiallelic()
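+        # note: iterating over a Genotypes object streams variants lazily from
+        # the file, so there is no need to call read() before looping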
From 7f1db2d95be71e0449794929da69ed2cd93723f0 Mon Sep 17 00:00:00 2001
From: Gonzalo <138239951+gonzalogc1@users.noreply.github.com>
Date: Fri, 28 Jul 2023 17:33:20 -0600
Subject: [PATCH 10/40] add test_iter and test_read_plinktr to the
 TestGenotypesPLINKTR class in tests/test_data.py and add an __init__ method
 to the GenotypesPLINKTR class in haptools/data/genotypes.py

---
 haptools/data/genotypes.py |  6 ++++++
 tests/test_data.py         | 21 +++++++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index 2f937fe1..79f3db79 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -1517,5 +1517,11 @@

 class GenotypesPLINKTR(GenotypesPLINK):
+    def __init__(self, fname: Path | str, log: Logger = None, chunk_size: int = None , vcftype: str = "auto"):
+        super().__init__(fname, log, chunk_size)
+        self.vcftype = vcftype
+
+
+
     # TODO: implement this class
     pass
diff --git a/tests/test_data.py b/tests/test_data.py
index 959faaec..d05d8ba8 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -653,6 +653,27 @@ def _get_fake_genotypes_multiallelic(self):
         # to see if we can change chrX representations to properly simulate from it

         return gts
+
+    def test_iter(self):
+        # Get the expected data
+        expected = self. _get_fake_genotypes_multiallelic()
+        gts = GenotypesPLINKTR(DATADIR / "simple-tr-valid.pgen")
+        # Check that everything matches what we expected
+        for idx, line in enumerate(gts):
+            np.testing.assert_allclose(line.data[:, :3], expected.data[:, idx])
+            for col in ("chrom", "pos", "id"):
+                assert line.variants[col] == expected.variants[col][idx]
+            assert line.variants["alleles"].tolist() == expected.variants["alleles"][idx]
+
+        # Check samples
+        assert gts.samples == expected.samples
+
+    def test_read_plinktr(self):
+        expected_alleles =self._get_fake_genotypes_multiallelic().data
+        gts = GenotypesPLINKTR(DATADIR / "simple-tr-valid.vcf")
+        gts.read()
+        # check genotypes
+        np.testing.assert_allclose(expected_alleles, gts.data)

From 5b3edfbeef1b80c1555b78463ab4aea4f89887ba Mon Sep 17 00:00:00 2001
From: Gonzalo <138239951+gonzalogc1@users.noreply.github.com>
Date: Thu, 3 Aug 2023 11:09:11 -0600
Subject: [PATCH 11/40] Added the load() method in the GenotypesPLINKTR class

---
 haptools/data/genotypes.py | 40 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index 79f3db79..22dc5192 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -1520,7 +1520,45 @@ class GenotypesPLINKTR(GenotypesPLINK):
     def __init__(self, fname: Path | str, log: Logger = None, chunk_size: int = None , vcftype: str = "auto"):
         super().__init__(fname, log, chunk_size)
         self.vcftype = vcftype
-
+
+    @classmethod
+    def load(
+        cls: GenotypesPLINKTR,
+        fname: Path | str,
+        region: str = None,
+        samples: list[str] = None,
+        variants: set[str] = None,
+        vcftype: str = "auto",
+    ) -> Genotypes:
+        """
+        Load STR genotypes from a PGEN file
+
+        Read the file contents, check the genotype phase, and create the MAC matrix
+
+        Parameters
+        ----------
+        fname
+            See documentation for :py:attr:`~.Data.fname`
+        region : str, optional
+            See documentation for :py:meth:`~.Genotypes.read`
+        samples : list[str], optional
+            See documentation for :py:meth:`~.Genotypes.read`
+        variants : set[str], optional
+            See documentation for :py:meth:`~.Genotypes.read`
+        vcftype : str, optional
+            The type of TR VCF currently being read. Options are:
+            {'auto', 'gangstr', 'advntr', 'hipstr', 'eh', 'popstr'}
+
+
+        Returns
+        -------
+        Genotypes
+            A Genotypes object with the data loaded into its properties
+        """
+        genotypes = cls(fname, log=None, chunk_size=None, vcftype=vcftype)
+        genotypes.read(region, samples, variants)
+        genotypes.check_phase()
+        return genotypes

     # TODO: implement this class
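The load() classmethod above is a thin convenience wrapper; the sketch below shows the intended call pattern next to its manual equivalent. The path and vcftype value are illustrative, borrowed from the test data used elsewhere in this series.

```python
from haptools.data import GenotypesPLINKTR

# one-step convenience call, as defined in the patch above
gts = GenotypesPLINKTR.load("simple-tr-valid.pgen", vcftype="hipstr")

# ...which is equivalent to the manual three-step sequence
gts = GenotypesPLINKTR("simple-tr-valid.pgen", vcftype="hipstr")
gts.read()         # fills gts.data, gts.samples, and gts.variants
gts.check_phase()  # verifies phasing and drops the phase column
```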
From 65d8bcfd3a2b2ca9a26361fee522b56a5c6d663f Mon Sep 17 00:00:00 2001
From: Gonzalo <138239951+gonzalogc1@users.noreply.github.com>
Date: Mon, 7 Aug 2023 16:13:14 -0600
Subject: [PATCH 12/40] Add the following methods: read(), _iterate(), write(),
 write_variants(), check_biallelic(), and check_maf() for GenotypesPLINKTR

---
 haptools/data/genotypes.py | 103 ++++++++++++++++++++++++++++++++++++-
 tests/test_data.py         |   5 +-
 2 files changed, 104 insertions(+), 4 deletions(-)

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index 22dc5192..c69c9a79 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 import re
 import gc
+import importlib.util
 from csv import reader
 from pathlib import Path
 from typing import Iterator
@@ -1520,6 +1521,22 @@ class GenotypesPLINKTR(GenotypesPLINK):
     def __init__(self, fname: Path | str, log: Logger = None, chunk_size: int = None , vcftype: str = "auto"):
         super().__init__(fname, log, chunk_size)
         self.vcftype = vcftype
+        self._import_copy_trh_module()
+
+    def _import_copy_trh_module(self):
+        """
+        Import a copy of the tr_harmonizer module into the "trh" property of this class
+        Subsequent edits to the module will remain scoped to the class, only
+
+        This code is adapted from https://stackoverflow.com/a/11285504/16815703
+        """
+        try:
+            SPEC = importlib.util.find_spec("trtools.utils.tr_harmonizer")
+        except ModuleNotFoundError:
+            SPEC = trh.__spec__
+        self.trh = importlib.util.module_from_spec(SPEC)
+        SPEC.loader.exec_module(self.trh)
+
     @classmethod
     def load(
@@ -1559,7 +1576,89 @@ def load(
         genotypes = cls(fname, log=None, chunk_size=None, vcftype=vcftype)
         genotypes.read(region, samples, variants)
         genotypes.check_phase()
         return genotypes
+
+
+    def _iter_TRRecords(self,region: str = None):
+        gts_obj = self
+        def GetGenotypeIndicies(self):
+            n = np.argmax(self.vcfrecord.ID == gts_obj.variants["id"])
+            return gts_obj.data[:, n]
+
+        self.trh.TRRecord.GetGenotypeIndicies = GetGenotypeIndicies
+        vcf = VCF(self.fname.with_suffix(".pvar"))
+        yield from self.trh.TRRecordHarmonizer(
+            vcffile = vcf, vcfiter= vcf(region), region =region , vcftype =self.vcftype
+        )
+
+
+    def read(
+        self,
+        region: str = None,
+        samples: list[str] = None,
+        variants: set[str] = None,
+        max_variants: int = None,
+    ):
+        """
+        Read genotypes from a PGEN file into a numpy matrix stored in
+        :py:attr:`~.GenotypesPLINK.data`
+
+        Parameters
+        ----------
+        region : str, optional
+            See documentation for :py:attr:`~.GenotypesVCF.read`
+        samples : list[str], optional
+            See documentation for :py:attr:`~.GenotypesVCF.read`
+        variants : set[str], optional
+            See documentation for :py:attr:`~.GenotypesVCF.read`
+        max_variants : int, optional
+            See documentation for :py:attr:`~.GenotypesVCF.read`
+        """
+        super().read(region,samples,variants,max_variants)
+        for idx, record in enumerate(self._iter_TRRecords(region)):
+            self.data[:, idx] = record.GetLengthGenotypes()
+
+    def _iterate(
+        self,
+        pgen: pgenlib.PgenReader,
+        region: str = None,
+        variants: set[str] = None,
+    ):
+        """
+        A generator over the lines of a PGEN-PVAR file pair
+
+        This is a helper function for :py:meth:`~.GenotypesPLINK.__iter__`
+
+        Parameters
+        ----------
pgen: pgenlib.PgenReader + The pgenlib.PgenReader object from which to fetch variant records + region : str, optional + See documentation for :py:meth:`~.Genotypes.read` + variants : set[str], optional + See documentation for :py:meth:`~.Genotypes.read` + + Yields + ------ + Iterator[namedtuple] + An iterator over each line in the file, where each line is encoded as a + namedtuple containing each of the class properties + """ + variants = super()._iterate(pgen,region,variants) + for variant, record in zip(variants, self._iter_TRRecords(region)): + variant.data = record.GetLengthGenotypes() + yield variant + + def write(self): + raise NotImplementedError + + def write_variants(self): + raise NotImplementedError + + def check_biallelic(self): + raise NotImplementedError + + def check_maf(self): + raise NotImplementedError - # TODO: implement this class - pass diff --git a/tests/test_data.py b/tests/test_data.py index d05d8ba8..66c1c436 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -642,7 +642,7 @@ def _get_fake_genotypes_multiallelic(self): ) # append the phasing info, transpose, and clean up - phasing = gts_plink.data[:, :, 2] + phasing = gts_plink.data[:, :, 2][:, :, np.newaxis] gts.data = np.concatenate((data, phasing), axis=-1).transpose((1, 0, 2)) # handle half-calls and chrX according to the flag: @@ -670,8 +670,9 @@ def test_iter(self): def test_read_plinktr(self): expected_alleles =self._get_fake_genotypes_multiallelic().data - gts = GenotypesPLINKTR(DATADIR / "simple-tr-valid.vcf") + gts = GenotypesPLINKTR(DATADIR / "simple-tr-valid.pgen") gts.read() + breakpoint() # check genotypes np.testing.assert_allclose(expected_alleles, gts.data) From f444ab231bcbc3d34a59ce76c11432d7b84a750d Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 7 Aug 2023 17:28:48 -0700 Subject: [PATCH 13/40] handle missing and unphased genotypes in plinkgenotypestr --- haptools/data/genotypes.py | 5 +++-- tests/test_data.py | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index c69c9a79..fb863920 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1583,8 +1583,9 @@ def _iter_TRRecords(self,region: str = None): gts_obj = self def GetGenotypeIndicies(self): n = np.argmax(self.vcfrecord.ID == gts_obj.variants["id"]) - return gts_obj.data[:, n] - + gts_data = gts_obj.data[:, n].astype(np.int8) + gts_data[gts_data == np.iinfo(np.uint8).max] = -1 + return gts_data self.trh.TRRecord.GetGenotypeIndicies = GetGenotypeIndicies vcf = VCF(self.fname.with_suffix(".pvar")) yield from self.trh.TRRecordHarmonizer( diff --git a/tests/test_data.py b/tests/test_data.py index 66c1c436..ecf8e368 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -643,6 +643,8 @@ def _get_fake_genotypes_multiallelic(self): # append the phasing info, transpose, and clean up phasing = gts_plink.data[:, :, 2][:, :, np.newaxis] + phasing[[3,4], 1] = 0 + phasing[1, [3,4]] = 1 gts.data = np.concatenate((data, phasing), axis=-1).transpose((1, 0, 2)) # handle half-calls and chrX according to the flag: From 6b2c0c1f31750f12a939fe0be83b753c25b515b4 Mon Sep 17 00:00:00 2001 From: Gonzalo <138239951+gonzalogc1@users.noreply.github.com> Date: Wed, 9 Aug 2023 13:01:03 -0600 Subject: [PATCH 14/40] Start benchmarking of GenotypesPLINKTR --- tests/bench_genotypes_TR.py | 441 ++++++++++++++++++++++++++++++++++++ tests/test_data.py | 1 - 2 files changed, 441 insertions(+), 1 deletion(-) create mode 
100755 tests/bench_genotypes_TR.py diff --git a/tests/bench_genotypes_TR.py b/tests/bench_genotypes_TR.py new file mode 100755 index 00000000..fc943530 --- /dev/null +++ b/tests/bench_genotypes_TR.py @@ -0,0 +1,441 @@ +#!/usr/bin/env python + +import sys +import pickle +import shutil +from pathlib import Path +from time import process_time + +import click +import matplotlib +import numpy as np +import matplotlib.pyplot as plt + +# Force matplotlib to not use any Xwindows backend. +matplotlib.use("Agg") + +from haptools import logging +from haptools.data import GenotypesVCF, GenotypesPLINK, GenotypesTR, GenotypesPLINKTR +from pysam import VariantFile + +# COMMAND FOR GENERATING UKB PLOT: +# tests/bench_genotypes.py \ +# --default-variants 18472 --default-samples 487409 --intervals-variants 1 80 4 \ +# --intervals-samples 1 80 4 -o plot.png -a results.pickle + +DATADIR = Path(__file__).parent.joinpath("data") + + +def create_variant_files(gts, intervals, num_samps): + samples = gts.samples[:num_samps] + variant_dir = gts.fname / "variant" + variant_dir.mkdir(parents=True, exist_ok=True) + for val in intervals: + variants = tuple(gts.variants["id"][:val]) + sub = gts.subset(samples=samples, variants=variants) + # write PLINK2 files + sub.fname = variant_dir / f"{val}.pgen" + # write VCF files + vcf = GenotypesTR(sub.fname.with_suffix(".vcf")) + vcf.data = sub.data + vcf.samples = sub.samples + vcf.variants = sub.variants + write_TR_VCF(vcf) + return variant_dir + +def write_TR_VCF(gts): + """ + Write the variants in this class to a VCF at :py:attr:`~.GenotypesVCF.fname` + """ + vcf = VariantFile(str(gts.fname), mode="w") + # make sure the header is properly structured + for contig in set(gts.variants["chrom"]): + vcf.header.contigs.add(contig) + vcf.header.add_meta( + "INFO", + items=[ + ("ID", "START"), + ("Number", 1), + ("Type", "Integer"), + ("Description", "Inclusive start coodinate for the repetitive portion of the reference allele"), + ], + ) + vcf.header.add_meta( + "INFO", + items=[ + ("ID", "END"), + ("Number", 1), + ("Type", "Integer"), + ("Description", "Inclusive end coordinate for the repetitive portion of the reference allele"), + ], + ) + vcf.header.add_meta( + "INFO", + items=[ + ("ID", "PERIOD"), + ("Number", 1), + ("Type", "Integer"), + ("Description", "Length of STR motif"), + ], + ) + vcf.header.add_meta( + "FORMAT", + items=[ + ("ID", "GT"), + ("Number", 1), + ("Type", "String"), + ("Description", "Genotype"), + ], + ) + try: + vcf.header.add_samples(gts.samples) + except AttributeError: + gts.log.warning( + "Upgrade to pysam >=0.19.1 to reduce the time required to create " + "VCFs. 
See https://github.com/pysam-developers/pysam/issues/1104" + ) + for sample in gts.samples: + vcf.header.add_sample(sample) + gts.log.info("Writing VCF records") + phased = gts._prephased or (gts.data.shape[2] < 3) + missing_val = np.iinfo(np.uint8).max + for var_idx, var in enumerate(gts.variants): + rec = { + "contig": var["chrom"], + "start": var["pos"], + "stop": var["pos"] + len(var["alleles"][0]) - 1, + "qual": None, + "alleles": var["alleles"], + "id": var["id"], + "filter": None, + "info":{"START":var["pos"] - 1,"END":var["pos"] + len(var["alleles"][0]) - 1,"PERIOD":len(var["alleles"][0])}, + } + # handle pysam increasing the start site by 1 + rec["start"] -= 1 + # parse the record into a pysam.VariantRecord + record = vcf.new_record(**rec) + for samp_idx, sample in enumerate(gts.samples): + record.samples[sample]["GT"] = tuple( + None if val == missing_val else val + for val in gts.data[samp_idx, var_idx, :2] + ) + # add proper phasing info + if phased: + record.samples[sample].phased = True + else: + record.samples[sample].phased = gts.data[samp_idx, var_idx, 2] + # write the record to a file + vcf.write(record) + vcf.close() + + + +def create_sample_files(gts, intervals, num_vars): + variants = tuple(gts.variants["id"][:num_vars]) + sample_dir = gts.fname / "sample" + sample_dir.mkdir(parents=True, exist_ok=True) + for val in intervals: + samples = gts.samples[:val] + sub = gts.subset(samples=samples, variants=variants) + # write PLINK2 files + sub.fname = sample_dir / f"{val}.pgen" + # write VCF files + vcf = GenotypesTR(sub.fname.with_suffix(".vcf")) + vcf.data = sub.data + vcf.samples = sub.samples + vcf.variants = sub.variants + write_TR_VCF(vcf) + return sample_dir + + +def time_vcf(vcf, max_variants, chunk_size=500): + GenotypesVCF(vcf).read(max_variants=max_variants) + + +def time_plink(pgen, max_variants, chunk_size=500): + GenotypesPLINK(pgen).read(max_variants=max_variants) + + +def time_plink_chunk(pgen, max_variants, chunk_size=500): + GenotypesPLINK(pgen, chunk_size=chunk_size).read(max_variants=max_variants) + + +def progressbar(it, prefix="", size=60, out=sys.stdout): # Python3.6+ + count = len(it) + + def show(j): + x = int(size * j / count) + print( + f"{prefix}[{u'â–ˆ'*x}{('.'*(size-x))}] {j}/{count}", + end="\r", + file=out, + flush=True, + ) + + show(0) + for i, item in enumerate(it): + yield item + show(i + 1) + print("\n", flush=True, file=out) + + +@click.command() +@click.option( + "-t", + "--temp", + type=click.Path(path_type=Path, file_okay=False, dir_okay=True), + default=Path("bench_temp_dir"), + show_default=True, + help="A temp directory into which to place generated temporary files", +) +@click.option( + "--default-variants", + type=int, + default=4, + show_default=True, + help="The number of variants to use when we vary the number of samples", +) +@click.option( + "--default-samples", + type=int, + default=5, + show_default=True, + help="The number of samples to use when we vary the number of variants", +) +@click.option( + "--intervals-variants", + type=int, + nargs=3, + default=(1, 4, 1), + show_default=True, + help="The start, end, and step values for the x-axis of the variants plot", +) +@click.option( + "--intervals-samples", + type=int, + nargs=3, + default=(1, 5, 1), + show_default=True, + help="The start, end, and step values for the x-axis of the samples plot", +) +@click.option( + "--reps", + type=int, + default=3, + show_default=True, + help="For each benchmark value, we take the mean of --reps X replicates", +) +@click.option( + "-o", 
+ "--output", + type=click.Path(path_type=Path), + default=Path("bench-plink.png"), + show_default=True, + help="A PNG file containing the results of the benchmark", +) +@click.option( + "-p", + "--progress", + is_flag=True, + default=False, + show_default=True, + help="Whether to display a progress bar. Useful for large benchmarks", +) +@click.option( + "-s", + "--silent", + is_flag=True, + default=False, + show_default=True, + help="Whether to be entirely silent", +) +@click.option( + "-a", + "--archive", + type=click.Path(path_type=Path), + default=None, + show_default="do not save generated results", + help="A python pickle file into which to store results", +) +def main( + temp, + default_variants, + default_samples, + intervals_variants, + intervals_samples, + reps, + output, + progress=False, + silent=False, + archive=None, +): + """ + Benchmarks classes in the data.genotypes module + """ + DEFAULT_VARIANTS, DEFAULT_SAMPLES, INTERVALS_VARIANTS, INTERVALS_SAMPLES, REPS = ( + default_variants, + default_samples, + range(*intervals_variants), + range(*intervals_samples), + reps, + ) + LOG = logging.getLogger("run", 0 if silent else "DEBUG") + gts = GenotypesPLINK(None, log=LOG) + num_variants = max(DEFAULT_VARIANTS, INTERVALS_VARIANTS.stop) + sample_size = max(DEFAULT_SAMPLES, INTERVALS_SAMPLES.stop) + gts.samples = tuple(f"sample{i}" for i in range(sample_size)) + gts.variants = np.array( + [(f"id{i}", "chr0", i, ("A", "T")) for i in range(1, num_variants + 1)], + dtype=gts.variants.dtype, + ) + np.random.seed(12345) + if not temp.exists(): + LOG.info("Generating fake genotypes") + gts.data = np.random.choice( + [True, False], size=sample_size * num_variants * 2 + ).reshape((sample_size, num_variants, 2)) + gts.fname = temp + # set initial variables + SAMPLES = gts.samples + VARIANTS = gts.variants["id"] + DEFAULT_SAMPLES = min(DEFAULT_SAMPLES, len(SAMPLES)) + DEFAULT_VARIANTS = min(DEFAULT_VARIANTS, len(VARIANTS)) + if INTERVALS_VARIANTS.stop > len(VARIANTS): + INTERVALS_VARIANTS = range( + INTERVALS_VARIANTS.start, len(VARIANTS), INTERVALS_VARIANTS.step + ) + INTERVALS_VARIANTS = list(INTERVALS_VARIANTS) + if INTERVALS_SAMPLES.stop > len(SAMPLES): + INTERVALS_SAMPLES = range( + INTERVALS_SAMPLES.start, len(SAMPLES), INTERVALS_SAMPLES.step + ) + INTERVALS_SAMPLES = list(INTERVALS_SAMPLES) + FILE_TYPES = { + "vcf": "VCF", + "pgen": "PLINK2", + "chunked200": "PLINK2 chunk_size: 200", + "chunked400": "PLINK2 chunk_size: 400", + "chunked600": "PLINK2 chunk_size: 600", + "chunked800": "PLINK2 chunk_size: 800", + } + + # create the files we will try to load if they haven't been created already + if not gts.fname.exists(): + LOG.info("Creating VCF and PGEN files that we can load") + variant_dir = create_variant_files(gts, INTERVALS_VARIANTS, DEFAULT_SAMPLES) + sample_dir = create_sample_files(gts, INTERVALS_SAMPLES, DEFAULT_VARIANTS) + else: + variant_dir = gts.fname / "variant" + sample_dir = gts.fname / "sample" + LOG.info( + "Temp directory already exists. 
Assuming files have already been created", + ) + + LOG.setLevel(level="ERROR") + + # run each test + LOG.info("Benchmarking the loading of each file") + results = {} + len_variants = len(VARIANTS) + for arg in ("samples", "variants"): + genotype_dir = variant_dir + if arg == "samples": + genotype_dir = sample_dir + results[arg] = {} + intervals = INTERVALS_SAMPLES if arg == "samples" else INTERVALS_VARIANTS + item_iter = ( + progressbar(intervals, prefix=f"{arg}, {file_type}: ", out=sys.stderr) + if progress + else intervals + ) + for file_type in FILE_TYPES.keys(): + results[arg][file_type] = [] + for val in item_iter: + chunk_size = 500 + if file_type == "vcf": + func = time_vcf + file = genotype_dir / f"{val}.vcf" + elif file_type == "pgen" or file_type.startswith("chunked"): + func = time_plink if file_type == "pgen" else time_plink_chunk + file = genotype_dir / f"{val}.pgen" + if file_type.startswith("chunked"): + chunk_size = int(file_type[len("chunked") :]) + else: + continue + times = np.empty(REPS, dtype=np.float64) + for rep in range(REPS): + start = process_time() + func(file, max_variants=len_variants, chunk_size=chunk_size) + end = process_time() + times[rep] = end - start + results[arg][file_type].append(times.mean()) + + # plot the results + LOG.info("Generating plot of results") + fig, (ax_samples, ax_variants) = plt.subplots(1, 2, figsize=(10, 5)) + lab_font = {"fontsize": "xx-small"} + for file_type in FILE_TYPES.keys(): + x_vals = INTERVALS_SAMPLES + y_vals = results["samples"][file_type] + # fit a line to each so that we can report the slope + slope = np.polyfit(x_vals, y_vals, 1)[0] + ax_samples.plot( + x_vals, + y_vals, + marker="o", + label=FILE_TYPES[file_type], + ) + ax_samples.text( + x_vals[-1], + y_vals[-1] + (y_vals[-1] / 16), + f"m = {slope:.3E}", + fontdict=lab_font, + ) + ax_samples.set_xlabel(f"Number of samples\nnum_variants = {DEFAULT_VARIANTS}") + ax_samples.set_ylim(ymin=0) + for file_type in FILE_TYPES.keys(): + x_vals = INTERVALS_VARIANTS + y_vals = results["variants"][file_type] + # fit a line to each so that we can report the slope + slope = np.polyfit(x_vals, y_vals, 1)[0] + ax_variants.plot( + x_vals, + y_vals, + marker="o", + ) + ax_variants.text( + x_vals[-1], + y_vals[-1] + (y_vals[-1] / 16), + f"m = {slope:.3E}", + fontdict=lab_font, + ) + ax_variants.set_xlabel(f"Number of variants\nnum_samples = {DEFAULT_SAMPLES}") + ax_variants.set_ylim(ymin=0) + fig.supylabel("CPU Time (s)") + fig.legend(loc="lower left", fontsize="xx-small") + fig.tight_layout() + fig.savefig(str(output), bbox_inches="tight", dpi=400) + + # saving results + if archive is not None: + with open(archive, "wb") as fh: + pickle.dump(results, fh) + + +def test_bench_genotypes(): + tmp_dir = Path("bench_temp_dir") + tmp_plot = Path("bench-plink.png") + + try: + main(["-t", tmp_dir, "-o", tmp_plot, "--reps", 1, "-s"]) + except SystemExit as err: + # re-raise unless main() finished without an error + if err.code: + raise + + shutil.rmtree(tmp_dir) + tmp_plot.unlink() + + +if __name__ == "__main__": + main() diff --git a/tests/test_data.py b/tests/test_data.py index ecf8e368..326e5791 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -674,7 +674,6 @@ def test_read_plinktr(self): expected_alleles =self._get_fake_genotypes_multiallelic().data gts = GenotypesPLINKTR(DATADIR / "simple-tr-valid.pgen") gts.read() - breakpoint() # check genotypes np.testing.assert_allclose(expected_alleles, gts.data) From 3d98d089ddc7396f31e9433be381fb3271942a29 Mon Sep 17 00:00:00 2001 From: 
Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 10 Aug 2023 14:10:00 -0700 Subject: [PATCH 15/40] add benchmarking of TRs to bench_genotypes.py --- tests/bench_genotypes.py | 220 ++++++++++++++++-- tests/bench_genotypes_TR.py | 441 ------------------------------------ 2 files changed, 200 insertions(+), 461 deletions(-) delete mode 100755 tests/bench_genotypes_TR.py diff --git a/tests/bench_genotypes.py b/tests/bench_genotypes.py index 58dd39b7..1a7e7c07 100755 --- a/tests/bench_genotypes.py +++ b/tests/bench_genotypes.py @@ -3,19 +3,26 @@ import sys import pickle import shutil +import subprocess from pathlib import Path from time import process_time import click import matplotlib import numpy as np +from pysam import VariantFile import matplotlib.pyplot as plt # Force matplotlib to not use any Xwindows backend. matplotlib.use("Agg") from haptools import logging -from haptools.data import GenotypesVCF, GenotypesPLINK +from haptools.data import ( + GenotypesTR, + GenotypesVCF, + GenotypesPLINK, + GenotypesPLINKTR, +) # COMMAND FOR GENERATING UKB PLOT: @@ -26,41 +33,168 @@ DATADIR = Path(__file__).parent.joinpath("data") -def create_variant_files(gts, intervals, num_samps): +class GenotypesVCFTR(GenotypesVCF): + def write(self): + """ + Write the variants in this class to a VCF at :py:attr:`~.GenotypesTR.fname` + + Output the VCF in HipSTR format + """ + vcf = VariantFile(str(self.fname), mode="w") + # make sure the header is properly structured + for contig in set(self.variants["chrom"]): + vcf.header.contigs.add(contig) + d = "Inclusive {} coodinate for the repetitive portion of the reference allele" + vcf.header.add_meta( + "INFO", + items=[ + ("ID", "START"), + ("Number", 1), + ("Type", "Integer"), + ("Description", d.format("start")), + ], + ) + vcf.header.add_meta( + "INFO", + items=[ + ("ID", "END"), + ("Number", 1), + ("Type", "Integer"), + ("Description", d.format("end")), + ], + ) + vcf.header.add_meta( + "INFO", + items=[ + ("ID", "PERIOD"), + ("Number", 1), + ("Type", "Integer"), + ("Description", "Length of STR motif"), + ], + ) + vcf.header.add_meta( + "FORMAT", + items=[ + ("ID", "GT"), + ("Number", 1), + ("Type", "String"), + ("Description", "Genotype"), + ], + ) + try: + vcf.header.add_samples(self.samples) + except AttributeError: + self.log.warning( + "Upgrade to pysam >=0.19.1 to reduce the time required to create " + "VCFs. 
See https://github.com/pysam-developers/pysam/issues/1104" + ) + for sample in self.samples: + vcf.header.add_sample(sample) + self.log.info("Writing VCF records") + phased = self._prephased or (self.data.shape[2] < 3) + missing_val = np.iinfo(np.uint8).max + for var_idx, var in enumerate(self.variants): + rec = { + "contig": var["chrom"], + "start": var["pos"], + "stop": var["pos"] + len(var["alleles"][0]), + "qual": None, + "alleles": var["alleles"], + "id": var["id"], + "filter": None, + } + # handle pysam increasing the start site by 1 + rec["start"] -= 1 + # parse the record into a pysam.VariantRecord + record = vcf.new_record(**rec) + # add INFO flags expected of HipSTR + # Note: this is only possible because we guarantee that the REF allele + # is a single copy of a dinucleotide repeat + record.info["START"] = int(rec["start"]) + record.stop = int(rec["stop"]) + record.info["PERIOD"] = len(var["alleles"][0]) + for samp_idx, sample in enumerate(self.samples): + record.samples[sample]["GT"] = tuple( + None if val == missing_val else val + for val in self.data[samp_idx, var_idx, :2] + ) + # add proper phasing info + if phased: + record.samples[sample].phased = True + else: + record.samples[sample].phased = self.data[samp_idx, var_idx, 2] + # write the record to a file + vcf.write(record) + vcf.close() + + +def write_tr_files(plink2: Path, vcf: Path, pgen: Path): + # first, add "HipSTR" to the header so that we can trick TRTools + subprocess.call(["sed", "-i", "2i##command=HipSTR-v0.7 --test/", vcf]) + subprocess.call( + [ + plink2, + "--vcf-half-call", + "m", + "--make-pgen", + "pvar-cols=vcfheader,qual,filter,info", + "--vcf", + vcf, + "--out", + pgen.with_suffix(""), + ], + stderr=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + ) + + +def create_variant_files(gts, intervals, num_samps, plink2: Path = None): samples = gts.samples[:num_samps] variant_dir = gts.fname / "variant" variant_dir.mkdir(parents=True, exist_ok=True) for val in intervals: variants = tuple(gts.variants["id"][:val]) sub = gts.subset(samples=samples, variants=variants) - # write PLINK2 files sub.fname = variant_dir / f"{val}.pgen" - sub.write() # write VCF files - vcf = GenotypesVCF(sub.fname.with_suffix(".vcf")) + if plink2: + vcf = GenotypesVCFTR(sub.fname.with_suffix(".vcf")) + else: + vcf = GenotypesVCF(sub.fname.with_suffix(".vcf")) vcf.data = sub.data vcf.samples = sub.samples vcf.variants = sub.variants vcf.write() + # write PLINK2 files + if plink2: + write_tr_files(plink2, vcf.fname, sub.fname) + else: + sub.write() return variant_dir -def create_sample_files(gts, intervals, num_vars): +def create_sample_files(gts, intervals, num_vars, plink2: Path = None): variants = tuple(gts.variants["id"][:num_vars]) sample_dir = gts.fname / "sample" sample_dir.mkdir(parents=True, exist_ok=True) for val in intervals: samples = gts.samples[:val] sub = gts.subset(samples=samples, variants=variants) - # write PLINK2 files sub.fname = sample_dir / f"{val}.pgen" - sub.write() # write VCF files - vcf = GenotypesVCF(sub.fname.with_suffix(".vcf")) + if plink2: + vcf = GenotypesVCFTR(sub.fname.with_suffix(".vcf")) + else: + vcf = GenotypesVCF(sub.fname.with_suffix(".vcf")) vcf.data = sub.data vcf.samples = sub.samples vcf.variants = sub.variants vcf.write() + # write PLINK2 files + if plink2: + write_tr_files(plink2, vcf.fname, sub.fname) + else: + sub.write() return sample_dir @@ -68,14 +202,28 @@ def time_vcf(vcf, max_variants, chunk_size=500): GenotypesVCF(vcf).read(max_variants=max_variants) +def time_vcf_tr(vcf, 
max_variants, chunk_size=500): + GenotypesTR(vcf, vcftype="hipstr").read(max_variants=max_variants) + + def time_plink(pgen, max_variants, chunk_size=500): GenotypesPLINK(pgen).read(max_variants=max_variants) +def time_plink_tr(pgen, max_variants, chunk_size=500): + GenotypesPLINKTR(pgen, vcftype="hipstr").read(max_variants=max_variants) + + def time_plink_chunk(pgen, max_variants, chunk_size=500): GenotypesPLINK(pgen, chunk_size=chunk_size).read(max_variants=max_variants) +def time_plink_chunk_tr(pgen, max_variants, chunk_size=500): + GenotypesPLINKTR(pgen, chunk_size=chunk_size, vcftype="hipstr").read( + max_variants=max_variants + ) + + def progressbar(it, prefix="", size=60, out=sys.stdout): # Python3.6+ count = len(it) @@ -95,6 +243,10 @@ def show(j): print("\n", flush=True, file=out) +def create_tr_alleles(): + return tuple("AT" * j for j in range(1, np.random.randint(2, 11) + 1)) + + @click.command() @click.option( "-t", @@ -173,6 +325,13 @@ def show(j): show_default="do not save generated results", help="A python pickle file into which to store results", ) +@click.option( + "--plink2", + type=click.Path(path_type=Path, exists=True), + default=None, + show_default="do not benchmark repeats", + help="Benchmark repeats in addition to SNPs. Create PGENs with this plink2 binary", +) def main( temp, default_variants, @@ -184,6 +343,7 @@ def main( progress=False, silent=False, archive=None, + plink2=None, ): """ Benchmarks classes in the data.genotypes module @@ -200,16 +360,29 @@ def main( num_variants = max(DEFAULT_VARIANTS, INTERVALS_VARIANTS.stop) sample_size = max(DEFAULT_SAMPLES, INTERVALS_SAMPLES.stop) gts.samples = tuple(f"sample{i}" for i in range(sample_size)) - gts.variants = np.array( - [(f"id{i}", "chr0", i, ("A", "T")) for i in range(1, num_variants + 1)], - dtype=gts.variants.dtype, - ) + if plink2: + gts.variants = np.array( + [ + (f"id{i}", "chr0", i, create_tr_alleles()) + for i in range(1, num_variants + 1) + ], + dtype=gts.variants.dtype, + ) + else: + gts.variants = np.array( + [(f"id{i}", "chr0", i, ("A", "T")) for i in range(1, num_variants + 1)], + dtype=gts.variants.dtype, + ) np.random.seed(12345) if not temp.exists(): LOG.info("Generating fake genotypes") - gts.data = np.random.choice( - [True, False], size=sample_size * num_variants * 2 - ).reshape((sample_size, num_variants, 2)) + gts.data = np.empty( + (sample_size, num_variants, 2), dtype=(np.uint8 if plink2 else np.bool_) + ) + for i in range(num_variants): + gts.data[:, i] = np.random.choice( + range(len(gts.variants[i]["alleles"])), size=sample_size * 2 + ).reshape((sample_size, 2)) gts.fname = temp # set initial variables SAMPLES = gts.samples @@ -238,8 +411,12 @@ def main( # create the files we will try to load if they haven't been created already if not gts.fname.exists(): LOG.info("Creating VCF and PGEN files that we can load") - variant_dir = create_variant_files(gts, INTERVALS_VARIANTS, DEFAULT_SAMPLES) - sample_dir = create_sample_files(gts, INTERVALS_SAMPLES, DEFAULT_VARIANTS) + variant_dir = create_variant_files( + gts, INTERVALS_VARIANTS, DEFAULT_SAMPLES, plink2 + ) + sample_dir = create_sample_files( + gts, INTERVALS_SAMPLES, DEFAULT_VARIANTS, plink2 + ) else: variant_dir = gts.fname / "variant" sample_dir = gts.fname / "sample" @@ -269,10 +446,13 @@ def main( for val in item_iter: chunk_size = 500 if file_type == "vcf": - func = time_vcf + func = time_vcf_tr if plink2 else time_vcf file = genotype_dir / f"{val}.vcf" elif file_type == "pgen" or file_type.startswith("chunked"): - func = 
time_plink if file_type == "pgen" else time_plink_chunk + if file_type == "pgen": + func = time_plink_tr if plink2 else time_plink + else: + funct = time_plink_chunk_tr if plink2 else time_plink_chunk file = genotype_dir / f"{val}.pgen" if file_type.startswith("chunked"): chunk_size = int(file_type[len("chunked") :]) diff --git a/tests/bench_genotypes_TR.py b/tests/bench_genotypes_TR.py deleted file mode 100755 index fc943530..00000000 --- a/tests/bench_genotypes_TR.py +++ /dev/null @@ -1,441 +0,0 @@ -#!/usr/bin/env python - -import sys -import pickle -import shutil -from pathlib import Path -from time import process_time - -import click -import matplotlib -import numpy as np -import matplotlib.pyplot as plt - -# Force matplotlib to not use any Xwindows backend. -matplotlib.use("Agg") - -from haptools import logging -from haptools.data import GenotypesVCF, GenotypesPLINK, GenotypesTR, GenotypesPLINKTR -from pysam import VariantFile - -# COMMAND FOR GENERATING UKB PLOT: -# tests/bench_genotypes.py \ -# --default-variants 18472 --default-samples 487409 --intervals-variants 1 80 4 \ -# --intervals-samples 1 80 4 -o plot.png -a results.pickle - -DATADIR = Path(__file__).parent.joinpath("data") - - -def create_variant_files(gts, intervals, num_samps): - samples = gts.samples[:num_samps] - variant_dir = gts.fname / "variant" - variant_dir.mkdir(parents=True, exist_ok=True) - for val in intervals: - variants = tuple(gts.variants["id"][:val]) - sub = gts.subset(samples=samples, variants=variants) - # write PLINK2 files - sub.fname = variant_dir / f"{val}.pgen" - # write VCF files - vcf = GenotypesTR(sub.fname.with_suffix(".vcf")) - vcf.data = sub.data - vcf.samples = sub.samples - vcf.variants = sub.variants - write_TR_VCF(vcf) - return variant_dir - -def write_TR_VCF(gts): - """ - Write the variants in this class to a VCF at :py:attr:`~.GenotypesVCF.fname` - """ - vcf = VariantFile(str(gts.fname), mode="w") - # make sure the header is properly structured - for contig in set(gts.variants["chrom"]): - vcf.header.contigs.add(contig) - vcf.header.add_meta( - "INFO", - items=[ - ("ID", "START"), - ("Number", 1), - ("Type", "Integer"), - ("Description", "Inclusive start coodinate for the repetitive portion of the reference allele"), - ], - ) - vcf.header.add_meta( - "INFO", - items=[ - ("ID", "END"), - ("Number", 1), - ("Type", "Integer"), - ("Description", "Inclusive end coordinate for the repetitive portion of the reference allele"), - ], - ) - vcf.header.add_meta( - "INFO", - items=[ - ("ID", "PERIOD"), - ("Number", 1), - ("Type", "Integer"), - ("Description", "Length of STR motif"), - ], - ) - vcf.header.add_meta( - "FORMAT", - items=[ - ("ID", "GT"), - ("Number", 1), - ("Type", "String"), - ("Description", "Genotype"), - ], - ) - try: - vcf.header.add_samples(gts.samples) - except AttributeError: - gts.log.warning( - "Upgrade to pysam >=0.19.1 to reduce the time required to create " - "VCFs. 
See https://github.com/pysam-developers/pysam/issues/1104" - ) - for sample in gts.samples: - vcf.header.add_sample(sample) - gts.log.info("Writing VCF records") - phased = gts._prephased or (gts.data.shape[2] < 3) - missing_val = np.iinfo(np.uint8).max - for var_idx, var in enumerate(gts.variants): - rec = { - "contig": var["chrom"], - "start": var["pos"], - "stop": var["pos"] + len(var["alleles"][0]) - 1, - "qual": None, - "alleles": var["alleles"], - "id": var["id"], - "filter": None, - "info":{"START":var["pos"] - 1,"END":var["pos"] + len(var["alleles"][0]) - 1,"PERIOD":len(var["alleles"][0])}, - } - # handle pysam increasing the start site by 1 - rec["start"] -= 1 - # parse the record into a pysam.VariantRecord - record = vcf.new_record(**rec) - for samp_idx, sample in enumerate(gts.samples): - record.samples[sample]["GT"] = tuple( - None if val == missing_val else val - for val in gts.data[samp_idx, var_idx, :2] - ) - # add proper phasing info - if phased: - record.samples[sample].phased = True - else: - record.samples[sample].phased = gts.data[samp_idx, var_idx, 2] - # write the record to a file - vcf.write(record) - vcf.close() - - - -def create_sample_files(gts, intervals, num_vars): - variants = tuple(gts.variants["id"][:num_vars]) - sample_dir = gts.fname / "sample" - sample_dir.mkdir(parents=True, exist_ok=True) - for val in intervals: - samples = gts.samples[:val] - sub = gts.subset(samples=samples, variants=variants) - # write PLINK2 files - sub.fname = sample_dir / f"{val}.pgen" - # write VCF files - vcf = GenotypesTR(sub.fname.with_suffix(".vcf")) - vcf.data = sub.data - vcf.samples = sub.samples - vcf.variants = sub.variants - write_TR_VCF(vcf) - return sample_dir - - -def time_vcf(vcf, max_variants, chunk_size=500): - GenotypesVCF(vcf).read(max_variants=max_variants) - - -def time_plink(pgen, max_variants, chunk_size=500): - GenotypesPLINK(pgen).read(max_variants=max_variants) - - -def time_plink_chunk(pgen, max_variants, chunk_size=500): - GenotypesPLINK(pgen, chunk_size=chunk_size).read(max_variants=max_variants) - - -def progressbar(it, prefix="", size=60, out=sys.stdout): # Python3.6+ - count = len(it) - - def show(j): - x = int(size * j / count) - print( - f"{prefix}[{u'â–ˆ'*x}{('.'*(size-x))}] {j}/{count}", - end="\r", - file=out, - flush=True, - ) - - show(0) - for i, item in enumerate(it): - yield item - show(i + 1) - print("\n", flush=True, file=out) - - -@click.command() -@click.option( - "-t", - "--temp", - type=click.Path(path_type=Path, file_okay=False, dir_okay=True), - default=Path("bench_temp_dir"), - show_default=True, - help="A temp directory into which to place generated temporary files", -) -@click.option( - "--default-variants", - type=int, - default=4, - show_default=True, - help="The number of variants to use when we vary the number of samples", -) -@click.option( - "--default-samples", - type=int, - default=5, - show_default=True, - help="The number of samples to use when we vary the number of variants", -) -@click.option( - "--intervals-variants", - type=int, - nargs=3, - default=(1, 4, 1), - show_default=True, - help="The start, end, and step values for the x-axis of the variants plot", -) -@click.option( - "--intervals-samples", - type=int, - nargs=3, - default=(1, 5, 1), - show_default=True, - help="The start, end, and step values for the x-axis of the samples plot", -) -@click.option( - "--reps", - type=int, - default=3, - show_default=True, - help="For each benchmark value, we take the mean of --reps X replicates", -) -@click.option( - "-o", 
- "--output", - type=click.Path(path_type=Path), - default=Path("bench-plink.png"), - show_default=True, - help="A PNG file containing the results of the benchmark", -) -@click.option( - "-p", - "--progress", - is_flag=True, - default=False, - show_default=True, - help="Whether to display a progress bar. Useful for large benchmarks", -) -@click.option( - "-s", - "--silent", - is_flag=True, - default=False, - show_default=True, - help="Whether to be entirely silent", -) -@click.option( - "-a", - "--archive", - type=click.Path(path_type=Path), - default=None, - show_default="do not save generated results", - help="A python pickle file into which to store results", -) -def main( - temp, - default_variants, - default_samples, - intervals_variants, - intervals_samples, - reps, - output, - progress=False, - silent=False, - archive=None, -): - """ - Benchmarks classes in the data.genotypes module - """ - DEFAULT_VARIANTS, DEFAULT_SAMPLES, INTERVALS_VARIANTS, INTERVALS_SAMPLES, REPS = ( - default_variants, - default_samples, - range(*intervals_variants), - range(*intervals_samples), - reps, - ) - LOG = logging.getLogger("run", 0 if silent else "DEBUG") - gts = GenotypesPLINK(None, log=LOG) - num_variants = max(DEFAULT_VARIANTS, INTERVALS_VARIANTS.stop) - sample_size = max(DEFAULT_SAMPLES, INTERVALS_SAMPLES.stop) - gts.samples = tuple(f"sample{i}" for i in range(sample_size)) - gts.variants = np.array( - [(f"id{i}", "chr0", i, ("A", "T")) for i in range(1, num_variants + 1)], - dtype=gts.variants.dtype, - ) - np.random.seed(12345) - if not temp.exists(): - LOG.info("Generating fake genotypes") - gts.data = np.random.choice( - [True, False], size=sample_size * num_variants * 2 - ).reshape((sample_size, num_variants, 2)) - gts.fname = temp - # set initial variables - SAMPLES = gts.samples - VARIANTS = gts.variants["id"] - DEFAULT_SAMPLES = min(DEFAULT_SAMPLES, len(SAMPLES)) - DEFAULT_VARIANTS = min(DEFAULT_VARIANTS, len(VARIANTS)) - if INTERVALS_VARIANTS.stop > len(VARIANTS): - INTERVALS_VARIANTS = range( - INTERVALS_VARIANTS.start, len(VARIANTS), INTERVALS_VARIANTS.step - ) - INTERVALS_VARIANTS = list(INTERVALS_VARIANTS) - if INTERVALS_SAMPLES.stop > len(SAMPLES): - INTERVALS_SAMPLES = range( - INTERVALS_SAMPLES.start, len(SAMPLES), INTERVALS_SAMPLES.step - ) - INTERVALS_SAMPLES = list(INTERVALS_SAMPLES) - FILE_TYPES = { - "vcf": "VCF", - "pgen": "PLINK2", - "chunked200": "PLINK2 chunk_size: 200", - "chunked400": "PLINK2 chunk_size: 400", - "chunked600": "PLINK2 chunk_size: 600", - "chunked800": "PLINK2 chunk_size: 800", - } - - # create the files we will try to load if they haven't been created already - if not gts.fname.exists(): - LOG.info("Creating VCF and PGEN files that we can load") - variant_dir = create_variant_files(gts, INTERVALS_VARIANTS, DEFAULT_SAMPLES) - sample_dir = create_sample_files(gts, INTERVALS_SAMPLES, DEFAULT_VARIANTS) - else: - variant_dir = gts.fname / "variant" - sample_dir = gts.fname / "sample" - LOG.info( - "Temp directory already exists. 
Assuming files have already been created", - ) - - LOG.setLevel(level="ERROR") - - # run each test - LOG.info("Benchmarking the loading of each file") - results = {} - len_variants = len(VARIANTS) - for arg in ("samples", "variants"): - genotype_dir = variant_dir - if arg == "samples": - genotype_dir = sample_dir - results[arg] = {} - intervals = INTERVALS_SAMPLES if arg == "samples" else INTERVALS_VARIANTS - item_iter = ( - progressbar(intervals, prefix=f"{arg}, {file_type}: ", out=sys.stderr) - if progress - else intervals - ) - for file_type in FILE_TYPES.keys(): - results[arg][file_type] = [] - for val in item_iter: - chunk_size = 500 - if file_type == "vcf": - func = time_vcf - file = genotype_dir / f"{val}.vcf" - elif file_type == "pgen" or file_type.startswith("chunked"): - func = time_plink if file_type == "pgen" else time_plink_chunk - file = genotype_dir / f"{val}.pgen" - if file_type.startswith("chunked"): - chunk_size = int(file_type[len("chunked") :]) - else: - continue - times = np.empty(REPS, dtype=np.float64) - for rep in range(REPS): - start = process_time() - func(file, max_variants=len_variants, chunk_size=chunk_size) - end = process_time() - times[rep] = end - start - results[arg][file_type].append(times.mean()) - - # plot the results - LOG.info("Generating plot of results") - fig, (ax_samples, ax_variants) = plt.subplots(1, 2, figsize=(10, 5)) - lab_font = {"fontsize": "xx-small"} - for file_type in FILE_TYPES.keys(): - x_vals = INTERVALS_SAMPLES - y_vals = results["samples"][file_type] - # fit a line to each so that we can report the slope - slope = np.polyfit(x_vals, y_vals, 1)[0] - ax_samples.plot( - x_vals, - y_vals, - marker="o", - label=FILE_TYPES[file_type], - ) - ax_samples.text( - x_vals[-1], - y_vals[-1] + (y_vals[-1] / 16), - f"m = {slope:.3E}", - fontdict=lab_font, - ) - ax_samples.set_xlabel(f"Number of samples\nnum_variants = {DEFAULT_VARIANTS}") - ax_samples.set_ylim(ymin=0) - for file_type in FILE_TYPES.keys(): - x_vals = INTERVALS_VARIANTS - y_vals = results["variants"][file_type] - # fit a line to each so that we can report the slope - slope = np.polyfit(x_vals, y_vals, 1)[0] - ax_variants.plot( - x_vals, - y_vals, - marker="o", - ) - ax_variants.text( - x_vals[-1], - y_vals[-1] + (y_vals[-1] / 16), - f"m = {slope:.3E}", - fontdict=lab_font, - ) - ax_variants.set_xlabel(f"Number of variants\nnum_samples = {DEFAULT_SAMPLES}") - ax_variants.set_ylim(ymin=0) - fig.supylabel("CPU Time (s)") - fig.legend(loc="lower left", fontsize="xx-small") - fig.tight_layout() - fig.savefig(str(output), bbox_inches="tight", dpi=400) - - # saving results - if archive is not None: - with open(archive, "wb") as fh: - pickle.dump(results, fh) - - -def test_bench_genotypes(): - tmp_dir = Path("bench_temp_dir") - tmp_plot = Path("bench-plink.png") - - try: - main(["-t", tmp_dir, "-o", tmp_plot, "--reps", 1, "-s"]) - except SystemExit as err: - # re-raise unless main() finished without an error - if err.code: - raise - - shutil.rmtree(tmp_dir) - tmp_plot.unlink() - - -if __name__ == "__main__": - main() From 47cdf1317ab2a149ca7f21173e676f3746734463 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 10 Aug 2023 21:26:18 -0700 Subject: [PATCH 16/40] fix progress bar in bench_genotypes.py --- tests/bench_genotypes.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/bench_genotypes.py b/tests/bench_genotypes.py index 1a7e7c07..c378757b 100755 --- a/tests/bench_genotypes.py +++ 
b/tests/bench_genotypes.py @@ -436,12 +436,12 @@ def main( genotype_dir = sample_dir results[arg] = {} intervals = INTERVALS_SAMPLES if arg == "samples" else INTERVALS_VARIANTS - item_iter = ( - progressbar(intervals, prefix=f"{arg}, {file_type}: ", out=sys.stderr) - if progress - else intervals - ) for file_type in FILE_TYPES.keys(): + item_iter = ( + progressbar(intervals, prefix=f"{arg}, {file_type}: ", out=sys.stderr) + if progress + else intervals + ) results[arg][file_type] = [] for val in item_iter: chunk_size = 500 From 939353eadb228e239e2e9aacbaa4e53f44d09c0c Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 11 Aug 2023 09:57:52 -0700 Subject: [PATCH 17/40] avoid using GetLengthGenotypes for speedup --- haptools/data/genotypes.py | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index fb863920..b5105d41 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1616,8 +1616,39 @@ def read( See documentation for :py:attr:`~.GenotypesVCF.read` """ super().read(region,samples,variants,max_variants) - for idx, record in enumerate(self._iter_TRRecords(region)): - self.data[:, idx] = record.GetLengthGenotypes() + num_variants = len(self.variants) + vcf = VCF(self.fname.with_suffix(".pvar")) + tr_records = self.trh.TRRecordHarmonizer( + vcffile=vcf, vcfiter=vcf(region), region=region, vcftype=self.vcftype, + ) + # filter out TRs that we didn't want + if variants is not None: + tr_records = filter(lambda rec: rec.record_id in variants, tr_records) + # initialize a jagged array of allele lengths + max_num_alleles = max(map(len, self.variants["alleles"])) + allele_lens = np.empty((len(self.variants), max_num_alleles), dtype=self.data.dtype) + # iterate through each TR and extract the REF and ALT allele lengths + for idx, record in enumerate(tr_records): + if idx > num_variants: + # exit early if we've already found all the variants + break + # extract allele lengths from TRRecord object + allele_lens[idx, 0] = record.ref_allele_length + num_alleles = len(record.alt_allele_lengths) + 1 + allele_lens[idx, 1:num_alleles] = record.alt_allele_lengths + # record missing entries and then set them all to REF + missing = self.data[:, :, :2] == np.iinfo(np.uint8).max + self.data[:, :, :2][missing] = 0 + # TODO: use broadcasting instead of this for loop + for idx in range(num_variants): + # convert from genotype indices to allele lengths + self.data[:, idx, :2] = allele_lens[idx, self.data[:, idx, :2]] + # restore missing entries + self.data[:, :, :2][missing] = np.iinfo(np.uint8).max + # clean up memory + del missing + del allele_lens + gc.collect() def _iterate( self, From f42acff18b92b0751fb5b78afe8fa66b9e23b47e Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 11 Aug 2023 10:52:27 -0700 Subject: [PATCH 18/40] extract TRRecords iteration to a separate method --- haptools/data/genotypes.py | 67 +++++++++++++++----------------------- 1 file changed, 27 insertions(+), 40 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index b5105d41..a66d3945 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1521,22 +1521,6 @@ class GenotypesPLINKTR(GenotypesPLINK): def __init__(self, fname: Path | str, log: Logger = None, chunk_size: int = None , vcftype: str = "auto"): super().__init__(fname, log, chunk_size) self.vcftype = vcftype - 
self._import_copy_trh_module() - - def _import_copy_trh_module(self): - """ - Import a copy of the tr_harmonizer module into the "trh" property of this class - Subsequent edits to the module will remain scoped to the class, only - - This code is adapted from https://stackoverflow.com/a/11285504/16815703 - """ - try: - SPEC = importlib.util.find_spec("trtools.utils.tr_harmonizer") - except ModuleNotFoundError: - SPEC = trh.__spec__ - self.trh = importlib.util.module_from_spec(SPEC) - SPEC.loader.exec_module(self.trh) - @classmethod def load( @@ -1577,21 +1561,30 @@ def load( genotypes.check_phase() return genotypes - - - def _iter_TRRecords(self,region: str = None): - gts_obj = self - def GetGenotypeIndicies(self): - n = np.argmax(self.vcfrecord.ID == gts_obj.variants["id"]) - gts_data = gts_obj.data[:, n].astype(np.int8) - gts_data[gts_data == np.iinfo(np.uint8).max] = -1 - return gts_data - self.trh.TRRecord.GetGenotypeIndicies = GetGenotypeIndicies - vcf = VCF(self.fname.with_suffix(".pvar")) - yield from self.trh.TRRecordHarmonizer( - vcffile = vcf, vcfiter= vcf(region), region =region , vcftype =self.vcftype - ) + def _iter_TRRecords(self, region: str = None, variants: set[str] = None): + """ + Yield TRRecord objects from the PVAR file + Parameters + ---------- + region : str, optional + See documentation for :py:attr:`~.GenotypesVCF.read` + variants : set[str], optional + See documentation for :py:attr:`~.GenotypesVCF.read` + + Yields + ------ + Iterator[trh.TRRecord] + An iterator over each line of the PVAR file + """ + vcf = VCF(self.fname.with_suffix(".pvar")) + tr_records = trh.TRRecordHarmonizer( + vcffile=vcf, vcfiter=vcf(region), region=region, vcftype=self.vcftype, + ) + # filter out TRs that we didn't want + if variants is not None: + tr_records = filter(lambda rec: rec.record_id in variants, tr_records) + yield from tr_records def read( self, @@ -1617,18 +1610,11 @@ def read( """ super().read(region,samples,variants,max_variants) num_variants = len(self.variants) - vcf = VCF(self.fname.with_suffix(".pvar")) - tr_records = self.trh.TRRecordHarmonizer( - vcffile=vcf, vcfiter=vcf(region), region=region, vcftype=self.vcftype, - ) - # filter out TRs that we didn't want - if variants is not None: - tr_records = filter(lambda rec: rec.record_id in variants, tr_records) # initialize a jagged array of allele lengths max_num_alleles = max(map(len, self.variants["alleles"])) allele_lens = np.empty((len(self.variants), max_num_alleles), dtype=self.data.dtype) # iterate through each TR and extract the REF and ALT allele lengths - for idx, record in enumerate(tr_records): + for idx, record in enumerate(self._iter_TRRecords(region, variants)): if idx > num_variants: # exit early if we've already found all the variants break @@ -1676,8 +1662,9 @@ def _iterate( An iterator over each line in the file, where each line is encoded as a namedtuple containing each of the class properties """ - variants = super()._iterate(pgen,region,variants) - for variant, record in zip(variants, self._iter_TRRecords(region)): + tr_records = self._iter_TRRecords(region, variants) + variants = super()._iterate(pgen, region, variants) + for variant, record in zip(variants, tr_records): variant.data = record.GetLengthGenotypes() yield variant From ef440d6446e6c9f8f9eb012264ec961278fcd7fa Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 11 Aug 2023 10:59:17 -0700 Subject: [PATCH 19/40] fix some small syntax things --- haptools/data/genotypes.py | 10 ++++++---- 
tests/test_data.py | 5 ++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index a66d3945..41d9d944 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1249,7 +1249,7 @@ def read( pv = pgenlib.PvarReader(bytes(str(self.fname.with_suffix(".pvar")), "utf8")) with pgenlib.PgenReader( - bytes(str(self.fname), "utf8"), sample_subset=sample_idxs, pvar= pv + bytes(str(self.fname), "utf8"), sample_subset=sample_idxs, pvar = pv ) as pgen: # how many variants to load? if variants is not None: @@ -1402,9 +1402,11 @@ def __iter__( super(Genotypes, self).read() import pgenlib + pv = pgenlib.PvarReader(bytes(str(self.fname.with_suffix(".pvar")), "utf8")) + sample_idxs = self.read_samples(samples) pgen = pgenlib.PgenReader( - bytes(str(self.fname), "utf8"), sample_subset=sample_idxs + bytes(str(self.fname), "utf8"), sample_subset=sample_idxs, pvar = pv ) # call another function to force the lines above to be run immediately # see https://stackoverflow.com/a/36726497 @@ -1595,7 +1597,7 @@ def read( ): """ Read genotypes from a PGEN file into a numpy matrix stored in - :py:attr:`~.GenotypesPLINK.data` + :py:attr:`~.GenotypesPLINKTR.data` Parameters ---------- @@ -1645,7 +1647,7 @@ def _iterate( """ A generator over the lines of a PGEN-PVAR file pair - This is a helper function for :py:meth:`~.GenotypesPLINK.__iter__` + This is a helper function for :py:meth:`~.GenotypesPLINKTR.__iter__` Parameters ---------- diff --git a/tests/test_data.py b/tests/test_data.py index 326e5791..c6de81d7 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -5,7 +5,6 @@ import pytest import numpy as np import numpy.lib.recfunctions as rfn -import pgenlib from haptools.sim_phenotype import Haplotype as HaptoolsHaplotype from haptools.sim_phenotype import Repeat as HaptoolsRepeat from haptools.data import ( @@ -658,7 +657,7 @@ def _get_fake_genotypes_multiallelic(self): def test_iter(self): # Get the expected data - expected = self. 
_get_fake_genotypes_multiallelic() + expected = self._get_fake_genotypes_multiallelic() gts = GenotypesPLINKTR(DATADIR / "simple-tr-valid.pgen") # Check that everything matches what we expected for idx, line in enumerate(gts): @@ -671,7 +670,7 @@ def test_iter(self): assert gts.samples == expected.samples def test_read_plinktr(self): - expected_alleles =self._get_fake_genotypes_multiallelic().data + expected_alleles = self._get_fake_genotypes_multiallelic().data gts = GenotypesPLINKTR(DATADIR / "simple-tr-valid.pgen") gts.read() # check genotypes From 8b8fcffe4d2c5e7a1f0d1abb5d99f83618de5296 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 11 Aug 2023 15:05:15 -0700 Subject: [PATCH 20/40] improve scaling of GenotypesPLINKTR as we scale the number of variants by using broadcasting across the number of variants --- haptools/data/genotypes.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 41d9d944..614e74ad 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1627,10 +1627,9 @@ def read( # record missing entries and then set them all to REF missing = self.data[:, :, :2] == np.iinfo(np.uint8).max self.data[:, :, :2][missing] = 0 - # TODO: use broadcasting instead of this for loop - for idx in range(num_variants): - # convert from genotype indices to allele lengths - self.data[:, idx, :2] = allele_lens[idx, self.data[:, idx, :2]] + # convert from genotype indices to allele lengths + variant_coords = np.arange(num_variants)[:, np.newaxis] + self.data[:, :, :2] = allele_lens[variant_coords, self.data[:, :, :2]] # restore missing entries self.data[:, :, :2][missing] = np.iinfo(np.uint8).max # clean up memory From e9137f5c125009b89e04ae3730f911a4b7427990 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 11 Aug 2023 15:26:07 -0700 Subject: [PATCH 21/40] fix failing test for GenotypesPLINKTR._iterate --- haptools/data/genotypes.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 614e74ad..06210dc5 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1666,7 +1666,15 @@ def _iterate( tr_records = self._iter_TRRecords(region, variants) variants = super()._iterate(pgen, region, variants) for variant, record in zip(variants, tr_records): - variant.data = record.GetLengthGenotypes() + # extract the REF and ALT allele lengths + allele_lens = np.array([record.ref_allele_length, *record.alt_allele_lengths], dtype=variant.data.dtype) + # record missing entries and then set them all to REF + missing = variant.data[:, :2] == np.iinfo(np.uint8).max + variant.data[:, :2][missing] = 0 + # convert from genotype indices to allele lengths + variant.data[:, :2] = allele_lens[variant.data[:, :2]] + # restore missing entries + variant.data[:, :2][missing] = np.iinfo(np.uint8).max yield variant def write(self): From b7973db148b1aa18f02ed843a1bda2bbaadc085f Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sun, 13 Aug 2023 11:59:05 -0700 Subject: [PATCH 22/40] use numpy array method of cyvcf2 for slight speedup --- haptools/data/genotypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 06210dc5..9d0073cf 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ 
-259,7 +259,7 @@ def _return_data(self, variant: Variant): data: npt.NDArray[np.uint8] Numpy array storing all genotypes """ - return np.array(variant.genotypes, dtype=np.uint8) + return variant.genotype.array().astype(np.uint8) def _iterate(self, vcf: VCF, region: str = None, variants: set[str] = None): """ From 8bdd4c7f37944feb6ee037ec6e3bd478bf0cbc1d Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sun, 13 Aug 2023 12:00:36 -0700 Subject: [PATCH 23/40] ensure all imports are relative within haptools/ --- haptools/index.py | 3 +-- haptools/ld.py | 2 +- haptools/transform.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/haptools/index.py b/haptools/index.py index 3a71149f..305e6c72 100644 --- a/haptools/index.py +++ b/haptools/index.py @@ -6,9 +6,8 @@ from pysam import tabix_index -from haptools import data +from . import data from .logging import getLogger -from haptools.data.haplotypes import Haplotypes def append_suffix( diff --git a/haptools/ld.py b/haptools/ld.py index 748a8e3b..be28ca14 100644 --- a/haptools/ld.py +++ b/haptools/ld.py @@ -5,7 +5,7 @@ import numpy as np import numpy.typing as npt -from haptools import data +from . import data from .logging import getLogger from .data import Haplotype as HaplotypeBase diff --git a/haptools/transform.py b/haptools/transform.py index b806cc37..df70f6cb 100644 --- a/haptools/transform.py +++ b/haptools/transform.py @@ -9,7 +9,7 @@ from cyvcf2 import VCF, Variant from pysam import VariantFile -from haptools import data +from . import data from .logging import getLogger From 14be007d5c2c02c9d4f6e97a4c3d03391f1b3113 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sun, 13 Aug 2023 13:35:33 -0700 Subject: [PATCH 24/40] finish writing __iter__() and write() TR tests --- tests/test_data.py | 63 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 5 deletions(-) diff --git a/tests/test_data.py b/tests/test_data.py index c6de81d7..a9bb00f0 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -333,7 +333,7 @@ def _get_fake_genotypes_multiallelic(self): def _get_fake_genotypes_multiallelic_tr(self): pgenlib = pytest.importorskip("pgenlib") - gts_tr = GenotypesTR(DATADIR / "simple_tr.vcf") + gts_tr = GenotypesTR(DATADIR / "simple-tr-valid.vcf") gts_tr.read() gts = GenotypesPLINK(fname="") @@ -461,6 +461,21 @@ def test_load_genotypes_iterate(self): ) assert gts.samples == expected.samples + def test_iter_multiallelic_tr(self): + expected = self._get_fake_genotypes_multiallelic_tr() + + gts = GenotypesPLINK(DATADIR / "simple-tr-valid.pgen") + + # Check that everything matches what we expected + for idx, line in enumerate(gts): + np.testing.assert_allclose(line.data[:, :3], expected.data[:, idx]) + for col in ("chrom", "pos", "id"): + assert line.variants[col] == expected.variants[col][idx] + assert line.variants["alleles"].tolist() == expected.variants["alleles"][idx] + + # Check samples + assert gts.samples == expected.samples + def test_load_genotypes_subset(self): expected = self._get_fake_genotypes_plink() @@ -604,7 +619,33 @@ def test_write_multiallelic(self): new_gts.read() new_gts.check_phase() - # Verify tqhat the uploaded data matches the original data + # Verify that the uploaded data matches the original data + np.testing.assert_allclose(gts_multiallelic.data, new_gts.data) + assert gts_multiallelic.samples == new_gts.samples + for i in range(len(new_gts.variants)): + for col in ("chrom", 
"pos", "id", "alleles"): + assert gts_multiallelic.variants[col][i] == new_gts.variants[col][i] + + #clean the files created after the test + fname.with_suffix(".psam").unlink() + fname.with_suffix(".pvar").unlink() + fname.unlink() + + def test_write_multiallelic_tr(self): + # Create fake multi-allelic genotype data to write to the PLINK file + gts_multiallelic = self._get_fake_genotypes_multiallelic_tr() + # Name of the PLINK file where the data will be written + fname = DATADIR / "test_write_multiallelic_tr.pgen" + + # Write multiallelic genotype data to the PLINK file + gts_multiallelic.fname = fname + gts_multiallelic.write() + + #Read the data from the newly created PLINK file + new_gts = GenotypesPLINK(fname) + new_gts.read() + + # Verify that the uploaded data matches the original data np.testing.assert_allclose(gts_multiallelic.data, new_gts.data) assert gts_multiallelic.samples == new_gts.samples for i in range(len(new_gts.variants)): @@ -669,7 +710,7 @@ def test_iter(self): # Check samples assert gts.samples == expected.samples - def test_read_plinktr(self): + def test_read(self): expected_alleles = self._get_fake_genotypes_multiallelic().data gts = GenotypesPLINKTR(DATADIR / "simple-tr-valid.pgen") gts.read() @@ -1764,8 +1805,8 @@ def test_merge_variants_vcf(self): class TestGenotypesTR: - def test_read_tr(self): - expected_alleles = np.array( + def _get_fake_tr_alleles(self): + return np.array( [ [[1, 2, 1], [3, 4, 1], [5, 6, 1], [7, 8, 1], [9, 0, 1]], [[3, 1, 1], [3, 1, 1], [1, 1, 1], [1, 1, 1], [3, 3, 1]], @@ -1776,12 +1817,24 @@ def test_read_tr(self): dtype=np.uint8, ).transpose((1, 0, 2)) + def test_read(self): + expected_alleles = self._get_fake_tr_alleles() + gts = GenotypesTR(DATADIR / "simple_tr.vcf") gts.read() # check genotypes np.testing.assert_allclose(expected_alleles, gts.data) + def test_iter(self): + # Get the expected data + expected = self._get_fake_tr_alleles() + + gts = GenotypesTR(DATADIR / "simple_tr.vcf") + # Check that everything matches what we expected + for idx, line in enumerate(gts): + np.testing.assert_allclose(line.data[:, :3], expected[:, idx]) + class TestBreakpoints: def _get_expected_breakpoints(self): From 9253bb06ce8670447f4e3d7714cf1b80503c9ec1 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sun, 13 Aug 2023 22:55:42 -0700 Subject: [PATCH 25/40] add edge case from plink2-users discussion where there is an extra ALT allele that isn't used within any of the genotypes --- tests/data/simple-tr-valid.pgen | Bin 53 -> 54 bytes tests/data/simple-tr-valid.pvar | 4 ++-- tests/data/simple-tr-valid.vcf | 2 +- tests/test_data.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/data/simple-tr-valid.pgen b/tests/data/simple-tr-valid.pgen index 317aa19f140151892387be304e15acaef19a2965..b852ca9b9cebe6b0dbbf9d8f85a263f156012b66 100644 GIT binary patch delta 31 mcmXptn;;{_$+VJ*pI^~AwVYuo0|Uc6W(Gz^W@7=SbIbr>$pxnX delta 30 lcmXproggF0!L*WzpI^~AwVYuo0|Uc6W(Gz^W`Cx0%m7>i1+f4C diff --git a/tests/data/simple-tr-valid.pvar b/tests/data/simple-tr-valid.pvar index 416b02e7..f7c2b3c1 100644 --- a/tests/data/simple-tr-valid.pvar +++ b/tests/data/simple-tr-valid.pvar @@ -1,5 +1,5 @@ ##fileformat=VCFv4.3 -##fileDate=20230724 +##fileDate=20230813 ##source=PLINKv2.00 ##command=HipSTR-v0.7 --test ##filedate=20180225 @@ -12,5 +12,5 @@ 1 10114 1:10114:GTT GTT GTTGTT,GTTGTTGTT,GTTGTTGTTGTT,GTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTTGTTGTT . . 
START=10114;END=10999;PERIOD=3 1 11000 1:10116:ACACAC ACACAC AC . . START=11000;END=11016;PERIOD=2 1 12117 1:10117:AAA AAA AAAA . . START=12117;END=12143;PERIOD=1 -1 13122 1:10122:GTGT GTGT GTGTGT,GTGTGTGT,GTGTGTGTGTGTGT,GTGTGTGTGTGTGTGTGTGTGT . . START=13122;END=13150;PERIOD=2 +1 13122 1:10122:GTGT GTGT GTGTGT,GTGTGTGT,GTGTGTGTGTGTGT,GTGTGTGTGTGTGTGTGTGTGT,GT . . START=13122;END=13150;PERIOD=2 X 20122 1:20122:GGG GGG GGGGG . . START=20122;END=20150;PERIOD=1 diff --git a/tests/data/simple-tr-valid.vcf b/tests/data/simple-tr-valid.vcf index 396aaf08..cf4ece04 100644 --- a/tests/data/simple-tr-valid.vcf +++ b/tests/data/simple-tr-valid.vcf @@ -11,5 +11,5 @@ 1 10114 1:10114:GTT GTT GTTGTT,GTTGTTGTT,GTTGTTGTTGTT,GTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTTGTTGTT . . START=10114;END=10999;PERIOD=3 GT 0|1 2|3 4|5 6|7 8|8 1 11000 1:10116:ACACAC ACACAC AC . . START=11000;END=11016;PERIOD=2 GT 0|1 0|1 1|1 1|1 0|0 1 12117 1:10117:AAA AAA AAAA . . START=12117;END=12143;PERIOD=1 GT 0|0 0|0 0|0 0|0 0|0 -1 13122 1:10122:GTGT GTGT GTGTGT,GTGTGTGT,GTGTGTGTGTGTGT,GTGTGTGTGTGTGTGTGTGTGT . . START=13122;END=13150;PERIOD=2 GT 4|4 . 3|0 1|2 4|. +1 13122 1:10122:GTGT GTGT GTGTGT,GTGTGTGT,GTGTGTGTGTGTGT,GTGTGTGTGTGTGTGTGTGTGT,GT . . START=13122;END=13150;PERIOD=2 GT 4|4 . 3|0 1|2 4|. X 20122 1:20122:GGG GGG GGGGG . . START=20122;END=20150;PERIOD=1 GT 1 . 0 . . diff --git a/tests/test_data.py b/tests/test_data.py index a9bb00f0..d9c90c31 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -367,7 +367,7 @@ def _get_fake_genotypes_multiallelic_tr(self): tuple("GTT" * i for i in range(1, 10)), ("ACACAC", "AC"), ("AAA", "AAAA"), - ("GTGT", "GTGTGT", "GTGTGTGT", "GTGTGTGTGTGTGT", "GTGTGTGTGTGTGTGTGTGTGT"), + ("GTGT", "GTGTGT", "GTGTGTGT", "GTGTGTGTGTGTGT", "GTGTGTGTGTGTGTGTGTGTGT", "GT"), ("GGG", "GGGGG"), ] gts.variants = np.array( From 436878f4b36d168df347998963a23053121cbf0d Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sun, 13 Aug 2023 22:57:06 -0700 Subject: [PATCH 26/40] transpose phasing before concatenating dummy data --- tests/test_data.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_data.py b/tests/test_data.py index d9c90c31..6cbbc161 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -679,13 +679,11 @@ def _get_fake_genotypes_multiallelic(self): [[5, na], [na, na], [3, na], [na, na], [na, na]], ], dtype=np.uint8, - ) + ).transpose((1, 0, 2)) # append the phasing info, transpose, and clean up phasing = gts_plink.data[:, :, 2][:, :, np.newaxis] - phasing[[3,4], 1] = 0 - phasing[1, [3,4]] = 1 - gts.data = np.concatenate((data, phasing), axis=-1).transpose((1, 0, 2)) + gts.data = np.concatenate((data, phasing), axis=-1) # handle half-calls and chrX according to the flag: # --vcf-half-call m From 8a8a11020158624b42301a5a5c632e63343217d8 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 14 Aug 2023 14:57:53 -0700 Subject: [PATCH 27/40] rename simple-tr-valid test files to simple-tr --- tests/data/{simple-tr-valid.pgen => simple-tr.pgen} | Bin tests/data/{simple-tr-valid.psam => simple-tr.psam} | 0 tests/data/{simple-tr-valid.pvar => simple-tr.pvar} | 0 tests/data/{simple-tr-valid.vcf => simple-tr.vcf} | 0 tests/test_data.py | 10 +++++----- 5 files changed, 5 insertions(+), 5 deletions(-) rename tests/data/{simple-tr-valid.pgen => simple-tr.pgen} (100%) rename tests/data/{simple-tr-valid.psam => 
simple-tr.psam} (100%) rename tests/data/{simple-tr-valid.pvar => simple-tr.pvar} (100%) rename tests/data/{simple-tr-valid.vcf => simple-tr.vcf} (100%) diff --git a/tests/data/simple-tr-valid.pgen b/tests/data/simple-tr.pgen similarity index 100% rename from tests/data/simple-tr-valid.pgen rename to tests/data/simple-tr.pgen diff --git a/tests/data/simple-tr-valid.psam b/tests/data/simple-tr.psam similarity index 100% rename from tests/data/simple-tr-valid.psam rename to tests/data/simple-tr.psam diff --git a/tests/data/simple-tr-valid.pvar b/tests/data/simple-tr.pvar similarity index 100% rename from tests/data/simple-tr-valid.pvar rename to tests/data/simple-tr.pvar diff --git a/tests/data/simple-tr-valid.vcf b/tests/data/simple-tr.vcf similarity index 100% rename from tests/data/simple-tr-valid.vcf rename to tests/data/simple-tr.vcf diff --git a/tests/test_data.py b/tests/test_data.py index 6cbbc161..4b7394fc 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -333,7 +333,7 @@ def _get_fake_genotypes_multiallelic(self): def _get_fake_genotypes_multiallelic_tr(self): pgenlib = pytest.importorskip("pgenlib") - gts_tr = GenotypesTR(DATADIR / "simple-tr-valid.vcf") + gts_tr = GenotypesTR(DATADIR / "simple-tr.vcf") gts_tr.read() gts = GenotypesPLINK(fname="") @@ -408,7 +408,7 @@ def test_load_genotypes_multiallelic(self): def test_load_genotypes_multiallelic_tr(self): expected = self._get_fake_genotypes_multiallelic_tr() - gts = GenotypesPLINK(DATADIR / "simple-tr-valid.pgen") + gts = GenotypesPLINK(DATADIR / "simple-tr.pgen") gts.read() # check that everything matches what we expected @@ -464,7 +464,7 @@ def test_load_genotypes_iterate(self): def test_iter_multiallelic_tr(self): expected = self._get_fake_genotypes_multiallelic_tr() - gts = GenotypesPLINK(DATADIR / "simple-tr-valid.pgen") + gts = GenotypesPLINK(DATADIR / "simple-tr.pgen") # Check that everything matches what we expected for idx, line in enumerate(gts): @@ -697,7 +697,7 @@ def _get_fake_genotypes_multiallelic(self): def test_iter(self): # Get the expected data expected = self._get_fake_genotypes_multiallelic() - gts = GenotypesPLINKTR(DATADIR / "simple-tr-valid.pgen") + gts = GenotypesPLINKTR(DATADIR / "simple-tr.pgen") # Check that everything matches what we expected for idx, line in enumerate(gts): np.testing.assert_allclose(line.data[:, :3], expected.data[:, idx]) @@ -710,7 +710,7 @@ def test_iter(self): def test_read(self): expected_alleles = self._get_fake_genotypes_multiallelic().data - gts = GenotypesPLINKTR(DATADIR / "simple-tr-valid.pgen") + gts = GenotypesPLINKTR(DATADIR / "simple-tr.pgen") gts.read() # check genotypes np.testing.assert_allclose(expected_alleles, gts.data) From 61e7163e9b249f73809822dda0d157ec3252e313 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 15 Aug 2023 08:13:37 -0700 Subject: [PATCH 28/40] handle allele_cts parameter when writing PGEN files --- haptools/data/genotypes.py | 43 ++++++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 9d0073cf..3ee442f2 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1436,7 +1436,7 @@ def write_variants(self): for contig in set(self.variants["chrom"]): vcf.header.contigs.add(contig) self.log.info("Writing PVAR records") - for var_idx, var in enumerate(self.variants): + for var in self.variants: rec = { "contig": var["chrom"], "start": var["pos"], @@ -1453,6 +1453,34 @@ 
def write_variants(self): # write the record to a file vcf.write(record) + def _num_unique_alleles(self, arr: npt.NDArray): + """ + Obtain the number of unique elements in each row of a genotype matrix + Needed by https://groups.google.com/g/plink2-users/c/Sn5qVCyDlDw/m/GOWScY6tAQAJ + + Adapted from https://stackoverflow.com/a/46575580 + + Parameters + ---------- + arr : npt.NDArray + A genotype matrix of shape (num_variants, num_samples, 2) and type np.uint8 + + Returns + ------- + npt.NDArray + An array of shape (num_variants,) and type np.uint32 containing allele + counts for each variant + """ + nrows = arr.shape[0] + row_coords = np.arange(nrows)[:, np.newaxis, np.newaxis] + allele_cts = np.zeros((nrows, np.iinfo(np.uint8).max+1, 2), dtype=np.bool_) + # mark whichever allele indices appear in arr then sum them to obtain counts + allele_cts[row_coords, arr, np.arange(2)] = 1 + allele_cts = allele_cts.any(axis=2).sum(axis=1, dtype=np.uint32) + # there are always at least two alleles + allele_cts[allele_cts < 2] = 2 + return allele_cts + def write(self): """ Write the variants in this class to PLINK2 files at @@ -1475,7 +1503,6 @@ def write(self): # write the pgen file pv = pgenlib.PvarReader(bytes(str(self.fname.with_suffix(".pvar")), "utf8")) with pgenlib.PgenWriter( - filename=bytes(str(self.fname), "utf8"), sample_ct=len(self.samples), variant_ct=len(self.variants), @@ -1497,6 +1524,9 @@ def write(self): missing = np.ascontiguousarray( data[start:end] == np.iinfo(np.uint8).max ) + # obtain the number of unique alleles for each variant + # https://stackoverflow.com/a/46575580 + allele_cts = self._num_unique_alleles(data[start:end]) subset_data = np.ascontiguousarray(data[start:end], dtype=np.int32) except np.core._exceptions._ArrayMemoryError as e: raise ValueError( @@ -1507,12 +1537,17 @@ def write(self): missing.resize((len(self.variants), len(self.samples) * 2)) # convert any missing genotypes to -9 subset_data[missing] = -9 + # finally, append the genotypes to the PGEN file if self._prephased or self.data.shape[2] < 3: - pgen.append_alleles_batch(subset_data, all_phased=True) + pgen.append_alleles_batch( + subset_data, all_phased=True, allele_cts=allele_cts, + ) else: # TODO: figure out why this sometimes leads to a corrupted file? subset_phase = self.data[:, start:end, 2].T.copy(order="C") - pgen.append_partially_phased_batch(subset_data, subset_phase) + pgen.append_partially_phased_batch( + subset_data, subset_phase, allele_cts=allele_cts, + ) del subset_phase del subset_data del missing From a32e1c3d7293ce3fc30d25b9a630a8a453b66463 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 15 Aug 2023 08:52:23 -0700 Subject: [PATCH 29/40] docs: the new GenotypesPLINKTR class also explain how to convert TR VCFs to PGEN --- docs/api/data.rst | 25 ++++++++++++++++++++++- docs/formats/genotypes.rst | 19 +++++++++++------ haptools/data/genotypes.py | 42 ++++++++++++++++++++++++++------------ 3 files changed, 66 insertions(+), 20 deletions(-) diff --git a/docs/api/data.rst b/docs/api/data.rst index e5a98e62..6be633c0 100644 --- a/docs/api/data.rst +++ b/docs/api/data.rst @@ -168,7 +168,7 @@ All of the other methods in the :class:`Genotypes` class are inherited, but the GenotypesTR ++++++++++++ -The :class:`GenotypesTR` class *extends* :class:`Genotypes` class. The :class:`GenotypesTR` class follows the same structure of :class:`GenotypesVCF`, but can now load repeat count of tandem repeats as the alleles. 
+The :class:`GenotypesTR` class *extends* the :class:`Genotypes` class. The :class:`GenotypesTR` class follows the same structure as :class:`GenotypesVCF`, but can now load repeat counts of tandem repeats as the alleles. All of the other methods in the :class:`Genotypes` class are inherited, but the :class:`GenotypesTR` class' ``load()`` function is unique to loading tandem repeat variants. @@ -178,6 +178,11 @@ All of the other methods in the :class:`Genotypes` class are inherited, but the # make the first sample have 4 and 7 repeats for the alleles of the fourth variant genotypes.data[0, 3] = (4, 7) +The following methods from the :class:`Genotypes` class are disabled, however. + +1. ``check_biallelic`` +2. ``check_maf`` + .. _api-data-genotypestr: GenotypesPLINK @@ -239,6 +244,24 @@ A large ``chunk_size`` is more likely to result in memory over-use while a small genotypes = data.GenotypesPLINK('tests/data/simple.pgen', chunk_size=500) genotypes.read() +GenotypesPLINKTR +++++++++++++++++ +The :class:`GenotypesPLINKTR` class extends the :class:`GenotypesPLINK` class to support loading tandem repeat variants. +The :class:`GenotypesPLINKTR` class works similarly to :class:`GenotypesTR` by filling the ``data`` property with repeat counts for each allele. + +The following methods from the :class:`GenotypesPLINK` class are disabled, however. + +1. ``write`` +2. ``check_maf`` +3. ``write_variants`` +4. ``check_biallelic`` + +The :class:`GenotypesPLINKTR` class uses INFO fields from the PVAR file to determine the repeat unit and the number of repeats for each allele. To ensure your PVAR file contains the necessary information, use the following command when converting from VCF. + +.. code-block:: bash + + plink2 --vcf-half-call m --make-pgen 'pvar-cols=vcfheader,qual,filter,info' --vcf input.vcf --out output + haplotypes.py ~~~~~~~~~~~~~ Overview diff --git a/docs/formats/genotypes.rst b/docs/formats/genotypes.rst index 2657c5f0..5828926e 100644 --- a/docs/formats/genotypes.rst +++ b/docs/formats/genotypes.rst @@ -9,14 +9,14 @@ Genotypes The time required to load various genotype file formats. VCF/BCF -------- +~~~~~~~ Genotype files must be specified as VCF or BCF files. They can be bgzip-compressed. .. _formats-genotypesplink: PLINK2 PGEN ------------ +~~~~~~~~~~~ There is also experimental support for `PLINK2 PGEN `_ files in some commands. These files can be loaded and created much more quickly than VCFs, so we highly recommend using them if you're working with large datasets. See the documentation for the :class:`GenotypesPLINK` class in :ref:`the API docs ` for more information. @@ -29,9 +29,16 @@ If you run out memory when using PGEN files, consider reading/writing variants f pip install haptools[files] -.. warning:: - At the moment, only biallelic SNPs can be encoded in PGEN files because of limitations in the ``Pgenlib`` python library. It doesn't properly support multiallelic variants yet (`source `_). To ensure your PGEN files only contain SNPs, we recommend use the following command to convert from VCF to PGEN. +Converting from VCF to PGEN +--------------------------- +To convert a VCF containing only biallelic SNPs to PGEN, use the following command. - .. code-block:: bash +.. code-block:: bash + + plink2 --snps-only 'just-acgt' --max-alleles 2 --vcf input.vcf --make-pgen --out output + +To convert a VCF containing tandem repeats to PGEN, use this command, instead. + +.. code-block:: bash - plink2 --snps-only 'just-acgt' --max-alleles 2 --vcf input.vcf --make-pgen --out output + plink2 --vcf-half-call m --make-pgen 'pvar-cols=vcfheader,qual,filter,info' --vcf input.vcf --out output
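As a quick sketch of the intended usage (assuming ``output.pgen`` is the file produced by the conversion command above; the class and its ``load()`` method are the ones added in this patch series):

.. code-block:: python

    from haptools.data import GenotypesPLINKTR

    # load TR genotypes from the converted .pgen/.pvar/.psam trio; the
    # data attribute then stores repeat counts rather than allele indices
    genotypes = GenotypesPLINKTR.load("output.pgen")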
diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 3ee442f2..fb39109d 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1,7 +1,6 @@ from __future__ import annotations import re import gc -import importlib.util from csv import reader from pathlib import Path from typing import Iterator @@ -819,7 +818,7 @@ class GenotypesTR(Genotypes): log: Logger See documentation for :py:attr:`~.Genotypes.log` vcftype: str - TR vcf type currently being read. + TR vcf type currently being read. Options are {'auto', 'gangstr', 'advntr', 'hipstr', 'eh', 'popstr'} """ @@ -939,19 +938,17 @@ class GenotypesPLINK(GenotypesVCF): data : npt.NDArray See documentation for :py:attr:`~.GenotypesVCF.data` samples : tuple - See documentation for :py:attr:`~.GenotypesVCF.data` + See documentation for :py:attr:`~.GenotypesVCF.samples` variants : np.array - See documentation for :py:attr:`~.GenotypesVCF.data` + See documentation for :py:attr:`~.GenotypesVCF.variants` log: Logger - See documentation for :py:attr:`~.GenotypesVCF.data` + See documentation for :py:attr:`~.GenotypesVCF.log` chunk_size: int, optional The max number of variants to fetch from and write to the PGEN file at any given time If this value is provided, variants from the PGEN file will be loaded in chunks so as to use less memory - _prephased: bool - See documentation for :py:attr:`~.GenotypesVCF.data` Examples -------- @@ -1555,6 +1552,29 @@ def write(self): class GenotypesPLINKTR(GenotypesPLINK): + """ + A class for processing repeat genotypes from a PLINK ``.pgen`` file + + Attributes + ---------- + data : npt.NDArray + See documentation for :py:attr:`~.GenotypesPLINK.data` + samples : tuple + See documentation for :py:attr:`~.GenotypesPLINK.samples` + variants : np.array + See documentation for :py:attr:`~.GenotypesPLINK.variants` + log: Logger + See documentation for :py:attr:`~.GenotypesPLINK.log` + chunk_size: int, optional + See documentation for :py:attr:`~.GenotypesPLINK.chunk_size` + vcftype: str, optional + See documentation for :py:attr:`~.GenotypesTR.vcftype` + + Examples + -------- + >>> genotypes = GenotypesPLINKTR.load('tests/data/simple-tr.pgen') + """ + def __init__(self, fname: Path | str, log: Logger = None, chunk_size: int = None , vcftype: str = "auto"): super().__init__(fname, log, chunk_size) self.vcftype = vcftype @@ -1582,11 +1602,9 @@ def load( samples : list[str], optional See documentation for :py:meth:`~.Genotypes.read` variants : set[str], optional - See documentation for :py:meth:`~.Genotypes.read + See documentation for :py:meth:`~.Genotypes.read` vcftype : str, optional - The type of TR VCF currently being read.
Options are: - {'auto', 'gangstr', 'advntr', 'hipstr', 'eh', 'popstr'} -` + See documentation for :py:meth:`~.GenotypesTR.vcftype` Returns ------- @@ -1723,5 +1741,3 @@ def check_biallelic(self): def check_maf(self): raise NotImplementedError - - From 834968993d9abcc1f35cd58dc6c079e26310bfc3 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 15 Aug 2023 08:55:46 -0700 Subject: [PATCH 30/40] add support for TR PGENs in clump and sim_phenotype --- haptools/clump.py | 11 ++++++----- haptools/sim_phenotype.py | 8 +++++++- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/haptools/clump.py b/haptools/clump.py index bf5d5d48..55fe74ce 100644 --- a/haptools/clump.py +++ b/haptools/clump.py @@ -1,13 +1,12 @@ #!/usr/bin/env python # To test: ./clumpSTR.py --summstats-snps tests/eur_gwas_pvalue_chr19.LDL.glm.linear --clump-snp-field ID --clump-field p-value --clump-chrom-field CHROM --clump-pos-field position --clump-p1 0.2 --out test.clump -import sys import math from logging import Logger, getLogger import numpy as np -from .data import Genotypes, GenotypesVCF, GenotypesTR +from .data import Genotypes, GenotypesVCF, GenotypesTR, GenotypesPLINKTR class Variant: @@ -557,15 +556,17 @@ def clumpstr( strgts = None gts = None if gts_snps: + log.debug("Loading SNP Genotypes.") if str(gts_snps).endswith("pgen"): - log.debug("Loading SNP Genotypes.") snpgts = GenotypesPLINK.load(gts_snps) else: - log.debug("Loading SNP Genotypes.") snpgts = GenotypesVCF.load(gts_snps) if gts_strs: log.debug("Loading STR Genotypes.") - strgts = GenotypesTR.load(gts_strs) + if str(gts_strs).endswith("pgen"): + strgts = GenotypesPLINKTR.load(gts_strs) + else: + strgts = GenotypesTR.load(gts_strs) if gts_snps and gts_strs: log.debug("Calculating set of overlapping samples between STRs and SNPs.") diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py index 1ed53ab8..df6909ec 100644 --- a/haptools/sim_phenotype.py +++ b/haptools/sim_phenotype.py @@ -14,6 +14,7 @@ Haplotypes, GenotypesTR, GenotypesPLINK, + GenotypesPLINKTR, Repeat as RepeatBase, Haplotype as HaplotypeBase, ) @@ -435,7 +436,12 @@ def simulate_pt( if repeats: log.info("Merging with TR genotypes") - tr_gt = GenotypesTR(fname=repeats, log=log) + if repeats.suffix == ".pgen": + log.info("Loading repeat genotypes from PGEN file") + tr_gt = GenotypesPLINKTR(fname=repeats, log=log, chunk_size=chunk_size) + else: + log.info("Loading repeat genotypes from VCF/BCF file") + tr_gt = GenotypesTR(fname=repeats, log=log) tr_gt.read(region=region, samples=samples, variants=haplotype_ids) tr_gt.check_missing() gt = Genotypes.merge_variants((gt, tr_gt), fname=None) From ec294bc94b1b17bc3265626477a0e32270f8e853 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 15 Aug 2023 08:56:52 -0700 Subject: [PATCH 31/40] add support for multiallelic PGENs in sim_genotype --- haptools/sim_genotype.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/haptools/sim_genotype.py b/haptools/sim_genotype.py index d17f6dfd..dcac3c59 100644 --- a/haptools/sim_genotype.py +++ b/haptools/sim_genotype.py @@ -111,10 +111,6 @@ def output_vcf( else: vcf.read(region=f"{region['chr']}:{region['start']}-{region['end']}") - if variant_file.endswith(".pgen"): - # the pgenlib library collapses multiallelic variants into a single allele - vcf.check_biallelic() - log.debug(f"Read in variants from {variant_file}") sample_dict = {} From 02cbc727235a37dc4efa9c247b7d7952c65b0b46 Mon Sep 17 00:00:00 
2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 31 Aug 2023 10:09:07 -0700 Subject: [PATCH 32/40] try and fail to speed up GenotypesPLINKTR.read() --- haptools/data/genotypes.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index fb39109d..cee1189e 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1666,8 +1666,8 @@ def read( super().read(region,samples,variants,max_variants) num_variants = len(self.variants) # initialize a jagged array of allele lengths - max_num_alleles = max(map(len, self.variants["alleles"])) - allele_lens = np.empty((len(self.variants), max_num_alleles), dtype=self.data.dtype) + num_alleles = np.fromiter(map(len, self.variants["alleles"]), dtype=np.uint8, count=num_variants) + allele_lens = np.empty((num_variants, num_alleles.max()+1), dtype=self.data.dtype) # iterate through each TR and extract the REF and ALT allele lengths for idx, record in enumerate(self._iter_TRRecords(region, variants)): if idx > num_variants: @@ -1675,16 +1675,16 @@ def read( break # extract allele lengths from TRRecord object allele_lens[idx, 0] = record.ref_allele_length - num_alleles = len(record.alt_allele_lengths) + 1 - allele_lens[idx, 1:num_alleles] = record.alt_allele_lengths + num_alleles_rec = num_alleles[idx] + allele_lens[idx, 1:num_alleles_rec] = record.alt_allele_lengths + allele_lens[idx, num_alleles_rec] = np.iinfo(np.uint8).max # record missing entries and then set them all to REF missing = self.data[:, :, :2] == np.iinfo(np.uint8).max - self.data[:, :, :2][missing] = 0 + breakpoint() + np.putmask(self.data[:, :, :2], missing, num_alleles) # convert from genotype indices to allele lengths variant_coords = np.arange(num_variants)[:, np.newaxis] self.data[:, :, :2] = allele_lens[variant_coords, self.data[:, :, :2]] - # restore missing entries - self.data[:, :, :2][missing] = np.iinfo(np.uint8).max # clean up memory del missing del allele_lens From b8f2c4577e338077c306e23300214188b2c8028a Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 31 Aug 2023 10:09:17 -0700 Subject: [PATCH 33/40] Revert "try and fail to speed up GenotypesPLINKTR.read()" This reverts commit 02cbc727235a37dc4efa9c247b7d7952c65b0b46. 
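For reference, the conversion logic that this revert restores boils down to a single fancy-indexing step. A minimal, self-contained sketch of that step, using toy arrays in place of haptools' real data (``np.iinfo(np.uint8).max`` is the missing-genotype sentinel, as in the patches above):

.. code-block:: python

    import numpy as np

    sentinel = np.iinfo(np.uint8).max  # 255 marks a missing genotype
    # row i holds the allele lengths for variant i: [REF, ALT1, ALT2]
    allele_lens = np.array([[3, 6, 9], [2, 4, 6]], dtype=np.uint8)
    # genotype indices, shaped (samples, variants, 2)
    data = np.array(
        [[[0, 1], [1, 0]], [[2, sentinel], [0, 2]]], dtype=np.uint8
    )

    # record missing entries, then point them at REF so indexing stays in bounds
    missing = data == sentinel
    data[missing] = 0
    # broadcast variant indices so each variant looks up its own row of lengths
    variant_coords = np.arange(allele_lens.shape[0])[:, np.newaxis]
    data = allele_lens[variant_coords, data]
    # restore the missing sentinel
    data[missing] = sentinel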
--- haptools/data/genotypes.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index cee1189e..fb39109d 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1666,8 +1666,8 @@ def read( super().read(region,samples,variants,max_variants) num_variants = len(self.variants) # initialize a jagged array of allele lengths - num_alleles = np.fromiter(map(len, self.variants["alleles"]), dtype=np.uint8, count=num_variants) - allele_lens = np.empty((num_variants, num_alleles.max()+1), dtype=self.data.dtype) + max_num_alleles = max(map(len, self.variants["alleles"])) + allele_lens = np.empty((len(self.variants), max_num_alleles), dtype=self.data.dtype) # iterate through each TR and extract the REF and ALT allele lengths for idx, record in enumerate(self._iter_TRRecords(region, variants)): if idx > num_variants: @@ -1675,16 +1675,16 @@ def read( break # extract allele lengths from TRRecord object allele_lens[idx, 0] = record.ref_allele_length - num_alleles_rec = num_alleles[idx] - allele_lens[idx, 1:num_alleles_rec] = record.alt_allele_lengths - allele_lens[idx, num_alleles_rec] = np.iinfo(np.uint8).max + num_alleles = len(record.alt_allele_lengths) + 1 + allele_lens[idx, 1:num_alleles] = record.alt_allele_lengths # record missing entries and then set them all to REF missing = self.data[:, :, :2] == np.iinfo(np.uint8).max - breakpoint() - np.putmask(self.data[:, :, :2], missing, num_alleles) + self.data[:, :, :2][missing] = 0 # convert from genotype indices to allele lengths variant_coords = np.arange(num_variants)[:, np.newaxis] self.data[:, :, :2] = allele_lens[variant_coords, self.data[:, :, :2]] + # restore missing entries + self.data[:, :, :2][missing] = np.iinfo(np.uint8).max # clean up memory del missing del allele_lens From 394f5ceb83761a8fa8efe622bfda88cf3a45bd69 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 15 Sep 2023 05:33:10 -0700 Subject: [PATCH 34/40] remove gxx requirement now that pgenlib is published with wheels --- docs/project_info/contributing.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/project_info/contributing.rst b/docs/project_info/contributing.rst index 5b413a41..1b278815 100644 --- a/docs/project_info/contributing.rst +++ b/docs/project_info/contributing.rst @@ -70,7 +70,7 @@ Follow these steps to set up a development environment. .. code-block:: bash - conda create -n haptools-dev -c conda-forge 'poetry>=1.1.15' 'python=3.7' 'gxx_linux-64' + conda create -n haptools-dev -c conda-forge 'poetry>=1.1.15' 'python=3.7' 2. Activate the environment .. 
code-block:: bash From d97c7575cd92b7c8627e6bae21845b7c973f1e28 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 15 Sep 2023 05:33:13 -0700 Subject: [PATCH 35/40] refmt with black --- haptools/data/genotypes.py | 46 ++++++++++++++++++++++++++------------ tests/test_data.py | 27 +++++++++++++++------- 2 files changed, 51 insertions(+), 22 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index fb39109d..cff52ec8 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1241,12 +1241,12 @@ def read( """ super(Genotypes, self).read() import pgenlib - + sample_idxs = self.read_samples(samples) pv = pgenlib.PvarReader(bytes(str(self.fname.with_suffix(".pvar")), "utf8")) - + with pgenlib.PgenReader( - bytes(str(self.fname), "utf8"), sample_subset=sample_idxs, pvar = pv + bytes(str(self.fname), "utf8"), sample_subset=sample_idxs, pvar=pv ) as pgen: # how many variants to load? if variants is not None: @@ -1403,7 +1403,7 @@ def __iter__( sample_idxs = self.read_samples(samples) pgen = pgenlib.PgenReader( - bytes(str(self.fname), "utf8"), sample_subset=sample_idxs, pvar = pv + bytes(str(self.fname), "utf8"), sample_subset=sample_idxs, pvar=pv ) # call another function to force the lines above to be run immediately # see https://stackoverflow.com/a/36726497 @@ -1470,7 +1470,7 @@ def _num_unique_alleles(self, arr: npt.NDArray): """ nrows = arr.shape[0] row_coords = np.arange(nrows)[:, np.newaxis, np.newaxis] - allele_cts = np.zeros((nrows, np.iinfo(np.uint8).max+1, 2), dtype=np.bool_) + allele_cts = np.zeros((nrows, np.iinfo(np.uint8).max + 1, 2), dtype=np.bool_) # mark whichever allele indices appear in arr then sum them to obtain counts allele_cts[row_coords, arr, np.arange(2)] = 1 allele_cts = allele_cts.any(axis=2).sum(axis=1, dtype=np.uint32) @@ -1503,7 +1503,7 @@ def write(self): filename=bytes(str(self.fname), "utf8"), sample_ct=len(self.samples), variant_ct=len(self.variants), - allele_ct_limit = pv.get_max_allele_ct(), + allele_ct_limit=pv.get_max_allele_ct(), nonref_flags=False, hardcall_phase_present=True, ) as pgen: @@ -1537,13 +1537,17 @@ def write(self): # finally, append the genotypes to the PGEN file if self._prephased or self.data.shape[2] < 3: pgen.append_alleles_batch( - subset_data, all_phased=True, allele_cts=allele_cts, + subset_data, + all_phased=True, + allele_cts=allele_cts, ) else: # TODO: figure out why this sometimes leads to a corrupted file? 
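# note for the block below: subset_phase pulls the per-call phase flags
# (column 2 of self.data) and transposes them to variant-major order; the
# C-contiguous copy appears to be required because pgenlib consumes one
# contiguous row of phase flags per variant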
subset_phase = self.data[:, start:end, 2].T.copy(order="C") pgen.append_partially_phased_batch( - subset_data, subset_phase, allele_cts=allele_cts, + subset_data, + subset_phase, + allele_cts=allele_cts, ) del subset_phase del subset_data @@ -1575,7 +1579,13 @@ class GenotypesPLINKTR(GenotypesPLINK): >>> genotypes = GenotypesPLINK.load('tests/data/simple.pgen') """ - def __init__(self, fname: Path | str, log: Logger = None, chunk_size: int = None , vcftype: str = "auto"): + def __init__( + self, + fname: Path | str, + log: Logger = None, + chunk_size: int = None, + vcftype: str = "auto", + ): super().__init__(fname, log, chunk_size) self.vcftype = vcftype @@ -1615,7 +1625,7 @@ def load( genotypes.read(region, samples, variants) genotypes.check_phase() return genotypes - + def _iter_TRRecords(self, region: str = None, variants: set[str] = None): """ Yield TRRecord objects from the PVAR file @@ -1634,7 +1644,10 @@ def _iter_TRRecords(self, region: str = None, variants: set[str] = None): """ vcf = VCF(self.fname.with_suffix(".pvar")) tr_records = trh.TRRecordHarmonizer( - vcffile=vcf, vcfiter=vcf(region), region=region, vcftype=self.vcftype, + vcffile=vcf, + vcfiter=vcf(region), + region=region, + vcftype=self.vcftype, ) # filter out TRs that we didn't want if variants is not None: @@ -1663,11 +1676,13 @@ def read( max_variants : int, optional See documentation for :py:attr:`~.GenotypesVCF.read` """ - super().read(region,samples,variants,max_variants) + super().read(region, samples, variants, max_variants) num_variants = len(self.variants) # initialize a jagged array of allele lengths max_num_alleles = max(map(len, self.variants["alleles"])) - allele_lens = np.empty((len(self.variants), max_num_alleles), dtype=self.data.dtype) + allele_lens = np.empty( + (len(self.variants), max_num_alleles), dtype=self.data.dtype + ) # iterate through each TR and extract the REF and ALT allele lengths for idx, record in enumerate(self._iter_TRRecords(region, variants)): if idx > num_variants: @@ -1720,7 +1735,10 @@ def _iterate( variants = super()._iterate(pgen, region, variants) for variant, record in zip(variants, tr_records): # extract the REF and ALT allele lengths - allele_lens = np.array([record.ref_allele_length, *record.alt_allele_lengths], dtype=variant.data.dtype) + allele_lens = np.array( + [record.ref_allele_length, *record.alt_allele_lengths], + dtype=variant.data.dtype, + ) # record missing entries and then set them all to REF missing = variant.data[:, :2] == np.iinfo(np.uint8).max variant.data[:, :2][missing] = 0 diff --git a/tests/test_data.py b/tests/test_data.py index 4b7394fc..02184949 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -367,7 +367,14 @@ def _get_fake_genotypes_multiallelic_tr(self): tuple("GTT" * i for i in range(1, 10)), ("ACACAC", "AC"), ("AAA", "AAAA"), - ("GTGT", "GTGTGT", "GTGTGTGT", "GTGTGTGTGTGTGT", "GTGTGTGTGTGTGTGTGTGTGT", "GT"), + ( + "GTGT", + "GTGTGT", + "GTGTGTGT", + "GTGTGTGTGTGTGT", + "GTGTGTGTGTGTGTGTGTGTGT", + "GT", + ), ("GGG", "GGGGG"), ] gts.variants = np.array( @@ -471,7 +478,9 @@ def test_iter_multiallelic_tr(self): np.testing.assert_allclose(line.data[:, :3], expected.data[:, idx]) for col in ("chrom", "pos", "id"): assert line.variants[col] == expected.variants[col][idx] - assert line.variants["alleles"].tolist() == expected.variants["alleles"][idx] + assert ( + line.variants["alleles"].tolist() == expected.variants["alleles"][idx] + ) # Check samples assert gts.samples == expected.samples @@ -614,7 +623,7 @@ def 
test_write_multiallelic(self): gts_multiallelic.fname = fname gts_multiallelic.write() - #Read the data from the newly created PLINK file + # Read the data from the newly created PLINK file new_gts = GenotypesPLINK(fname) new_gts.read() new_gts.check_phase() @@ -626,7 +635,7 @@ def test_write_multiallelic(self): for col in ("chrom", "pos", "id", "alleles"): assert gts_multiallelic.variants[col][i] == new_gts.variants[col][i] - #clean the files created after the test + # clean the files created after the test fname.with_suffix(".psam").unlink() fname.with_suffix(".pvar").unlink() fname.unlink() @@ -641,7 +650,7 @@ def test_write_multiallelic_tr(self): gts_multiallelic.fname = fname gts_multiallelic.write() - #Read the data from the newly created PLINK file + # Read the data from the newly created PLINK file new_gts = GenotypesPLINK(fname) new_gts.read() @@ -652,7 +661,7 @@ def test_write_multiallelic_tr(self): for col in ("chrom", "pos", "id", "alleles"): assert gts_multiallelic.variants[col][i] == new_gts.variants[col][i] - #clean the files created after the test + # clean the files created after the test fname.with_suffix(".psam").unlink() fname.with_suffix(".pvar").unlink() fname.unlink() @@ -693,7 +702,7 @@ def _get_fake_genotypes_multiallelic(self): # to see if we can change chrX representations to properly simulate from it return gts - + def test_iter(self): # Get the expected data expected = self._get_fake_genotypes_multiallelic() @@ -703,7 +712,9 @@ def test_iter(self): np.testing.assert_allclose(line.data[:, :3], expected.data[:, idx]) for col in ("chrom", "pos", "id"): assert line.variants[col] == expected.variants[col][idx] - assert line.variants["alleles"].tolist() == expected.variants["alleles"][idx] + assert ( + line.variants["alleles"].tolist() == expected.variants["alleles"][idx] + ) # Check samples assert gts.samples == expected.samples From b6bf7ecfe96dca4d494546c877b6e047e6af80ab Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 15 Sep 2023 13:09:41 -0700 Subject: [PATCH 36/40] do not copy alleles from list to tuple --- haptools/data/genotypes.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index cff52ec8..7f327c66 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1072,15 +1072,13 @@ def _variant_arr( A row from the :py:attr:`~.GenotypesPLINK.variants` array """ # Parse the REF and ALT alleles from the PVAR record - ref_allele = record[cid["REF"]] - alt_alleles = record[cid["ALT"]].split(",") - alleles = [ref_allele] + alt_alleles + alleles = (record[cid["REF"]], *record[cid["ALT"]].split(",")) return np.array( ( record[cid["ID"]], record[cid["CHROM"]], record[cid["POS"]], - tuple(alleles), + alleles, ), dtype=self.variants.dtype, ) From 1edea76c0363bc44f218c7be6e5560176da6675a Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 15 Sep 2023 14:27:16 -0700 Subject: [PATCH 37/40] handle gts data arrays with dtype np.bool_ when counting uniq vals --- haptools/data/genotypes.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 7f327c66..a72b3bab 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1469,8 +1469,11 @@ def _num_unique_alleles(self, arr: npt.NDArray): nrows = arr.shape[0] row_coords = np.arange(nrows)[:, np.newaxis, np.newaxis] allele_cts = np.zeros((nrows, 
np.iinfo(np.uint8).max + 1, 2), dtype=np.bool_) + # ensure the arr values are interpreted as uint8's + if arr.dtype != np.uint8: + arr = arr.view("uint8") # mark whichever allele indices appear in arr then sum them to obtain counts - allele_cts[row_coords, arr, np.arange(2)] = 1 + allele_cts[row_coords, arr] = 1 allele_cts = allele_cts.any(axis=2).sum(axis=1, dtype=np.uint32) # there are always at least two alleles allele_cts[allele_cts < 2] = 2 From e13d6f24320bb65c499a8a9a0604dc016eff2dfd Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 16 Sep 2023 08:51:27 -0700 Subject: [PATCH 38/40] test multiallelic pgen reading/writing --- tests/data/simple-tr.pgen | Bin 54 -> 56 bytes tests/data/simple-tr.pvar | 2 +- tests/data/simple-tr.vcf | 4 ++-- tests/test_data.py | 2 ++ 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/data/simple-tr.pgen b/tests/data/simple-tr.pgen index b852ca9b9cebe6b0dbbf9d8f85a263f156012b66..da6d0141b9e938e6c16e5be01c3bb9d533ef5afe 100644 GIT binary patch delta 36 rcmXrBm>?<7#lpzRw33ORU(q?WT$o`g12Y4|JZ1()MrLCHrgO{yZ-NEU delta 34 pcmcC8n; ##contig= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00096 HG00097 HG00099 HG00100 HG00101 -1 10114 1:10114:GTT GTT GTTGTT,GTTGTTGTT,GTTGTTGTTGTT,GTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTTGTTGTT . . START=10114;END=10999;PERIOD=3 GT 0|1 2|3 4|5 6|7 8|8 -1 11000 1:10116:ACACAC ACACAC AC . . START=11000;END=11016;PERIOD=2 GT 0|1 0|1 1|1 1|1 0|0 +1 10114 1:10114:GTT GTT GTTGTT,GTTGTTGTT,GTTGTTGTTGTT,GTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTTGTT,GTTGTTGTTGTTGTTGTTGTTGTTGTT . . START=10114;END=10999;PERIOD=3 GT 0|1 2/3 4/5 6|7 8|8 +1 11000 1:10116:ACACAC ACACAC AC . . START=11000;END=11016;PERIOD=2 GT 0|1 0/1 1/1 1|1 0|0 1 12117 1:10117:AAA AAA AAAA . . START=12117;END=12143;PERIOD=1 GT 0|0 0|0 0|0 0|0 0|0 1 13122 1:10122:GTGT GTGT GTGTGT,GTGTGTGT,GTGTGTGTGTGTGT,GTGTGTGTGTGTGTGTGTGTGT,GT . . START=13122;END=13150;PERIOD=2 GT 4|4 . 3|0 1|2 4|. X 20122 1:20122:GGG GGG GGGGG . . START=20122;END=20150;PERIOD=1 GT 1 . 0 . . 
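Since ``_num_unique_alleles`` has now been touched by two patches, a simplified standalone sketch of its presence-mask trick may help. This collapses the per-haplotype axis that the real method keeps, and uses toy values:

.. code-block:: python

    import numpy as np

    # genotype indices shaped (variants, samples * 2), dtype uint8
    arr = np.array([[0, 1, 0, 0], [2, 2, 2, 2]], dtype=np.uint8)
    nrows = arr.shape[0]
    row_coords = np.arange(nrows)[:, np.newaxis]
    # mark which allele indices occur anywhere in each variant's genotypes
    present = np.zeros((nrows, np.iinfo(np.uint8).max + 1), dtype=np.bool_)
    present[row_coords, arr] = True
    allele_cts = present.sum(axis=1, dtype=np.uint32)
    allele_cts[allele_cts < 2] = 2  # a PGEN variant always has >= 2 alleles
    print(allele_cts)  # [2 2]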
diff --git a/tests/test_data.py b/tests/test_data.py index 02184949..e35dccc6 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -354,6 +354,8 @@ def _get_fake_genotypes_multiallelic_tr(self): phasing = np.ones(data.shape[:2] + (1,)).astype(np.uint8) phasing[4, [1, 3, 4]] = 0 phasing[3, [1, 4]] = 0 + phasing[:2, 1:3] = 0 + phasing[1, 2] = 1 # since homozygous unphased variants are marked as phased gts.data = np.concatenate((data, phasing), axis=-1).transpose((1, 0, 2)) # handle half-calls and chrX according to the flag: # --vcf-half-call m From 7aab3c04bcca2bc88cbfe6810bef538284be1f2c Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 16 Sep 2023 08:51:49 -0700 Subject: [PATCH 39/40] make sure we can write bool gts too --- tests/test_data.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/test_data.py b/tests/test_data.py index e35dccc6..f88cf09e 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -615,6 +615,32 @@ def test_write_genotypes_unphased(self): fname.with_suffix(".pvar").unlink() fname.unlink() + def test_write_genotypes_biallelic(self): + gts = self._get_fake_genotypes_plink() + + # convert to bool - we should be able to handle both + gts.data = gts.data.astype(np.bool_) + + fname = DATADIR / "test_write.pgen" + gts.fname = fname + gts.write() + + new_gts = GenotypesPLINK(fname) + new_gts.read() + new_gts.check_phase() + + # check that everything matches what we expected + np.testing.assert_allclose(gts.data, new_gts.data) + assert gts.samples == new_gts.samples + for i in range(len(new_gts.variants)): + for col in ("chrom", "pos", "id", "alleles"): + assert gts.variants[col][i] == new_gts.variants[col][i] + + # clean up afterwards: delete the files we created + fname.with_suffix(".psam").unlink() + fname.with_suffix(".pvar").unlink() + fname.unlink() + def test_write_multiallelic(self): # Create fake multi-allelic genotype data to write to the PLINK file gts_multiallelic = self._get_fake_genotypes_multiallelic() From f399d8e59d1cee119bbbca615090de48274733bd Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 16 Sep 2023 08:53:10 -0700 Subject: [PATCH 40/40] refmt with black --- tests/test_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_data.py b/tests/test_data.py index f88cf09e..07ec5b02 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -355,7 +355,7 @@ def _get_fake_genotypes_multiallelic_tr(self): phasing[4, [1, 3, 4]] = 0 phasing[3, [1, 4]] = 0 phasing[:2, 1:3] = 0 - phasing[1, 2] = 1 # since homozygous unphased variants are marked as phased + phasing[1, 2] = 1 # since homozygous unphased variants are marked as phased gts.data = np.concatenate((data, phasing), axis=-1).transpose((1, 0, 2)) # handle half-calls and chrX according to the flag: # --vcf-half-call m
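As a closing note on the phasing fixes in the last few patches: the third column of ``data`` is a per-call phase flag, and homozygous calls keep a flag of 1 even when written unphased, since phase only carries information for heterozygous genotypes. A toy illustration of the layout these tests construct (hypothetical values):

.. code-block:: python

    import numpy as np

    # one sample, three variants: a phased het, an unphased het, and an
    # unphased hom; columns 0-1 are alleles, column 2 is the phase flag
    data = np.array([[[0, 1], [2, 3], [1, 1]]], dtype=np.uint8)
    phasing = np.array([[[1], [0], [1]]], dtype=np.uint8)  # hom 1/1 -> 1
    gts = np.concatenate((data, phasing), axis=-1)
    assert gts.shape == (1, 3, 3)  # (samples, variants, alleles + phase)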