Merge pull request #48 from 4dn-dcic/0.3.1

0.3.1
4dn-dcic · Oct 24, 2017 · f251e4c · f251e4c
2 parents 862d807 + 4b61c3d
commit f251e4c
Show file tree

Hide file tree

Showing 13 changed files with 181 additions and 55 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -4,6 +4,8 @@ compiler: gcc
 python:
 - '3.6'
 - '2.7'
+before_install:
+- sudo apt-get install -qq valgrind
 script:
 - make
 - |
@@ -17,3 +19,4 @@ script:
   python test/test.py
   fi
 - source test/test_c.sh
+- valgrind --error-exitcode=42 --leak-check=full test/test_c.sh
diff --git a/README.md b/README.md
@@ -181,6 +181,12 @@ By default '|' is used to split the two genomic regions, but in some cases, a di
 pairix -W textfile.gz
 ```
 
+#### Print out number of bgzf blocks that span each chromosome pair.
+This command prints out the number of bgzf blocks for all chromosome pairs.
+```
+pairix -B textfile.gz
+```
+
 
 <br>
 
@@ -452,6 +458,10 @@ print (tb.get_header())
 # get chromsize
 tb=pypairix.open("textfile.gz")
 print (tb.get_chromsize())
+
+# get the number of bgzf blocks that span a given chromosome pair
+tb=pypairix.open("textfile.gz")
+print (tb.bgzf_block_count("chr1", "chr2"))
 ```
 
 <br>
@@ -619,6 +629,10 @@ ulimit -n 2000
 
 ## Version history
 
+### 0.3.1
+* `pairix -B` option is now available to print out the number of bgzf blocks for each chromosome (pair).
+* The same function is available for pypairix.
+
 ### 0.3.0
 * The problem with `fragment_4dnpairs.pl` of adding an extra column is now fixed.
 * 1D querying on 2D data now works with `pypairix` (function `querys2D`).

diff --git a/VERSION.txt b/VERSION.txt
@@ -1 +1 @@
-0.3.0
+0.3.1
diff --git a/samples/SRR1171591.variants.snp.vqsr.p.vcf.gz.px2 b/samples/SRR1171591.variants.snp.vqsr.p.vcf.gz.px2
diff --git a/samples/merged_nodup.tab.chrblock_sorted.txt.gz.px2 b/samples/merged_nodup.tab.chrblock_sorted.txt.gz.px2
diff --git a/samples/merged_nodups.space.chrblock_sorted.subsample1.txt.gz.px2 b/samples/merged_nodups.space.chrblock_sorted.subsample1.txt.gz.px2
diff --git a/src/bgzf.c b/src/bgzf.c
@@ -325,6 +325,27 @@ static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;}
 static void cache_block(BGZF *fp, int size) {}
 #endif
 
+/*
+ * Read the header of the BGZF block found by seeking to block_start_offset
+ * and return the block size recorded in that header.
+ *
+ * block_start_offset is handed to bgzf_seek() unchanged, so it must be in
+ * whatever form bgzf_seek() expects for SEEK_SET. The call repositions fp,
+ * so the caller must not rely on the previous read position afterwards.
+ *
+ * Returns:
+ *   > 0  the size stored in header bytes 16-17, plus one;
+ *     0  no data could be read at that position (fp->block_length is reset
+ *        to 0), or load_block_from_cache() reported a hit;
+ *    -1  the seek failed, or the header was truncated or invalid (in the
+ *        header case BGZF_ERR_HEADER is also set in fp->errcode).
+ */
+int bgzf_block_length(BGZF *fp, int64_t block_start_offset)
+{
+	uint8_t header[BLOCK_HEADER_LENGTH];
+	int count;
+	int64_t block_address;
+
+	/* Stop here on a failed seek rather than parse a header from an arbitrary position. */
+	if (bgzf_seek(fp, block_start_offset, SEEK_SET) < 0) return -1;
+	block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
+	/* NOTE(review): a cache hit yields 0, which callers cannot tell apart from
+	 * "no data"; confirm this is intended when the block cache is compiled in. */
+	if (load_block_from_cache(fp, block_address)) return 0;
+	count = _bgzf_read(fp->fp, header, sizeof(header));
+	if (count == 0) { // no data read
+		fp->block_length = 0;
+		return 0;
+	}
+	if (count != sizeof(header) || !check_header(header)) {
+		fp->errcode |= BGZF_ERR_HEADER;
+		return -1;
+	}
+	/* +1 because when writing this number, we used "-1" */
+	return unpackInt16(&header[16]) + 1;
+}
+
 int bgzf_read_block(BGZF *fp)
 {
 	uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block;

diff --git a/src/bgzf.h b/src/bgzf.h
@@ -186,6 +186,11 @@ extern "C" {
 	 */
 	int bgzf_read_block(BGZF *fp);
 
+        /**
+         * Returns the block length for a given block start offset.
+         *
+         * The file is repositioned to the given offset and the block header
+         * found there is read; the previous read position is not restored.
+         *
+         * @param fp                  BGZF file handler
+         * @param block_start_offset  offset of the block, passed as-is to
+         *                            bgzf_seek() with SEEK_SET
+         * @return  the block length taken from the block header; 0 if no
+         *          header was read; -1 if the header is incomplete or invalid
+         */
+        int bgzf_block_length(BGZF *fp, int64_t block_start_offset);
+
 #ifdef __cplusplus
 }
 #endif