From 03be4ef691e0a6d329d1c2cebcd6ddc9370f5511 Mon Sep 17 00:00:00 2001 From: Philipp Zumstein Date: Sat, 18 Mar 2017 11:03:10 +0100 Subject: [PATCH 1/6] Add new script hocr-cut for cutting pages This cuts a page (horizontally) into two pages in the middle such that most of the bounding boxes are separated nicely, e.g. cutting double pages or double columns. --- hocr-cut | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 hocr-cut diff --git a/hocr-cut b/hocr-cut new file mode 100644 index 0000000..d578815 --- /dev/null +++ b/hocr-cut @@ -0,0 +1,95 @@ +#!/usr/bin/env python + +from __future__ import print_function +import argparse +import os +import re +import sys + +from lxml import html +from PIL import Image, ImageDraw + + +def get_prop(node,name): + title = node.get('title') + if not title: return None + props = title.split(';') + for prop in props: + (key,args) = prop.split(None,1) + if key==name: return args + return None + +def get_bbox(node): + bbox = get_prop(node,'bbox') + if not bbox: return None + return tuple([int(x) for x in bbox.split()]) + + +parser = argparse.ArgumentParser(description='Cut a page (horizontally) into two pages in the middle such that the most of the bounding boxes are separated nicely, e.g. 
cutting double pages or double columns') +parser.add_argument('file', nargs='?', default=sys.stdin) +parser.add_argument('-d', '--debug', action="store_true") +args = parser.parse_args() + +doc = html.parse(args.file) + +pages = doc.xpath("//*[@class='ocr_page']") + +for page in pages: + + bbox = get_bbox(page) + middle = bbox[2]/2 + + left_ends = [] + right_starts = [] + for line in doc.xpath("//*[@class='ocr_line']"): + b = get_bbox(line) + if (b[0]>middle): + #print("in the right halve") + right_starts.append(b[0]) + elif (b[2]<middle): + #print("in the left halve") + left_ends.append(b[2]) + elif (b[2]-middle>middle-b[1]): + #print("in the right halve") + right_starts.append(b[0]) + else: + #print("in the left halve") + left_ends.append(b[2]) + + left_ends.sort() + right_starts.sort() + n = len(left_ends) + m = len(right_starts) + middle_left = left_ends[n//2] + middle_right = right_starts[m//2] + + middle = (middle_left+middle_right)/2 + print("Cutting at",middle) + + filename = get_prop(page,'image') + im = Image.open(filename) + + if filename[-4] == ".": + name = filename[:-3] + suffix = filename[-3:] + else: + name = filename + suffix = "" + + if (args.debug): + dr = ImageDraw.Draw(im) + dr.line((middle, 0, middle, im.size[1]), fill=0, width=3) + debug_output = name + "cut." + suffix + im.save(debug_output) + print("debug output is saved in", debug_output) + + left = im.crop((0, 0, middle, im.size[1])) + left_name = name + "left." + suffix + left.save(left_name) + print("left page is saved in", left_name) + right = im.crop((middle, 0, im.size[0], im.size[1])) + right_name = name + "right." 
+ suffix + right.save(right_name) + print("right page is saved in", right_name) + + From 3b9d343c20c156f43a0e2c841841a3e067c80376 Mon Sep 17 00:00:00 2001 From: Philipp Zumstein Date: Tue, 21 Mar 2017 20:25:53 +0100 Subject: [PATCH 2/6] [hocr-cut]: Handle case that image is not present --- hocr-cut | 79 ++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 48 insertions(+), 31 deletions(-) diff --git a/hocr-cut b/hocr-cut index d578815..e77a53a 100644 --- a/hocr-cut +++ b/hocr-cut @@ -36,6 +36,18 @@ pages = doc.xpath("//*[@class='ocr_page']") for page in pages: + try: + filename = get_prop(page,'image') + image = Image.open(filename) + debug_image = Image.open(filename) + dr = ImageDraw.Draw(debug_image) + image_found = True + except: + print("Warning: Image not found!") + args.debug = False + image_found = False + + bbox = get_bbox(page) middle = bbox[2]/2 @@ -44,17 +56,21 @@ for page in pages: for line in doc.xpath("//*[@class='ocr_line']"): b = get_bbox(line) if (b[0]>middle): - #print("in the right halve") - right_starts.append(b[0]) + pos="right" elif (b[2]<middle): - #print("in the left halve") - left_ends.append(b[2]) + pos="left" elif (b[2]-middle>middle-b[1]): - #print("in the right halve") + pos="right" + else: + pos="left" + if (pos=="right"): right_starts.append(b[0]) + if (args.debug): + dr.rectangle(b, fill=32) else: - #print("in the left halve") left_ends.append(b[2]) + if (args.debug): + dr.rectangle(b, fill=96) left_ends.sort() right_starts.sort() @@ -66,30 +82,31 @@ for page in pages: middle = (middle_left+middle_right)/2 print("Cutting at",middle) - filename = get_prop(page,'image') - im = Image.open(filename) - - if filename[-4] == ".": - name = filename[:-3] - suffix = filename[-3:] - else: - name = filename - suffix = "" - - if (args.debug): - dr = ImageDraw.Draw(im) - dr.line((middle, 0, middle, im.size[1]), fill=0, width=3) - debug_output = name + "cut." + suffix - im.save(debug_output) - print("debug output is saved in", debug_output) - - left = im.crop((0, 0, middle, im.size[1])) - left_name = name + "left." 
+ suffix - left.save(left_name) - print("left page is saved in", left_name) - right = im.crop((middle, 0, im.size[0], im.size[1])) - right_name = name + "right." + suffix - right.save(right_name) - print("right page is saved in", right_name) + + if (image_found): + + if filename[-4] == ".": + name = filename[:-3] + suffix = filename[-3:] + else: + name = filename + suffix = "" + + if (args.debug): + dr.line((middle_left, 0, middle_left, debug_image.size[1]), fill=64, width=3) + dr.line((middle_right, 0, middle_right, debug_image.size[1]), fill=64, width=3) + dr.line((middle, 0, middle, debug_image.size[1]), fill=128, width=5) + debug_output = name + "cut." + suffix + debug_image.save(debug_output) + print("debug output is saved in", debug_output) + + left = image.crop((0, 0, middle, image.size[1])) + left_name = name + "left." + suffix + left.save(left_name) + print("left page is saved in", left_name) + right = image.crop((middle, 0, image.size[0], image.size[1])) + right_name = name + "right." + suffix + right.save(right_name) + print("right page is saved in", right_name) From 48c93deb64c616bc1bd56202d2535cb0569d8a60 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Tue, 4 Sep 2018 22:00:23 +0200 Subject: [PATCH 3/6] hocr-cut: Fix whitespace issues Signed-off-by: Stefan Weil --- hocr-cut | 4 ---- 1 file changed, 4 deletions(-) diff --git a/hocr-cut b/hocr-cut index e77a53a..dec1f42 100644 --- a/hocr-cut +++ b/hocr-cut @@ -46,7 +46,6 @@ for page in pages: print("Warning: Image not found!") args.debug = False image_found = False - bbox = get_bbox(page) middle = bbox[2]/2 @@ -82,7 +81,6 @@ for page in pages: middle = (middle_left+middle_right)/2 print("Cutting at",middle) - if (image_found): if filename[-4] == ".": @@ -108,5 +106,3 @@ for page in pages: right_name = name + "right." 
+ suffix right.save(right_name) print("right page is saved in", right_name) - - From ac36c0ca2cc185cb22218d4e6075e12180a71cf1 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Tue, 4 Sep 2018 22:01:13 +0200 Subject: [PATCH 4/6] hocr-cut: Set executable mode for file Signed-off-by: Stefan Weil --- hocr-cut | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 hocr-cut diff --git a/hocr-cut b/hocr-cut old mode 100644 new mode 100755 From f70b28fa059d7b30b94671e594cdcd6cbd449568 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Wed, 5 Sep 2018 07:50:24 +0200 Subject: [PATCH 5/6] hocr-cut: Fix PEP8 style It was fixed using `yapf -i --style pep8 hocr-cut`. Signed-off-by: Stefan Weil --- hocr-cut | 63 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/hocr-cut b/hocr-cut index dec1f42..64338df 100755 --- a/hocr-cut +++ b/hocr-cut @@ -10,22 +10,26 @@ from lxml import html from PIL import Image, ImageDraw -def get_prop(node,name): +def get_prop(node, name): title = node.get('title') if not title: return None props = title.split(';') for prop in props: - (key,args) = prop.split(None,1) - if key==name: return args + (key, args) = prop.split(None, 1) + if key == name: return args return None + def get_bbox(node): - bbox = get_prop(node,'bbox') + bbox = get_prop(node, 'bbox') if not bbox: return None return tuple([int(x) for x in bbox.split()]) -parser = argparse.ArgumentParser(description='Cut a page (horizontally) into two pages in the middle such that the most of the bounding boxes are separated nicely, e.g. cutting double pages or double columns') +parser = argparse.ArgumentParser( + description= + 'Cut a page (horizontally) into two pages in the middle such that the most of the bounding boxes are separated nicely, e.g. 
cutting double pages or double columns' +) parser.add_argument('file', nargs='?', default=sys.stdin) parser.add_argument('-d', '--debug', action="store_true") args = parser.parse_args() @@ -37,7 +41,7 @@ pages = doc.xpath("//*[@class='ocr_page']") for page in pages: try: - filename = get_prop(page,'image') + filename = get_prop(page, 'image') image = Image.open(filename) debug_image = Image.open(filename) dr = ImageDraw.Draw(debug_image) @@ -48,24 +52,24 @@ for page in pages: image_found = False bbox = get_bbox(page) - middle = bbox[2]/2 + middle = bbox[2] / 2 left_ends = [] right_starts = [] for line in doc.xpath("//*[@class='ocr_line']"): b = get_bbox(line) - if (b[0]>middle): - pos="right" - elif (b[2]<middle): - pos="left" - elif (b[2]-middle>middle-b[1]): - pos="right" + if (b[0] > middle): + pos = "right" + elif (b[2] < middle): + pos = "left" + elif (b[2] - middle > middle - b[1]): + pos = "right" else: - pos="left" - if (pos=="right"): + pos = "left" + if (pos == "right"): right_starts.append(b[0]) if (args.debug): - dr.rectangle(b, fill=32) + dr.rectangle(b, fill=32) else: left_ends.append(b[2]) if (args.debug): @@ -75,11 +79,11 @@ for page in pages: right_starts.sort() n = len(left_ends) m = len(right_starts) - middle_left = left_ends[n//2] - middle_right = right_starts[m//2] + middle_left = left_ends[n // 2] + middle_right = right_starts[m // 2] - middle = (middle_left+middle_right)/2 - print("Cutting at",middle) + middle = (middle_left + middle_right) / 2 + print("Cutting at", middle) if (image_found): @@ -91,12 +95,19 @@ for page in pages: suffix = "" if (args.debug): - dr.line((middle_left, 0, middle_left, debug_image.size[1]), fill=64, width=3) - dr.line((middle_right, 0, middle_right, debug_image.size[1]), fill=64, width=3) - dr.line((middle, 0, middle, debug_image.size[1]), fill=128, width=5) - debug_output = name + "cut." 
+ suffix - debug_image.save(debug_output) - print("debug output is saved in", debug_output) + dr.line( + (middle_left, 0, middle_left, debug_image.size[1]), + fill=64, + width=3) + dr.line( + (middle_right, 0, middle_right, debug_image.size[1]), + fill=64, + width=3) + dr.line( + (middle, 0, middle, debug_image.size[1]), fill=128, width=5) + debug_output = name + "cut." + suffix + debug_image.save(debug_output) + print("debug output is saved in", debug_output) left = image.crop((0, 0, middle, image.size[1])) left_name = name + "left." + suffix From 3a91a354eb851ac1c941d3772c018dcd28c90eda Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Wed, 5 Sep 2018 11:27:25 +0200 Subject: [PATCH 6/6] hocr-cut: Strip "" from image name Tesseract uses image names enclosed in "" which must be stripped because otherwise opening the image will fail. Signed-off-by: Stefan Weil --- hocr-cut | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hocr-cut b/hocr-cut index 64338df..974fa7b 100755 --- a/hocr-cut +++ b/hocr-cut @@ -16,7 +16,7 @@ def get_prop(node, name): props = title.split(';') for prop in props: (key, args) = prop.split(None, 1) - if key == name: return args + if key == name: return args.strip('"') return None