Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new script hocr-cut for cutting a page #108

Merged
merged 6 commits into from
Sep 7, 2018
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 119 additions & 0 deletions hocr-cut
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#!/usr/bin/env python

from __future__ import print_function
import argparse
import os
import re
import sys

from lxml import html
from PIL import Image, ImageDraw


def get_prop(node, name):
title = node.get('title')
if not title: return None
props = title.split(';')
for prop in props:
(key, args) = prop.split(None, 1)
if key == name: return args.strip('"')
return None


def get_bbox(node):
bbox = get_prop(node, 'bbox')
if not bbox: return None
return tuple([int(x) for x in bbox.split()])


parser = argparse.ArgumentParser(
description=
'Cut a page (horizontally) into two pages in the middle such that the most of the bounding boxes are separated nicely, e.g. cutting double pages or double columns'
)
parser.add_argument('file', nargs='?', default=sys.stdin)
parser.add_argument('-d', '--debug', action="store_true")
args = parser.parse_args()

doc = html.parse(args.file)

pages = doc.xpath("//*[@class='ocr_page']")

for page in pages:

try:
filename = get_prop(page, 'image')
image = Image.open(filename)
debug_image = Image.open(filename)
dr = ImageDraw.Draw(debug_image)
image_found = True
except:
print("Warning: Image not found!")
args.debug = False
image_found = False

bbox = get_bbox(page)
middle = bbox[2] / 2

left_ends = []
right_starts = []
for line in doc.xpath("//*[@class='ocr_line']"):
b = get_bbox(line)
if (b[0] > middle):
pos = "right"
elif (b[2] < middle):
pos = "left"
elif (b[2] - middle > middle - b[1]):
pos = "right"
else:
pos = "left"
if (pos == "right"):
right_starts.append(b[0])
if (args.debug):
dr.rectangle(b, fill=32)
else:
left_ends.append(b[2])
if (args.debug):
dr.rectangle(b, fill=96)

left_ends.sort()
right_starts.sort()
n = len(left_ends)
m = len(right_starts)
middle_left = left_ends[n // 2]
middle_right = right_starts[m // 2]

middle = (middle_left + middle_right) / 2
print("Cutting at", middle)

if (image_found):

if filename[-4] == ".":
name = filename[:-3]
suffix = filename[-3:]
else:
name = filename
suffix = ""

if (args.debug):
dr.line(
(middle_left, 0, middle_left, debug_image.size[1]),
fill=64,
width=3)
dr.line(
(middle_right, 0, middle_right, debug_image.size[1]),
fill=64,
width=3)
dr.line(
(middle, 0, middle, debug_image.size[1]), fill=128, width=5)
debug_output = name + "cut." + suffix
debug_image.save(debug_output)
print("debug output is saved in", debug_output)

left = image.crop((0, 0, middle, image.size[1]))
left_name = name + "left." + suffix
left.save(left_name)
print("left page is saved in", left_name)
right = image.crop((middle, 0, image.size[0], image.size[1]))
right_name = name + "right." + suffix
right.save(right_name)
print("right page is saved in", right_name)