From 2c97aef26e90dfaa7231f146131a1c4a4cdb6c80 Mon Sep 17 00:00:00 2001
From: Philipp Zumstein <zuphilip@gmail.com>
Date: Sat, 25 Feb 2017 15:25:26 +0100
Subject: [PATCH 1/2] Add two additional options for hocr-wordfreq

This solves #101.
---
 hocr-wordfreq | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/hocr-wordfreq b/hocr-wordfreq
index fa2c804..c673577 100755
--- a/hocr-wordfreq
+++ b/hocr-wordfreq
@@ -9,6 +9,10 @@ from lxml import html
 parser = argparse.ArgumentParser(description='Calculate word frequency in an hOCR file')
 parser.add_argument('-i', '--case-insensitive', action='store_true',
                     default=False, help="ignore case")
+parser.add_argument('-s', '--spaces', action='store_true',
+                    default=False, help="split on spaces only")
+parser.add_argument('-y', '--dehyphonize', action='store_true',
+                    default=False, help="try to dehyphonize the text before analyis")
 parser.add_argument('-n', '--max', type=int, default=10,
                     help="number of hits (default: %(default)s)")
 parser.add_argument('hocr_in',
@@ -21,8 +25,18 @@ doc = html.parse(args.hocr_in)
 text = doc.find('//body').text_content().strip()
 if args.case_insensitive:
     text = text.lower()
+if args.dehyphonize:
+    #delete blank lines
+    text = re.sub(r"^\s*$\r?\n", "", text)
+    #dehyphonize
+    text = re.sub(r"-\r?\n", "", text)
+    #replace line breaks with a space
+    text = re.sub(r"\r?\n", " ", text)
 wc = {}
-for word in re.compile('\W+', re.UNICODE).split(text):
+separators = re.compile(r'\W+', re.UNICODE)  # default: split on runs of non-word characters
+if args.spaces:
+    separators = re.compile(r'\s+', re.UNICODE)  # --spaces: split on whitespace only
+for word in separators.split(text):
     if word == '': continue
     wc[word] = wc[word]+1 if word in wc else 1
 

From 84ba9088f8cd190a3b6a1c59bcecc4919a450c04 Mon Sep 17 00:00:00 2001
From: Philipp Zumstein <zuphilip@gmail.com>
Date: Tue, 21 Mar 2017 20:29:35 +0100
Subject: [PATCH 2/2] [hocr-wordfreq]: Rename option --dehyphonize to --dehyphenate

---
 hocr-wordfreq | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/hocr-wordfreq b/hocr-wordfreq
index c673577..542c8c2 100755
--- a/hocr-wordfreq
+++ b/hocr-wordfreq
@@ -11,8 +11,8 @@ parser.add_argument('-i', '--case-insensitive', action='store_true',
                     default=False, help="ignore case")
 parser.add_argument('-s', '--spaces', action='store_true',
                     default=False, help="split on spaces only")
-parser.add_argument('-y', '--dehyphonize', action='store_true',
-                    default=False, help="try to dehyphonize the text before analyis")
+parser.add_argument('-y', '--dehyphenate', action='store_true',
+                    default=False, help="try to dehyphenate the text before analysis")
 parser.add_argument('-n', '--max', type=int, default=10,
                     help="number of hits (default: %(default)s)")
 parser.add_argument('hocr_in',
@@ -25,10 +25,10 @@ doc = html.parse(args.hocr_in)
 text = doc.find('//body').text_content().strip()
 if args.case_insensitive:
     text = text.lower()
-if args.dehyphonize:
+if args.dehyphenate:
     #delete blank lines
-    text = re.sub(r"^\s*$\r?\n", "", text)
-    #dehyphonize
+    text = re.sub(r"^\s*$\r?\n", "", text, flags=re.MULTILINE)  # ^/$ must anchor at every line
+    #dehyphenate
     text = re.sub(r"-\r?\n", "", text)
     #replace line breaks with a space
     text = re.sub(r"\r?\n", " ", text)