ocropus · zuphilip · Mar 29, 2017 · Feb 25, 2017 · Mar 21, 2017
diff --git a/hocr-wordfreq b/hocr-wordfreq
@@ -9,6 +9,10 @@ from lxml import html
 parser = argparse.ArgumentParser(description='Calculate word frequency in an hOCR file')
 parser.add_argument('-i', '--case-insensitive', action='store_true',
                     default=False, help="ignore case")
+parser.add_argument('-s', '--spaces', action='store_true',
+                    default=False, help="split on spaces only")
+parser.add_argument('-y', '--dehyphenate', action='store_true',
+                    default=False, help="try to dehyphenate the text before analysis")
 parser.add_argument('-n', '--max', type=int, default=10,
                     help="number of hits (default: %(default)s)")
 parser.add_argument('hocr_in',
@@ -21,8 +25,18 @@ doc = html.parse(args.hocr_in)
 text = doc.find('//body').text_content().strip()
 if args.case_insensitive:
     text = text.lower()
+if args.dehyphenate:
+    # delete blank lines; MULTILINE lets ^ and $ anchor at every line, not only at the ends of the text
+    text = re.sub(r"^\s*$\r?\n", "", text, flags=re.MULTILINE)
+    # dehyphenate: rejoin a word that was split by a hyphen at the end of a line
+    text = re.sub(r"-\r?\n", "", text)
+    # replace each remaining line break with a space
+    text = re.sub(r"\r?\n", " ", text)
 wc = {}
-for word in re.compile('\W+', re.UNICODE).split(text):
+separators = re.compile(r'\W+', re.UNICODE)
+if args.spaces:
+    separators = re.compile(r'\s+', re.UNICODE)
+for word in separators.split(text):
     if word == '': continue
     wc[word] = wc[word]+1 if word in wc else 1