From 2c97aef26e90dfaa7231f146131a1c4a4cdb6c80 Mon Sep 17 00:00:00 2001 From: Philipp Zumstein Date: Sat, 25 Feb 2017 15:25:26 +0100 Subject: [PATCH 1/2] Add two additional options for hocr-wordfreq This solves #101. --- hocr-wordfreq | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/hocr-wordfreq b/hocr-wordfreq index fa2c804..c673577 100755 --- a/hocr-wordfreq +++ b/hocr-wordfreq @@ -9,6 +9,10 @@ from lxml import html parser = argparse.ArgumentParser(description='Calculate word frequency in an hOCR file') parser.add_argument('-i', '--case-insensitive', action='store_true', default=False, help="ignore case") +parser.add_argument('-s', '--spaces', action='store_true', + default=False, help="split on spaces only") +parser.add_argument('-y', '--dehyphonize', action='store_true', + default=False, help="try to dehyphonize the text before analyis") parser.add_argument('-n', '--max', type=int, default=10, help="number of hits (default: %(default)s)") parser.add_argument('hocr_in', @@ -21,8 +25,18 @@ doc = html.parse(args.hocr_in) text = doc.find('//body').text_content().strip() if args.case_insensitive: text = text.lower() +if args.dehyphonize: + #delete blank lines + text = re.sub(r"^\s*$\r?\n", "", text) + #dehyphonize + text = re.sub(r"-\r?\n", "", text) + #replace line breaks with a space + text = re.sub(r"\r?\n", " ", text) wc = {} -for word in re.compile('\W+', re.UNICODE).split(text): +separators = re.compile('\W+', re.UNICODE) +if args.spaces: + separators = re.compile('\s+', re.UNICODE) +for word in separators.split(text): if word == '': continue wc[word] = wc[word]+1 if word in wc else 1 From 84ba9088f8cd190a3b6a1c59bcecc4919a450c04 Mon Sep 17 00:00:00 2001 From: Philipp Zumstein Date: Tue, 21 Mar 2017 20:29:35 +0100 Subject: [PATCH 2/2] [hocr-wordfreq]: Change option name into dehyphenate --- hocr-wordfreq | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hocr-wordfreq b/hocr-wordfreq index 
c673577..542c8c2 100755 --- a/hocr-wordfreq +++ b/hocr-wordfreq @@ -11,8 +11,8 @@ parser.add_argument('-i', '--case-insensitive', action='store_true', default=False, help="ignore case") parser.add_argument('-s', '--spaces', action='store_true', default=False, help="split on spaces only") -parser.add_argument('-y', '--dehyphonize', action='store_true', - default=False, help="try to dehyphonize the text before analyis") +parser.add_argument('-y', '--dehyphenate', action='store_true', + default=False, help="try to dehyphenate the text before analysis") parser.add_argument('-n', '--max', type=int, default=10, help="number of hits (default: %(default)s)") parser.add_argument('hocr_in', @@ -25,10 +25,10 @@ doc = html.parse(args.hocr_in) text = doc.find('//body').text_content().strip() if args.case_insensitive: text = text.lower() -if args.dehyphonize: +if args.dehyphenate: #delete blank lines text = re.sub(r"^\s*$\r?\n", "", text) - #dehyphonize + #dehyphenate text = re.sub(r"-\r?\n", "", text) #replace line breaks with a space text = re.sub(r"\r?\n", " ", text)