Skip to content

Commit

Permalink
Add new default Tesseract OCR backend
Browse files Browse the repository at this point in the history
This new backend uses a command call to avoid
Tesseract bug 1670
(tesseract-ocr/tesseract#1670).

Signed-off-by: Roberto Rosario <[email protected]>
  • Loading branch information
siloraptor committed Apr 27, 2019
1 parent e5aa455 commit 32cf0a0
Show file tree
Hide file tree
Showing 5 changed files with 131 additions and 1 deletion.
3 changes: 3 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,9 @@
* Remove app top level star imports.
* Monkeypatch group and user models to make their fields
translatable.
* Add a new Tesseract OCR backend and make it the default to
avoid Tesseract bug 1670
(https://github.com/tesseract-ocr/tesseract/issues/1670)

3.1.11 (2019-04-XX)
===================
Expand Down
3 changes: 3 additions & 0 deletions docs/releases/3.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,9 @@ Other changes
* Remove app top level star imports.
* Monkeypatch group and user models to make their fields
translatable.
* Add a new Tesseract OCR backend and make it the default to
avoid Tesseract bug 1670
(https://github.com/tesseract-ocr/tesseract/issues/1670)

Removals
--------
Expand Down
4 changes: 4 additions & 0 deletions mayan/apps/ocr/backends/literals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from __future__ import absolute_import, unicode_literals

# Location of the Tesseract executable.
DEFAULT_TESSERACT_BINARY_PATH = '/usr/bin/tesseract'
# Timeout expressed in seconds: 10 minutes.
DEFAULT_TESSERACT_TIMEOUT = 10 * 60
119 changes: 119 additions & 0 deletions mayan/apps/ocr/backends/tesseract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
from __future__ import absolute_import, unicode_literals

import logging
import shutil

import sh
import yaml
try:
from yaml import CSafeLoader as SafeLoader
except ImportError:
from yaml import SafeLoader

from django.utils.encoding import force_text
from django.utils.translation import ugettext_lazy as _

from mayan.apps.storage.utils import TemporaryFile

from ..classes import OCRBackendBase
from ..exceptions import OCRError
from ..settings import setting_ocr_backend_arguments

from .literals import DEFAULT_TESSERACT_BINARY_PATH, DEFAULT_TESSERACT_TIMEOUT

logger = logging.getLogger(__name__)


class Tesseract(OCRBackendBase):
    """
    OCR backend that runs the Tesseract command line binary through `sh`.

    Optional backend arguments are read as YAML from
    `setting_ocr_backend_arguments`:

    * `tesseract_path`: path of the Tesseract binary
      (default: DEFAULT_TESSERACT_BINARY_PATH).
    * `timeout`: value passed as the `_timeout` of each OCR call
      (default: DEFAULT_TESSERACT_TIMEOUT).
    """

    def __init__(self, *args, **kwargs):
        """
        Load the backend arguments, locate the Tesseract binary and query
        it for its version and its list of installed languages.

        Raises OCRError if the Tesseract binary cannot be found.
        """
        super(Tesseract, self).__init__(*args, **kwargs)
        self.languages = ()

        # Fall back to an empty mapping when the setting is empty or when
        # the YAML document parses to nothing (yaml.load returns None).
        backend_arguments = yaml.load(
            Loader=SafeLoader,
            stream=setting_ocr_backend_arguments.value or '{}',
        ) or {}

        tesseract_binary_path = backend_arguments.get(
            'tesseract_path', DEFAULT_TESSERACT_BINARY_PATH
        )
        self.command_timeout = backend_arguments.get(
            'timeout', DEFAULT_TESSERACT_TIMEOUT
        )

        try:
            self.command_tesseract = sh.Command(path=tesseract_binary_path)
        except sh.CommandNotFound:
            # `sh` raises CommandNotFound when the binary cannot be
            # resolved; report it as an OCR error.
            self.command_tesseract = None
            raise OCRError(
                _('Tesseract not found.')
            )

        # Get version
        result = self.command_tesseract(v=True)
        logger.debug('Tesseract version: %s', result.stdout)

        # Get languages
        result = self.command_tesseract(list_langs=True)
        # Sample output format
        # List of available languages (3):
        # deu
        # eng
        # osd
        # <- empty line

        # Extraction: strip last line, split by newline, discard the first
        # line
        self.languages = force_text(result.stdout).strip().split('\n')[1:]

        logger.debug('Available languages: %s', ', '.join(self.languages))

    def execute(self, *args, **kwargs):
        """
        Execute the command line binary of tesseract

        The page image obtained from `self.converter.get_page()` is copied
        to a temporary file which is fed to Tesseract through its standard
        input. Returns the text Tesseract writes to its standard output.

        Raises OCRError if the Tesseract call fails for any reason; the
        message also says so when the requested language is not among the
        installed ones.
        """
        super(Tesseract, self).execute(*args, **kwargs)

        if not self.command_tesseract:
            return None

        image = self.converter.get_page()

        # Create the temporary file before entering the `try` so that the
        # `finally` clause never refers to an unbound name.
        temporary_image_file = TemporaryFile()
        try:
            shutil.copyfileobj(image, temporary_image_file)
            temporary_image_file.seek(0)

            # Positional arguments for the input image and the output
            # base; '-' is used for both, the image itself is supplied
            # via `_in`.
            arguments = ['-', '-']

            keyword_arguments = {
                '_in': temporary_image_file,
                '_timeout': self.command_timeout
            }

            if self.language:
                keyword_arguments['l'] = self.language

            try:
                result = self.command_tesseract(
                    *arguments, **keyword_arguments
                )
                return force_text(result.stdout)
            except Exception as exception:
                error_message = (
                    'Exception calling Tesseract with language option: {}; {}'
                ).format(self.language, exception)

                if self.language not in self.languages:
                    error_message = (
                        '{}\nThe requested OCR language "{}" is not '
                        'available and needs to be installed.\n'
                    ).format(
                        error_message, self.language
                    )

                logger.error(error_message)
                raise OCRError(error_message)
        finally:
            temporary_image_file.close()
3 changes: 2 additions & 1 deletion mayan/apps/ocr/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
namespace = Namespace(label=_('OCR'), name='ocr')

setting_ocr_backend = namespace.add_setting(
global_name='OCR_BACKEND', default='mayan.apps.ocr.backends.pyocr.PyOCR',
global_name='OCR_BACKEND',
default='mayan.apps.ocr.backends.tesseract.Tesseract',
help_text=_('Full path to the backend to be used to do OCR.')
)
setting_ocr_backend_arguments = namespace.add_setting(
Expand Down

0 comments on commit 32cf0a0

Please sign in to comment.