-
-
Notifications
You must be signed in to change notification settings - Fork 457
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add new default Tesseract OCR backend
This new backend uses a command call to avoid Tesseract bug 1670 (tesseract-ocr/tesseract#1670). Signed-off-by: Roberto Rosario <[email protected]>
- Loading branch information
1 parent
e5aa455
commit 32cf0a0
Showing
5 changed files
with
131 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
from __future__ import absolute_import, unicode_literals | ||
|
||
# Path of the Tesseract executable used when the backend arguments do not
# provide a 'tesseract_path' entry.
DEFAULT_TESSERACT_BINARY_PATH = '/usr/bin/tesseract' | ||
# Timeout, in seconds, used for Tesseract command calls when the backend
# arguments do not provide a 'timeout' entry.
DEFAULT_TESSERACT_TIMEOUT = 600 # 600 seconds, 10 minutes |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
from __future__ import absolute_import, unicode_literals | ||
|
||
import logging | ||
import shutil | ||
|
||
import sh | ||
import yaml | ||
try: | ||
from yaml import CSafeLoader as SafeLoader | ||
except ImportError: | ||
from yaml import SafeLoader | ||
|
||
from django.utils.encoding import force_text | ||
from django.utils.translation import ugettext_lazy as _ | ||
|
||
from mayan.apps.storage.utils import TemporaryFile | ||
|
||
from ..classes import OCRBackendBase | ||
from ..exceptions import OCRError | ||
from ..settings import setting_ocr_backend_arguments | ||
|
||
from .literals import DEFAULT_TESSERACT_BINARY_PATH, DEFAULT_TESSERACT_TIMEOUT | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class Tesseract(OCRBackendBase):
    """
    OCR backend that shells out to the Tesseract command line binary.

    The binary is invoked through the ``sh`` library instead of a Python
    binding. The page image is streamed to the process via standard input
    and the recognized text is read back from standard output.

    Backend arguments (YAML mapping read from
    ``setting_ocr_backend_arguments``):

    * ``tesseract_path``: path of the Tesseract executable. Defaults to
      ``DEFAULT_TESSERACT_BINARY_PATH``.
    * ``timeout``: value passed to ``sh`` as ``_timeout`` for the OCR call.
      Defaults to ``DEFAULT_TESSERACT_TIMEOUT``.
    """
    def __init__(self, *args, **kwargs):
        """
        Locate the Tesseract binary and query its installed languages.

        Raises:
            OCRError: if the Tesseract binary cannot be found.
        """
        super(Tesseract, self).__init__(*args, **kwargs)
        self.languages = ()

        # The trailing ``or {}`` covers settings that parse to an empty
        # YAML document (for example ``~``), where ``yaml.load`` returns
        # None and ``.get()`` below would otherwise fail.
        backend_arguments = yaml.load(
            Loader=SafeLoader,
            stream=setting_ocr_backend_arguments.value or '{}',
        ) or {}

        tesseract_binary_path = backend_arguments.get(
            'tesseract_path', DEFAULT_TESSERACT_BINARY_PATH
        )
        self.command_timeout = backend_arguments.get(
            'timeout', DEFAULT_TESSERACT_TIMEOUT
        )

        try:
            self.command_tesseract = sh.Command(path=tesseract_binary_path)
        except sh.CommandNotFound:
            # ``sh`` raises CommandNotFound when the path does not resolve
            # to an executable; translate it into the app's own error type.
            self.command_tesseract = None
            raise OCRError(
                _('Tesseract not found.')
            )

        # Get version
        result = self.command_tesseract(v=True)
        logger.debug('Tesseract version: %s', result.stdout)

        # Get languages
        result = self.command_tesseract(list_langs=True)
        # Sample output format
        # List of available languages (3):
        # deu
        # eng
        # osd
        # <- empty line

        # Extraction: strip surrounding whitespace, split into lines,
        # discard the first (header) line and ignore blank entries.
        output_lines = force_text(result.stdout).strip().splitlines()
        self.languages = [
            line.strip() for line in output_lines[1:] if line.strip()
        ]

        logger.debug('Available languages: %s', ', '.join(self.languages))

    def execute(self, *args, **kwargs):
        """
        Execute the command line binary of tesseract

        The page image obtained from ``self.converter.get_page()`` is copied
        into a temporary file which is fed to Tesseract on standard input.
        ``self.language``, when set, is passed as the ``-l`` option.
        NOTE(review): ``self.converter`` and ``self.language`` are presumably
        prepared by the parent class ``execute()`` -- confirm against
        ``OCRBackendBase``.

        Returns:
            The text written by Tesseract to standard output, or None when
            no Tesseract command is available.

        Raises:
            OCRError: if the Tesseract call fails for any reason.
        """
        super(Tesseract, self).execute(*args, **kwargs)

        if not self.command_tesseract:
            return None

        image = self.converter.get_page()

        # Create the temporary file before entering the ``try`` block so
        # the ``finally`` clause never references an unbound name.
        temporary_image_file = TemporaryFile()
        try:
            shutil.copyfileobj(image, temporary_image_file)
            temporary_image_file.seek(0)

            # '-' '-' tells Tesseract to read the image from standard input
            # and write the result to standard output.
            arguments = ['-', '-']

            keyword_arguments = {
                '_in': temporary_image_file,
                '_timeout': self.command_timeout
            }

            if self.language:
                keyword_arguments['l'] = self.language

            try:
                result = self.command_tesseract(
                    *arguments, **keyword_arguments
                )
                return force_text(result.stdout)
            except Exception as exception:
                error_message = (
                    'Exception calling Tesseract with language option: {}; {}'
                ).format(self.language, exception)

                # Only hint at a missing language pack when a language was
                # actually requested.
                if self.language and self.language not in self.languages:
                    error_message = (
                        '{}\nThe requested OCR language "{}" is not '
                        'available and needs to be installed.\n'
                    ).format(
                        error_message, self.language
                    )

                logger.error(error_message)
                raise OCRError(error_message)
        finally:
            temporary_image_file.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters