-
Notifications
You must be signed in to change notification settings - Fork 186
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #309 from stefan6419846/pip_package
Migrate Python code to a dedicated package Signed-off-by: Stefan Weil <[email protected]>
- Loading branch information
Showing
12 changed files
with
954 additions
and
869 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
*.py[cod] | ||
*$py.class | ||
|
||
# Distribution / packaging | ||
.Python | ||
build/ | ||
develop-eggs/ | ||
dist/ | ||
downloads/ | ||
eggs/ | ||
.eggs/ | ||
lib/ | ||
lib64/ | ||
parts/ | ||
sdist/ | ||
var/ | ||
wheels/ | ||
share/python-wheels/ | ||
*.egg-info/ | ||
.installed.cfg | ||
*.egg | ||
MANIFEST | ||
|
||
# Installer logs | ||
pip-log.txt | ||
pip-delete-this-directory.txt | ||
|
||
# Unit test / coverage reports | ||
htmlcov/ | ||
.tox/ | ||
.nox/ | ||
.coverage | ||
.coverage.* | ||
.cache | ||
nosetests.xml | ||
coverage.xml | ||
*.cover | ||
*.py,cover | ||
.hypothesis/ | ||
.pytest_cache/ | ||
cover/ | ||
|
||
# Environments | ||
.env | ||
.venv | ||
env/ | ||
venv/ | ||
ENV/ | ||
env.bak/ | ||
venv.bak/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# tesstrain.py | ||
|
||
Utilities for working with Tesseract >= 4 using artificial training data. | ||
|
||
## Install | ||
|
||
This package requires the Tesseract training tools to be available on your system. | ||
|
||
To install the package with pip, either use `pip install tesstrain` (to get a published package) or `pip install .` (from a source checkout). | ||
Python 3.7 or newer is required to run it. | ||
|
||
## Running | ||
|
||
* Use the terminal interface to directly interact with the tools: `python -m tesstrain --help`. | ||
* Call it from your own code using the high-level interface `tesstrain.run()`. | ||
|
||
## License | ||
|
||
Software is provided under the terms of the `Apache 2.0` license. | ||
|
||
Sample training data provided by [Deutsches Textarchiv](https://deutschestextarchiv.de) is [in the public domain](http://creativecommons.org/publicdomain/mark/1.0/). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
[metadata] | ||
version = attr: tesstrain.__version__ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
from pathlib import Path | ||
|
||
import setuptools | ||
|
||
|
||
# Directory that contains this setup script; used to locate the README.
ROOT_DIRECTORY = Path(__file__).parent.resolve()

# The README doubles as the long description of the distribution.
_readme_text = (ROOT_DIRECTORY / 'README.md').read_text(encoding='utf-8')

# Minor versions of Python 3 that get their own trove classifier.
_python_minor_versions = range(7, 12)

_classifiers = [
    'Development Status :: 5 - Production/Stable',
    'Intended Audience :: Developers',
    'Topic :: Scientific/Engineering :: Image Recognition',
    'License :: OSI Approved :: Apache Software License',
    'Programming Language :: Python :: 3',
    'Programming Language :: Python :: 3 :: Only',
]
_classifiers.extend(
    f'Programming Language :: Python :: 3.{minor}'
    for minor in _python_minor_versions
)

# The version is not passed here; setup.cfg supplies it.
setuptools.setup(
    # Identification and descriptive metadata.
    name='tesstrain',
    author='Tesseract contributors',
    license='Apache Software License 2.0',
    url='https://github.com/tesseract-ocr/tesstrain',
    description='Training utils for Tesseract',
    long_description=_readme_text,
    long_description_content_type='text/markdown',
    keywords='Tesseract,tesseract-ocr,OCR,optical character recognition',
    classifiers=_classifiers,

    # Package contents and runtime requirements.
    packages=setuptools.find_packages(),
    python_requires='>=3.7',
    install_requires=[
        'tqdm',
    ],

    # No console scripts are registered.
    entry_points={
        'console_scripts': [
        ],
    },
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# (C) Copyright 2014, Google Inc. | ||
# (C) Copyright 2018, James R Barlow | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
from tesstrain.wrapper import run | ||
|
||
__version__ = '0.1' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
# (C) Copyright 2014, Google Inc. | ||
# (C) Copyright 2018, James R Barlow | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
# This script provides an easy way to execute various phases of training | ||
# Tesseract. For a detailed description of the phases, see | ||
# https://tesseract-ocr.github.io/tessdoc/Training-Tesseract.html. | ||
|
||
import logging | ||
|
||
from tesstrain.arguments import ( | ||
get_argument_parser, | ||
TrainingArguments, | ||
verify_parameters_and_handle_defaults | ||
) | ||
from tesstrain.generate import cleanup | ||
from tesstrain.wrapper import run_from_context | ||
|
||
|
||
log = logging.getLogger() | ||
|
||
|
||
def setup_logging_console():
    """Attach an INFO-level console handler to the root logger.

    The root logger itself is set to DEBUG so that each handler can apply
    its own threshold. Console records are rendered as
    ``[HH:MM:SS] LEVEL - message``.
    """
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(
        logging.Formatter(
            fmt="[%(asctime)s] %(levelname)s - %(message)s",
            datefmt="%H:%M:%S",
        )
    )
    stream_handler.setLevel(logging.INFO)

    log.setLevel(logging.DEBUG)
    log.addHandler(stream_handler)
|
||
|
||
def setup_logging_logfile(logfile):
    """Attach a DEBUG-level, UTF-8 encoded file handler to the root logger.

    Args:
        logfile: Path of the file the log records are written to.

    Returns:
        The created ``logging.FileHandler``, so that the caller can remove
        it from the logger and close it later.
    """
    file_handler = logging.FileHandler(logfile, encoding='utf-8')
    file_handler.setFormatter(
        logging.Formatter(
            "[%(asctime)s] - %(levelname)s - %(name)s - %(message)s"
        )
    )
    file_handler.setLevel(logging.DEBUG)
    log.addHandler(file_handler)
    return file_handler
|
||
|
||
def parse_flags(argv=None):
    """Parse command-line flags into a verified training context.

    Args:
        argv: Argument list handed to the parser's ``parse_args`` as
            ``args``; ``None`` is passed through unchanged.

    Returns:
        Whatever ``verify_parameters_and_handle_defaults`` returns for the
        populated ``TrainingArguments`` namespace.
    """
    arguments = TrainingArguments()
    # Logged before parsing, i.e. this records the initial state only.
    log.debug(arguments)
    # Parse directly into the namespace object created above.
    get_argument_parser().parse_args(args=argv, namespace=arguments)
    return verify_parameters_and_handle_defaults(arguments)
|
||
|
||
def main():
    """Run the training pipeline from the command line.

    Sets up console logging, parses the command-line flags, adds a log
    file handler for ``ctx.log_file``, runs the training via
    ``run_from_context`` and finally calls ``cleanup``.

    Returns:
        ``0`` on success. Errors raised by the training run propagate to
        the caller.
    """
    setup_logging_console()
    ctx = parse_flags()
    logfile = setup_logging_logfile(ctx.log_file)

    try:
        run_from_context(ctx)
    finally:
        # Always detach and close the file handler, even when the run
        # fails, so the log file is flushed and its descriptor released.
        log.removeHandler(logfile)
        logfile.close()

    # As before, cleanup only happens after a successful run.
    cleanup(ctx)
    log.info("All done!")
    return 0
|
||
|
||
if __name__ == '__main__':
    # Hand the return code of main() to the interpreter as the process
    # exit status instead of silently discarding it.
    raise SystemExit(main())
Oops, something went wrong.