Skip to content

Commit

Permalink
Merge pull request #309 from stefan6419846/pip_package
Browse files Browse the repository at this point in the history
Migrate Python code to a dedicated package

Signed-off-by: Stefan Weil <[email protected]>
  • Loading branch information
stweil authored Jan 8, 2023
2 parents 0ce3ae9 + 345379f commit f4103bd
Show file tree
Hide file tree
Showing 12 changed files with 954 additions and 869 deletions.
52 changes: 52 additions & 0 deletions src/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
21 changes: 21 additions & 0 deletions src/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# tesstrain.py

Utilities for working with Tesseract >= 4 using artificial training data.

## Install

This package requires the Tesseract training tools to be available on your system.

To install the package, run either `pip install tesstrain` (to install a released package) or `pip install .` (from a source checkout).
Python 3.7 or newer is required.

## Running

* Use the terminal interface to directly interact with the tools: `python -m tesstrain --help`.
* Call it from your own code using the high-level interface `tesstrain.run()`.

## License

This software is provided under the terms of the Apache 2.0 license.

Sample training data provided by [Deutsches Textarchiv](https://deutschestextarchiv.de) is [in the public domain](http://creativecommons.org/publicdomain/mark/1.0/).
2 changes: 2 additions & 0 deletions src/setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[metadata]
version = attr: tesstrain.__version__
41 changes: 41 additions & 0 deletions src/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from pathlib import Path

import setuptools


# Directory containing this setup script; the README is looked up next to it.
ROOT_DIRECTORY = Path(__file__).parent.resolve()

setuptools.setup(
    # Package identity.
    name='tesstrain',
    url='https://github.com/tesseract-ocr/tesstrain',
    author='Tesseract contributors',
    license='Apache Software License 2.0',
    keywords='Tesseract,tesseract-ocr,OCR,optical character recognition',

    # Short and long description; the long one is the content of README.md.
    description='Training utils for Tesseract',
    long_description=ROOT_DIRECTORY.joinpath('README.md').read_text(
        encoding='utf-8',
    ),
    long_description_content_type='text/markdown',

    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Intended Audience :: Developers',
        'Topic :: Scientific/Engineering :: Image Recognition',
        'License :: OSI Approved :: Apache Software License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3 :: Only',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
        'Programming Language :: Python :: 3.11',
    ],

    # Contents and requirements.
    packages=setuptools.find_packages(),
    python_requires='>=3.7',
    install_requires=['tqdm'],

    # No console scripts are registered.
    entry_points={'console_scripts': []},
)
15 changes: 15 additions & 0 deletions src/tesstrain/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# (C) Copyright 2014, Google Inc.
# (C) Copyright 2018, James R Barlow
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from tesstrain.wrapper import run

# Package version; setup.cfg reads it via ``attr: tesstrain.__version__``.
__version__ = '0.1'
76 changes: 76 additions & 0 deletions src/tesstrain/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# (C) Copyright 2014, Google Inc.
# (C) Copyright 2018, James R Barlow
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script provides an easy way to execute various phases of training
# Tesseract. For a detailed description of the phases, see
# https://tesseract-ocr.github.io/tessdoc/Training-Tesseract.html.

import logging

from tesstrain.arguments import (
get_argument_parser,
TrainingArguments,
verify_parameters_and_handle_defaults
)
from tesstrain.generate import cleanup
from tesstrain.wrapper import run_from_context


# Root logger; the handlers configured in this module are attached to it.
log = logging.getLogger()


def setup_logging_console():
    """Attach an INFO-level console handler to the root logger.

    The root logger level is set to DEBUG; the handler added here has its
    own level of INFO and prefixes each message with the time of day and
    the level name.
    """
    log.setLevel(logging.DEBUG)

    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    stream_handler.setFormatter(
        logging.Formatter(
            fmt="[%(asctime)s] %(levelname)s - %(message)s",
            datefmt="%H:%M:%S",
        )
    )
    log.addHandler(stream_handler)


def setup_logging_logfile(logfile):
    """Attach a DEBUG-level, UTF-8 file handler for *logfile* to the root logger.

    Returns the created handler so that the caller can later remove it
    from the logger and close it.
    """
    file_handler = logging.FileHandler(logfile, encoding='utf-8')
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(
        logging.Formatter(
            "[%(asctime)s] - %(levelname)s - %(name)s - %(message)s"
        )
    )
    log.addHandler(file_handler)
    return file_handler


def parse_flags(argv=None):
    """Parse the command-line flags into a verified training context.

    A fresh ``TrainingArguments`` object is logged at debug level and then
    used as the namespace that the argument parser fills in from *argv*.
    The result of ``verify_parameters_and_handle_defaults`` is returned.
    """
    training_args = TrainingArguments()
    log.debug(training_args)
    get_argument_parser().parse_args(args=argv, namespace=training_args)
    return verify_parameters_and_handle_defaults(training_args)


def main():
    """Run the training workflow from the command line.

    Sets up console logging, parses the command-line flags, adds a log file
    handler for ``ctx.log_file`` and runs the training via
    ``run_from_context``. On success, ``cleanup`` is called and ``0`` is
    returned. If the run raises, the exception propagates and ``cleanup``
    is not called, but the log file handler is still detached and closed.
    """
    setup_logging_console()
    ctx = parse_flags()
    logfile = setup_logging_logfile(ctx.log_file)

    try:
        run_from_context(ctx)
    finally:
        # Always detach and close the file handler, even when the run fails,
        # so the log file is flushed and its file descriptor is released.
        log.removeHandler(logfile)
        logfile.close()

    cleanup(ctx)
    log.info("All done!")
    return 0


if __name__ == '__main__':
    # Propagate the value returned by main() as the process exit status
    # instead of silently discarding it.
    raise SystemExit(main())
Loading

0 comments on commit f4103bd

Please sign in to comment.