Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update bulk image conversion script to support multiple formats #226

Open
wants to merge 2 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ repos:
- id: convert-images
name: Convert png to jpg in samples
entry: ./scripts/run_python_hook.sh scripts/hooks/convert_images_hook.py
args: ["--trigger-size", "150"]
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is intentional. We are okay with having png images in the codebase if they're below this size

args: ["--format", "jpg"]
files: ^.*\.(png|PNG)$
pass_filenames: true
stages: [commit]
Expand Down
6 changes: 5 additions & 1 deletion scripts/hooks/convert_images_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
import argparse
import os

import argparse
import os
from scripts.local.utils.bulk_ops_common import convert_image, convert_pdf_to_jpg

from scripts.utils.image_utils import convert_image, get_size_in_kb, get_size_reduction
Comment on lines +10 to 12
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • The convert_pdf_to_jpg import wouldn't be needed for this hook.
  • And about convert_image let's update the existing function in image_utils itself
  • I think you should test this hook properly as well by adding some sample png images in local and trying to commit those files (after pre-commit install is done).
  • Let's also add docs to the function below



Expand Down Expand Up @@ -73,4 +77,4 @@ def parse_args():
exit(1)
else:
# print("All sample images are jpgs. Commit accepted.")
exit(0)
exit(0)
67 changes: 23 additions & 44 deletions scripts/local/convert_images.py
Original file line number Diff line number Diff line change
@@ -1,59 +1,38 @@
#!/usr/bin/env python3
import argparse
import os
from scripts.local.utils.bulk_ops_common import convert_image

from scripts.local.utils.bulk_ops_common import add_common_args, run_argparser


def convert_image_to():
# Wrapper to handle all available extensions
pass


def convert_images_in_tree(args):
input_directory = args.get("input", None)
recursive = args.get("recursive", None)
output_directory = args.get("output", None)
trigger_size = args.get("trigger_size", None)
def convert_images(filenames, output_format):
converted_count = 0
for image_path in filenames:
old_size = get_size_in_kb(image_path)
if old_size <= trigger_size:
continue

# Note: the pre-commit hook takes care of ensuring only image files are passed here.
new_image_path = convert_image(image_path)
new_size = get_size_in_kb(new_image_path)
if new_size <= old_size:
print(
f"Converted png to jpg: {image_path}: {new_size:.2f}KB {get_size_reduction(old_size, new_size)}"
)
for input_path in filenames:
if input_path.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.tif')):
name, ext = os.path.splitext(input_path)
output_path = f"{name}.{output_format.lower()}"
convert_image(input_path, output_path, output_format)
print(f"Converted {input_path} to {output_format}")
converted_count += 1
else:
print(
f"Skipping conversion for {image_path} as size is more than before ({new_size:.2f} KB > {old_size:.2f} KB)"
)
os.remove(new_image_path)

print(f"Skipping unsupported file: {input_path}")
return converted_count


def parse_args():
# construct the argument parse and parse the arguments
argparser = argparse.ArgumentParser()
add_common_args(argparser, ["--input", "--output", "--trigger-size", "--recursive"])
args = run_argparser(argparser)
parser = argparse.ArgumentParser(description="Convert images for pre-commit hook")
parser.add_argument(
"--format",
choices=['jpg', 'png', 'jpeg'],
default='jpg',
help="Output format for images (default: jpg)"
)
parser.add_argument("filenames", nargs="*", help="Files to convert.")
args = parser.parse_args()
return args


if __name__ == "__main__":
args = parse_args()

converted_count = convert_images_in_tree(args)
trigger_size = args["trigger_size"]
converted_count = convert_images(args.filenames, args.format.upper())
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here we can call the function bulk_apply_on_files(convert_image, input_dir, file_paths, in_place) or something similar based on our discussion (check other comments below)

if converted_count > 0:
print(
f"Note: {converted_count} png images above {trigger_size}KB were converted to jpg.\nPlease manually remove the png files and add your commit again."
)
print(f"Note: {converted_count} images were converted.")
exit(1)
else:
# print("All sample images are jpgs. Commit accepted.")
exit(0)
exit(0)
143 changes: 101 additions & 42 deletions scripts/local/utils/bulk_ops_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,43 +3,20 @@
import glob
import operator
import os

from PIL import Image
from pdf2image import convert_from_path
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

new packages in local scripts need to be added to requirements.dev.txt

from src.utils.file import PathUtils

# TODO: add shell utilities for simple local images processing such as:
# From issue: https://github.com/Udayraj123/OMRChecker/issues/213
# - bulk resize,
# - clip to max width (or height)
# - with a conditional trigger if the file size exceeds a provided value
# - bulk convert :
# - pdf to jpg
# - png to jpg or vice versa
# - tiff
# - bulk rename files
# - adding folder name to file name
# - removing non-utf characters from filename (to avoid processing errors)
# - add watermark to all images
# - blur a particular section of the images (e.g. student names and signatures)
# - create a gif from a folder of images
# - Save output of cropped pages to avoid cropping in each run (and merge with manually cropped images)
# - Save output of cropped markers to avoid cropping in each run (and merge with manually cropped images)

# Make sure to be cross-os compatible i.e. use Posix paths wherever possible


# Maybe have a common util file for bulk ops and then create one file for each of the above util.


# Usual pre-processing commands for speedups (useful during re-runs)
# python3 scripts/local/convert_images.py -i inputs/ --replace [--filter-ext png,jpg] --output-ext jpg
# python3 scripts/local/resize_images.py -i inputs/ -o outputs --max-width=1500
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's keep all these comments as a guide for the contributors to figure out what needs to be done here


# TODO: add shell utilities for bulk image processing, resizing, watermarking, etc.

def walk_and_extract_files(input_dir, file_extensions):
"""
Walks through the directory to extract files with specified extensions.
"""
extracted_files = []
for _dir, _subdir, _files in os.walk(input_dir):
matching_globs = [
glob(os.path.join(_dir, f"*.{file_extension}"))
glob.glob(os.path.join(_dir, f"*.{file_extension}"))
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's change the import to from glob import glob instead

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, thanks for adding the code comment

for file_extension in file_extensions
]
matching_files = functools.reduce(operator.iconcat, matching_globs, [])
Expand All @@ -49,8 +26,11 @@ def walk_and_extract_files(input_dir, file_extensions):


def get_local_argparser():
"""
Returns an argument parser with common input, output, and optional recursive processing flags.
"""
local_argparser = argparse.ArgumentParser()

local_argparser.add_argument(
"-i",
"--input",
Expand All @@ -72,29 +52,79 @@ def get_local_argparser():
local_argparser.add_argument(
"-r",
"--recursive",
required=True,
type=bool,
action='store_true',
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why should we have this default? It's better to take recursive updates as an explicit input

dest="recursive",
help="Specify whether to process subdirectories recursively",
)

local_argparser.add_argument(
"--trigger-size",
default=200,
required=True,
type=int,
dest="trigger_size",
help="Specify minimum file size to trigger the hook.",
help="Specify minimum file size (KB) to trigger the hook.",
)

return local_argparser


def convert_image(input_path, output_path, output_format):
"""
Converts an image to the specified output format.
"""
with Image.open(input_path) as img:
if output_format == 'JPG':
output_format = 'JPEG'
img.save(output_path, output_format)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's check the existing util in image_utils file. Merge the two functions as per the new output_formats support

def convert_image(image_path):
    with Image.open(image_path) as image:
        # Note: using hardcoded -4 as we're expected to receive .png or .PNG files only
        new_image_path = f"{image_path[:-4]}.jpg"
        if not image.mode == "RGB":
            image = image.convert("RGB")
        image.save(new_image_path, "JPEG", quality=90, optimize=True)

        return new_image_path

Please remove the existing image_path[:-4] logic too



def convert_pdf_to_jpg(input_path, output_dir):
"""
Converts a PDF to a series of JPG images, one per page.
"""
pages = convert_from_path(input_path)
for i, page in enumerate(pages):
output_path = os.path.join(output_dir, f"page_{i + 1}.jpg")
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have a feature extension idea here, see if you can implement -
Let's add a support to extract only the first page based on a flag (in that case the image's output_dir shouldn't be created and the single image should be output directly with same filename without the page_ prefix.

page.save(output_path, 'JPEG')
Comment on lines +85 to +88
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's also include file name here

Suggested change
pages = convert_from_path(input_path)
for i, page in enumerate(pages):
output_path = os.path.join(output_dir, f"page_{i + 1}.jpg")
page.save(output_path, 'JPEG')
pages = convert_from_path(input_path)
input_path = Path(PurePosixPath(input_path).as_posix())
file_name = input_path.name
for i, page in enumerate(pages):
output_path = os.path.join(output_dir, f"{file_name}_page_{i + 1}.jpg")
print(f"Saving page: {output_path}")
page.save(output_path, 'JPEG')

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also convert_image and convert_pdf_to_jpg should come from image_utils instead of bulk_ops_common.py.
I am thinking of keeping this file only for operations that are common across all sorts of bulk scripts (bulk resize, convert, watermark, etc)



def bulk_convert(input_dir, output_dir, output_format, in_place=False):
"""
Bulk converts images and PDFs to the specified format.
"""
os.makedirs(output_dir, exist_ok=True)
extensions = ['png', 'jpg', 'jpeg', 'tiff', 'tif', 'pdf']
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's keep pdf handling in a separate script (check other comments)

Suggested change
extensions = ['png', 'jpg', 'jpeg', 'tiff', 'tif', 'pdf']
extensions = ['png', 'jpg', 'jpeg', 'tiff', 'tif']


filepaths = walk_and_extract_files(input_dir, extensions)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

snake case

Suggested change
filepaths = walk_and_extract_files(input_dir, extensions)
file_paths = walk_and_extract_files(input_dir, extensions)


for input_path in filepaths:
relative_path = os.path.relpath(os.path.dirname(input_path), input_dir)
output_subdir = os.path.join(output_dir, relative_path) if not in_place else os.path.dirname(input_path)
os.makedirs(output_subdir, exist_ok=True)

Comment on lines +100 to +104
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The in_place flag handling can be a common bulk ops utility. I propose we define a function bulk_apply_on_files(func, input_dir, file_paths, in_place)

  • The first arg func can be convert_image or convert_pdf_to_image
  • The bulk_apply_on_files applies this passed function with the same arguments
    • if in_place is True and the conversion was successful, the input file gets removed (pdf or image) and the output files are added in the same directory
    • if in_place is False and the conversion was successful, the output files are added in a relative directory output_dir = f"{input_dir}/outputs/"
    • If the conversion was unsuccessful, no file changes happen

Let's discuss more about this in the discord group if needed.

filename = os.path.basename(input_path)
if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.tif')):
name, _ = os.path.splitext(filename)
output_path = os.path.join(output_subdir, f"{name}.{output_format.lower()}")
Comment on lines +107 to +108
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this shouldn't be needed here. The convert_image util already takes output_format as the input we can move this logic there.

convert_image(input_path, output_path, output_format)
print(f"Converted {filename} to {output_format}")
elif filename.lower().endswith('.pdf'):
pdf_output_dir = os.path.join(output_subdir, os.path.splitext(filename)[0])
os.makedirs(pdf_output_dir, exist_ok=True)
convert_pdf_to_jpg(input_path, pdf_output_dir)
Comment on lines +112 to +114
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move this logic to a separate script (see the other comment below)

  • Apart from making filename as the directory, we should have the filename in the output image as well(suggested in another comment)
  • In case of single-page PDFs:
    • A file image1.pdf would become image1/image1_page1.jpg, but a user may desire to extract the first page into image1.jpg (without the directory)
    • let's have a flag in the script that allows this e.g. --first-page-only

print(f"Converted {filename} to JPG")
else:
print(f"Skipping unsupported file: {filename}")
Comment on lines +91 to +117
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function looks bulky. Also the user is less likely to have both pdfs and other image formats together.
We want each script to do small dedicated tasks. We can split it into two use cases and user may run the two scripts separately when needed.

So we can break the function down this way:

  • Createscripts/local/convert_pdf_to_image.py : Only handles PDF conversions explicity
  • Updatescripts/local/convert_images.py: Handles all image conversions (png, jpg, tiff, etc)
  • Minor code duplication is okay within these two files
  • Only bulk operation related code can be kept in bulk_ops_common

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you want you can raise a separate PR for PDF handling altogether (for more credit in terms of # of PRs)



def add_common_args(argparser, arguments):
"""
Adds arguments from the local argparser to the main argument parser.
"""
local_argparser = get_local_argparser()
for argument in arguments:
for action in local_argparser._actions:
if argument in action.option_strings:
# Copy the argument from local_argparser to argparser
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's not remove this comment, it is to explain what the next line is doing

argparser.add_argument(
*action.option_strings,
dest=action.dest,
Expand All @@ -107,15 +137,44 @@ def add_common_args(argparser, arguments):


def run_argparser(argparser):
(
args,
unknown,
) = argparser.parse_known_args()

"""
Runs the argument parser and returns parsed arguments.
"""
args, unknown = argparser.parse_known_args()
args = vars(args)

if len(unknown) > 0:
if unknown:
argparser.print_help()
raise Exception(f"\nError: Unknown arguments: {unknown}")

return args


def main():
"""
Main entry point for the script. Handles argument parsing and starts the bulk conversion process.
"""
parser = argparse.ArgumentParser(description="Bulk image and PDF converter")
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function should be present in local/convert_images.py instead. The bulk_ops_common.py is for utilities across all bulk converters


# Add standard arguments
add_common_args(parser, ['-i', '--input', '-o', '--output', '--recursive', '--trigger-size'])

parser.add_argument(
"--format",
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's make this a bit more specific

Suggested change
"--format",
"--out-format",

choices=['jpg', 'png', 'jpeg'],
default='jpg',
help="Output format for images (default: jpg)"
)
parser.add_argument(
"--in-place",
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"--in-place",
"--inplace",

action='store_true',
help="Modify files in place"
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also mention what happens if inplace=False is passed in the description

)

args = run_argparser(parser)

bulk_convert(args['input'], args['output'], args['format'].upper(), args['in_place'])


if __name__ == "__main__":
main()