
Update the processing so bbox coords are adjusted for padding
amyeroberts committed Feb 1, 2024
1 parent e19c12e commit 3c819b0
Showing 10 changed files with 820 additions and 117 deletions.
@@ -23,6 +23,7 @@
from ...image_utils import (
OPENAI_CLIP_MEAN,
OPENAI_CLIP_STD,
AnnotationType,
ChannelDimension,
ImageInput,
PILImageResampling,
@@ -42,6 +43,20 @@
logger = logging.get_logger(__name__)


# Copied from transformers.models.vilt.image_processing_vilt.safe_squeeze
def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray:
"""
Squeezes an array, but only if the axis specified has dim 1.
"""
if axis is None:
return arr.squeeze()

try:
return arr.squeeze(axis=axis)
except ValueError:
return arr
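
To make the guard concrete, here is a minimal sketch of `safe_squeeze` behaviour (the shapes are illustrative, not from the commit):

import numpy as np

masks = np.zeros((3, 1, 64, 64))
safe_squeeze(masks, axis=1).shape  # (3, 64, 64): axis 1 has dim 1, so it is squeezed
safe_squeeze(masks, axis=0).shape  # (3, 1, 64, 64): axis 0 has dim 3, so the array is returned unchanged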


# Copied from transformers.models.vilt.image_processing_vilt.max_across_indices
def max_across_indices(values: Iterable[Any]) -> List[Any]:
"""
@@ -280,11 +295,55 @@ def center_crop(
**kwargs,
)

# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
def _update_annotation_for_padded_image(
self, annotation: Dict, input_image_size: Tuple[int, int], output_image_size: Tuple[int, int], padding
) -> Dict:
"""
Update the annotation for a padded image.
"""
new_annotation = {}
new_annotation["size"] = output_image_size

input_height, input_width = input_image_size
output_height, output_width = output_image_size
for key, value in annotation.items():
if key == "boxes":
boxes = value
boxes *= np.asarray(
[
input_width / output_width,
input_height / output_height,
input_width / output_width,
input_height / output_height,
],
dtype=np.float32,
)
new_annotation["boxes"] = boxes
elif key == "masks":
masks = value
# FIXME - check the value to pad with here
masks = pad(
masks[:, None],
padding,
mode=PaddingMode.CONSTANT,
constant_values=0,
input_data_format=ChannelDimension.LAST,
)
masks = safe_squeeze(masks, 1)
new_annotation["masks"] = masks
elif key == "size":
new_annotation["size"] = output_image_size
else:
new_annotation[key] = value
return new_annotation
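
Why the multiplication works: padding only extends the canvas at the bottom and right, so a box in normalized `(center_x, center_y, width, height)` coordinates keeps its pixel position but covers a smaller fraction of the new, larger canvas; scaling each coordinate by the input/output size ratio re-normalizes it. A worked sketch with made-up sizes:

import numpy as np

input_height, input_width = 480, 640    # original image
output_height, output_width = 512, 704  # padded canvas
boxes = np.array([[0.5, 0.5, 0.2, 0.4]], dtype=np.float32)  # normalized (cx, cy, w, h)

scale = np.asarray(
    [input_width / output_width, input_height / output_height,
     input_width / output_width, input_height / output_height],
    dtype=np.float32,
)
print(boxes * scale)  # [[0.4545 0.4688 0.1818 0.375 ]]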

# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
def _pad_image(
self,
image: np.ndarray,
output_size: Tuple[int, int],
annotation: Optional[Dict[str, Any]] = None,
constant_values: Union[float, Iterable[float]] = 0,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -306,12 +365,17 @@ def _pad_image(
data_format=data_format,
input_data_format=input_data_format,
)
return padded_image
if annotation is not None:
annotation = self._update_annotation_for_padded_image(
annotation, (input_height, input_width), (output_height, output_width), padding
)
return padded_image, annotation

# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
def pad(
self,
images: List[np.ndarray],
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
constant_values: Union[float, Iterable[float]] = 0,
return_pixel_mask: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
@@ -323,8 +387,10 @@ def pad(
in the batch and optionally returns their corresponding pixel mask.
Args:
image (`np.ndarray`):
Image to pad.
images (List[`np.ndarray`]):
Images to pad.
annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
Annotations to transform according to the padding that is applied to the images.
constant_values (`float` or `Iterable[float]`, *optional*):
The value to use for the padding if `mode` is `"constant"`.
return_pixel_mask (`bool`, *optional*, defaults to `True`):
@@ -343,16 +409,21 @@
"""
pad_size = get_max_height_width(images, input_data_format=input_data_format)

padded_images = [
self._pad_image(
annotation_list = annotations if annotations is not None else [None] * len(images)
padded_images = []
padded_annotations = []
for image, annotation in zip(images, annotation_list):
padded_image, padded_annotation = self._pad_image(
image,
pad_size,
annotation,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
)
for image in images
]
padded_images.append(padded_image)
padded_annotations.append(padded_annotation)

data = {"pixel_values": padded_images}

if return_pixel_mask:
@@ -362,7 +433,14 @@
]
data["pixel_mask"] = masks

return BatchFeature(data=data, tensor_type=return_tensors)
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)

if annotations is not None:
encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
]

return encoded_inputs
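
A hedged sketch of how the updated `pad` can be called, using `DetrImageProcessor` (the source this implementation is copied from); the image shapes and annotation values are made up:

import numpy as np
from transformers import DetrImageProcessor

processor = DetrImageProcessor()

# Two images of different sizes; pad() batches them to the max height/width.
images = [np.zeros((3, 480, 640)), np.zeros((3, 512, 704))]
annotations = [
    {"size": (480, 640), "boxes": np.array([[0.5, 0.5, 0.2, 0.4]], dtype=np.float32)},
    {"size": (512, 704), "boxes": np.array([[0.3, 0.3, 0.1, 0.1]], dtype=np.float32)},
]

encoded = processor.pad(
    images,
    annotations=annotations,
    return_pixel_mask=True,
    input_data_format="channels_first",
)
# encoded["pixel_values"][0].shape == (3, 512, 704)
# encoded["labels"][0]["boxes"] holds the first image's boxes rescaled for the padding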

def preprocess(
self,
@@ -1011,11 +1011,55 @@ def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) ->
"""
return normalize_annotation(annotation, image_size=image_size)

# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
def _update_annotation_for_padded_image(
self, annotation: Dict, input_image_size: Tuple[int, int], output_image_size: Tuple[int, int], padding
) -> Dict:
"""
Update the annotation for a padded image.
"""
new_annotation = {}
new_annotation["size"] = output_image_size

input_height, input_width = input_image_size
output_height, output_width = output_image_size
for key, value in annotation.items():
if key == "boxes":
boxes = value
boxes *= np.asarray(
[
input_width / output_width,
input_height / output_height,
input_width / output_width,
input_height / output_height,
],
dtype=np.float32,
)
new_annotation["boxes"] = boxes
elif key == "masks":
masks = value
# FIXME - check the value to pad with here
masks = pad(
masks[:, None],
padding,
mode=PaddingMode.CONSTANT,
constant_values=0,
input_data_format=ChannelDimension.LAST,
)
masks = safe_squeeze(masks, 1)
new_annotation["masks"] = masks
elif key == "size":
new_annotation["size"] = output_image_size
else:
new_annotation[key] = value
return new_annotation

# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
def _pad_image(
self,
image: np.ndarray,
output_size: Tuple[int, int],
annotation: Optional[Dict[str, Any]] = None,
constant_values: Union[float, Iterable[float]] = 0,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -1037,12 +1081,17 @@ def _pad_image(
data_format=data_format,
input_data_format=input_data_format,
)
return padded_image
if annotation is not None:
annotation = self._update_annotation_for_padded_image(
annotation, (input_height, input_width), (output_height, output_width), padding
)
return padded_image, annotation

# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
def pad(
self,
images: List[np.ndarray],
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
constant_values: Union[float, Iterable[float]] = 0,
return_pixel_mask: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
@@ -1054,8 +1103,10 @@ def pad(
in the batch and optionally returns their corresponding pixel mask.
Args:
image (`np.ndarray`):
Image to pad.
images (List[`np.ndarray`]):
Images to pad.
annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
Annotations to transform according to the padding that is applied to the images.
constant_values (`float` or `Iterable[float]`, *optional*):
The value to use for the padding if `mode` is `"constant"`.
return_pixel_mask (`bool`, *optional*, defaults to `True`):
@@ -1074,16 +1125,21 @@
"""
pad_size = get_max_height_width(images, input_data_format=input_data_format)

padded_images = [
self._pad_image(
annotation_list = annotations if annotations is not None else [None] * len(images)
padded_images = []
padded_annotations = []
for image, annotation in zip(images, annotation_list):
padded_image, padded_annotation = self._pad_image(
image,
pad_size,
annotation,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
)
for image in images
]
padded_images.append(padded_image)
padded_annotations.append(padded_annotation)

data = {"pixel_values": padded_images}

if return_pixel_mask:
@@ -1093,7 +1149,14 @@
]
data["pixel_mask"] = masks

return BatchFeature(data=data, tensor_type=return_tensors)
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)

if annotations is not None:
encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
]

return encoded_inputs

# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess
def preprocess(
@@ -1308,21 +1371,23 @@ def preprocess(

if do_pad:
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
data = self.pad(
images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format
encoded_inputs = self.pad(
images,
annotations=annotations,
return_pixel_mask=True,
data_format=data_format,
input_data_format=input_data_format,
)
else:
images = [
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
for image in images
]
data = {"pixel_values": images}

encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
if annotations is not None:
encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
]
encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
if annotations is not None:
encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
]

return encoded_inputs
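
End to end, the change means a single `preprocess` call now returns padding-aware boxes. A sketch with the DETR processor (COCO detection format; all annotation values made up):

from PIL import Image
from transformers import DetrImageProcessor

processor = DetrImageProcessor()

image = Image.new("RGB", (640, 480))
target = {
    "image_id": 0,
    "annotations": [
        {"bbox": [100.0, 120.0, 200.0, 150.0], "category_id": 1, "area": 30000.0, "iscrowd": 0}
    ],
}

inputs = processor(images=image, annotations=target, return_tensors="pt")
# inputs["pixel_values"], inputs["pixel_mask"], and inputs["labels"][0]["boxes"]:
# boxes come back normalized (cx, cy, w, h) and rescaled for any padding applied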
