[DETR] Update the processing to adapt masks & bboxes to reflect padding #28363

Merged
merged 7 commits on Feb 13, 2024
@@ -280,7 +280,7 @@ def center_crop(
**kwargs,
)

# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
def _pad_image(
self,
image: np.ndarray,
@@ -308,7 +308,7 @@ def _pad_image(
)
return padded_image

# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
def pad(
self,
images: List[np.ndarray],
@@ -785,9 +785,14 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
overridden by the `do_pad` parameter in the `preprocess` method.
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True` will pad the images in the batch to the largest height and width in the batch.
Padding will be applied to the bottom and right of the image with zeros.
"""

model_input_names = ["pixel_values", "pixel_mask"]
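The new `do_convert_annotations` flag, together with the expanded `do_pad` description above, controls whether boxes come back in relative `(center_x, center_y, width, height)` coordinates and whether images are padded to the largest height and width in the batch. A minimal sketch of constructing the processor with these flags; the particular flag combination is illustrative and not taken from this PR:

```python
# Minimal sketch, assuming a transformers install that includes this change.
from transformers import ConditionalDetrImageProcessor

# Keep boxes in absolute corner coordinates but still pad to the batch maximum.
processor = ConditionalDetrImageProcessor(
    do_convert_annotations=False,  # flag introduced in this PR
    do_pad=True,
)

# Backwards compatibility (see the __init__ change below): when the flag is
# omitted it falls back to do_normalize, preserving the old behaviour.
legacy = ConditionalDetrImageProcessor(do_normalize=False)
assert legacy.do_convert_annotations is False
```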
@@ -804,6 +809,7 @@ def __init__(
do_normalize: bool = True,
image_mean: Union[float, List[float]] = None,
image_std: Union[float, List[float]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: bool = True,
**kwargs,
) -> None:
@@ -822,6 +828,10 @@
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
size = get_size_dict(size, max_size=max_size, default_to_square=False)

# Backwards compatibility
if do_convert_annotations is None:
do_convert_annotations = do_normalize

super().__init__(**kwargs)
self.format = format
self.do_resize = do_resize
@@ -830,6 +840,7 @@
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.do_convert_annotations = do_convert_annotations
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad
@@ -1007,18 +1018,64 @@ def rescale(
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
"""
Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
`[center_x, center_y, width, height]` format.
`[center_x, center_y, width, height]` format and from absolute to relative pixel values.
"""
return normalize_annotation(annotation, image_size=image_size)
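The corrected docstring makes explicit that `normalize_annotation` both changes the box format and switches from absolute to relative coordinates. A small worked example of that conversion in plain NumPy (the image size and box values are made up, and the library helper itself is not called):

```python
# Corner format in absolute pixels -> relative (center_x, center_y, width, height).
import numpy as np

image_height, image_width = 480, 640
box = np.array([100.0, 200.0, 300.0, 400.0])  # [top_left_x, top_left_y, bottom_right_x, bottom_right_y]

center_x = (box[0] + box[2]) / 2 / image_width   # 0.3125
center_y = (box[1] + box[3]) / 2 / image_height  # 0.625
width = (box[2] - box[0]) / image_width          # 0.3125
height = (box[3] - box[1]) / image_height        # ~0.4167

print([center_x, center_y, width, height])
```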

# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
def _update_annotation_for_padded_image(
self,
annotation: Dict,
input_image_size: Tuple[int, int],
output_image_size: Tuple[int, int],
padding,
update_bboxes,
) -> Dict:
"""
Update the annotation for a padded image.
"""
new_annotation = {}
new_annotation["size"] = output_image_size

for key, value in annotation.items():
if key == "masks":
masks = value
masks = pad(
masks,
padding,
mode=PaddingMode.CONSTANT,
constant_values=0,
input_data_format=ChannelDimension.FIRST,
)
masks = safe_squeeze(masks, 1)
new_annotation["masks"] = masks
elif key == "boxes" and update_bboxes:
boxes = value
boxes *= np.asarray(
[
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
]
)
new_annotation["boxes"] = boxes
elif key == "size":
new_annotation["size"] = output_image_size
else:
new_annotation[key] = value
return new_annotation
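The `boxes` branch above rescales already-relative `(center_x, center_y, width, height)` boxes by `input_size / output_size`, so they keep pointing at the same pixels once the canvas grows through bottom/right padding. A small numeric sketch with arbitrarily chosen sizes:

```python
# Relative boxes defined on a 480x640 image stay aligned after the canvas is
# padded to 512x768 (zeros added on the bottom and right).
import numpy as np

input_h, input_w = 480, 640    # original image
output_h, output_w = 512, 768  # padded canvas

boxes = np.array([[0.5, 0.5, 0.25, 0.25]])  # relative to the 480x640 image
scale = np.array([input_w / output_w, input_h / output_h,
                  input_w / output_w, input_h / output_h])
boxes_padded = boxes * scale
# center_x becomes 0.5 * 640 / 768 ~= 0.4167, i.e. still pixel column 320 on the wider canvas
print(boxes_padded)
```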

# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
def _pad_image(
self,
image: np.ndarray,
output_size: Tuple[int, int],
annotation: Optional[Dict[str, Any]] = None,
constant_values: Union[float, Iterable[float]] = 0,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> np.ndarray:
"""
Pad an image with zeros to the given size.
@@ -1037,25 +1094,33 @@ def _pad_image(
data_format=data_format,
input_data_format=input_data_format,
)
return padded_image
if annotation is not None:
annotation = self._update_annotation_for_padded_image(
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
)
return padded_image, annotation

# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
def pad(
self,
images: List[np.ndarray],
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
constant_values: Union[float, Iterable[float]] = 0,
return_pixel_mask: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> BatchFeature:
"""
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
in the batch and optionally returns their corresponding pixel mask.

Args:
image (`np.ndarray`):
Image to pad.
images (List[`np.ndarray`]):
Images to pad.
annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
Annotations to transform according to the padding that is applied to the images.
constant_values (`float` or `Iterable[float]`, *optional*):
The value to use for the padding if `mode` is `"constant"`.
return_pixel_mask (`bool`, *optional*, defaults to `True`):
@@ -1071,19 +1136,29 @@ def pad(
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
update_bboxes (`bool`, *optional*, defaults to `True`):
Whether to update the bounding boxes in the annotations to match the padded images. If the
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
format, the bounding boxes will not be updated.
"""
pad_size = get_max_height_width(images, input_data_format=input_data_format)

padded_images = [
self._pad_image(
annotation_list = annotations if annotations is not None else [None] * len(images)
padded_images = []
padded_annotations = []
for image, annotation in zip(images, annotation_list):
padded_image, padded_annotation = self._pad_image(
image,
pad_size,
annotation,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
update_bboxes=update_bboxes,
)
for image in images
]
padded_images.append(padded_image)
padded_annotations.append(padded_annotation)

data = {"pixel_values": padded_images}

if return_pixel_mask:
@@ -1093,7 +1168,14 @@ def pad(
]
data["pixel_mask"] = masks

return BatchFeature(data=data, tensor_type=return_tensors)
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)

if annotations is not None:
encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
]

return encoded_inputs
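With this change `pad` also accepts per-image annotation dicts and returns them under `"labels"`, with masks padded and, when `update_bboxes=True`, boxes rescaled to the padded canvas. A hedged sketch of calling it directly; in normal use `preprocess` builds these dicts, and the key names below (`boxes`, `class_labels`, `size`) simply mirror the ones handled in `_update_annotation_for_padded_image`:

```python
# Hedged sketch: pad() called directly with already-prepared annotations.
import numpy as np
from transformers import ConditionalDetrImageProcessor

processor = ConditionalDetrImageProcessor()

images = [
    np.zeros((3, 480, 640), dtype=np.float32),
    np.zeros((3, 512, 512), dtype=np.float32),
]
annotations = [
    {"boxes": np.array([[0.5, 0.5, 0.2, 0.2]]), "class_labels": np.array([1]), "size": (480, 640)},
    {"boxes": np.array([[0.3, 0.3, 0.1, 0.1]]), "class_labels": np.array([7]), "size": (512, 512)},
]

encoded = processor.pad(
    images,
    annotations=annotations,
    return_tensors="np",
    update_bboxes=True,  # rescale boxes so they stay aligned with the padded canvas
)
print(encoded["pixel_values"].shape)  # (2, 3, 512, 640): batch max height and width
print(encoded["labels"][0]["boxes"])  # boxes rescaled for the 512x640 canvas
```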

# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess
def preprocess(
@@ -1108,6 +1190,7 @@ def preprocess(
do_rescale: Optional[bool] = None,
rescale_factor: Optional[Union[int, float]] = None,
do_normalize: Optional[bool] = None,
do_convert_annotations: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_pad: Optional[bool] = None,
@@ -1151,12 +1234,17 @@ def preprocess(
Rescale factor to use when rescaling the image.
do_normalize (`bool`, *optional*, defaults to self.do_normalize):
Whether to normalize the image.
do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
Whether to convert the annotations to the format expected by the model. Converts the bounding
boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
and in relative coordinates.
image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
Mean to use when normalizing the image.
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
Standard deviation to use when normalizing the image.
do_pad (`bool`, *optional*, defaults to self.do_pad):
Whether to pad the image.
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
Format of the annotations.
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
@@ -1197,6 +1285,9 @@ def preprocess(
do_normalize = self.do_normalize if do_normalize is None else do_normalize
image_mean = self.image_mean if image_mean is None else image_mean
image_std = self.image_std if image_std is None else image_std
do_convert_annotations = (
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
)
do_pad = self.do_pad if do_pad is None else do_pad
format = self.format if format is None else format

@@ -1300,29 +1391,34 @@ def preprocess(
images = [
self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
]
if annotations is not None:
annotations = [
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
for annotation, image in zip(annotations, images)
]

if do_convert_annotations and annotations is not None:
annotations = [
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
for annotation, image in zip(annotations, images)
]

if do_pad:
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
data = self.pad(
images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format
encoded_inputs = self.pad(
images,
annotations=annotations,
return_pixel_mask=True,
data_format=data_format,
input_data_format=input_data_format,
return_tensors=return_tensors,
update_bboxes=do_convert_annotations,
)
else:
images = [
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
for image in images
]
data = {"pixel_values": images}

encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
if annotations is not None:
encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
]
encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
if annotations is not None:
encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
]

return encoded_inputs
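Taken together, the `preprocess` changes convert annotations to relative `(center_x, center_y, width, height)` only when `do_convert_annotations` is enabled, and route annotations through `pad` so boxes and masks stay aligned with the padded `pixel_values`. A hedged end-to-end sketch with made-up COCO-style detection annotations (image contents, ids, and boxes are illustrative):

```python
# End-to-end sketch of the updated preprocess() path (requires torch for "pt").
import numpy as np
from transformers import ConditionalDetrImageProcessor

processor = ConditionalDetrImageProcessor()

images = [
    np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8),
    np.random.randint(0, 256, (512, 512, 3), dtype=np.uint8),
]
annotations = [
    {"image_id": 0, "annotations": [{"bbox": [10, 20, 50, 80], "category_id": 1, "area": 4000, "iscrowd": 0}]},
    {"image_id": 1, "annotations": [{"bbox": [30, 40, 60, 60], "category_id": 3, "area": 3600, "iscrowd": 0}]},
]

encoded = processor(images, annotations=annotations, return_tensors="pt")

# Boxes come back in relative (center_x, center_y, width, height) coordinates,
# rescaled so they stay aligned with the padded pixel_values.
print(encoded["pixel_values"].shape)
print(encoded["labels"][0]["boxes"])
```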
