Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix duplicate categorical features added from image exif metadata in RAI Vision Dashboard #2483

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ def extract_features(image_dataset: pd.DataFrame,
results = []
dropped_features = feature_metadata.dropped_features \
if feature_metadata else None
if feature_metadata and feature_metadata.categorical_features is None:
if not feature_metadata:
feature_metadata = FeatureMetadata()
if feature_metadata.categorical_features is None:
feature_metadata.categorical_features = []
exif_feature_names = get_all_exif_feature_names(image_dataset)
feature_names = [MEAN_PIXEL_VALUE] + exif_feature_names
Expand Down Expand Up @@ -73,40 +75,8 @@ def extract_features(image_dataset: pd.DataFrame,
row_feature_values = [mean_pixel_value] + \
[None] * len(exif_feature_names)

# append all exif features
if isinstance(image, str):
image_pointer_path = get_image_pointer_from_path(image)
with Image.open(image_pointer_path) as im:
exifdata = im.getexif()
for tag_id in exifdata:
# get the tag name, instead of human unreadable tag id
tag = TAGS.get(tag_id, tag_id)
data = exifdata.get(tag_id)
# decode bytes
if isinstance(data, bytes):
data = data.decode()
if len(data) > MAX_CUSTOM_LEN:
data = data[:MAX_CUSTOM_LEN] + '...'
if isinstance(data, str):
if not feature_metadata:
feature_metadata = FeatureMetadata()
feature_metadata.categorical_features = []
if tag in feature_names:
feature_metadata.categorical_features.append(
str(tag))
tag_index = feature_names.index(tag)
row_feature_values[tag_index] = data
else:
# in theory this should now never happen with
# latest code, but adding this check for safety
if tag not in blacklisted_tags:
blacklisted_tags.add(tag)
warnings.warn(
f'Exif tag {tag} could not be found '
'in the feature names. Ignoring tag '
'from extracted metadata.')
elif isinstance(data, int) or isinstance(data, float):
row_feature_values[feature_names.index(tag)] = data
append_exif_features(image, row_feature_values, feature_names,
blacklisted_tags, feature_metadata)

# append all features other than target column and label
for j in range(start_meta_index, image_dataset.shape[1]):
Expand All @@ -115,3 +85,37 @@ def extract_features(image_dataset: pd.DataFrame,
row_feature_values.append(image_dataset.iloc[i, j])
results.append(row_feature_values)
return results, feature_names


def append_exif_features(image, row_feature_values, feature_names,
                         blacklisted_tags, feature_metadata):
    """Write the EXIF values of one image into its feature row, in place.

    Nothing happens unless ``image`` is a ``str``. Otherwise the image is
    opened and every EXIF tag holding a string, bytes, int or float value
    is stored in ``row_feature_values`` at the position of the tag's name
    in ``feature_names``. Values of any other type are ignored.

    :param image: The image to read. Only ``str`` values are processed;
        the string is resolved through ``get_image_pointer_from_path``.
    :param row_feature_values: Feature values of the current row,
        aligned by index with ``feature_names``. Mutated in place.
    :param feature_names: Names of the extracted features.
    :param blacklisted_tags: Set of tag names that were already reported
        as missing from ``feature_names``. Mutated in place so each
        missing tag triggers a single warning.
    :param feature_metadata: Object whose ``categorical_features`` list
        collects the names of string valued tags, each at most once.
        Mutated in place.
    :return: None.
    """
    if not isinstance(image, str):
        return
    image_pointer_path = get_image_pointer_from_path(image)
    with Image.open(image_pointer_path) as im:
        exifdata = im.getexif()
        for tag_id in exifdata:
            # get the tag name, instead of human unreadable tag id
            tag = str(TAGS.get(tag_id, tag_id))
            data = exifdata.get(tag_id)
            # decode bytes; only decoded values get truncated
            if isinstance(data, bytes):
                data = data.decode()
                if len(data) > MAX_CUSTOM_LEN:
                    data = data[:MAX_CUSTOM_LEN] + '...'
            is_categorical = isinstance(data, str)
            if not is_categorical and not isinstance(data, (int, float)):
                continue
            if tag not in feature_names:
                # in theory this should now never happen with
                # latest code, but adding this check for safety.
                # Numeric values take this path as well, so a missing
                # tag is skipped instead of failing in list.index
                if tag not in blacklisted_tags:
                    blacklisted_tags.add(tag)
                    warnings.warn(
                        f'Exif tag {tag} could not be found '
                        'in the feature names. Ignoring tag '
                        'from extracted metadata.')
                continue
            # register string valued tags once to avoid duplicates
            if is_categorical and \
                    tag not in feature_metadata.categorical_features:
                feature_metadata.categorical_features.append(tag)
            row_feature_values[feature_names.index(tag)] = data
27 changes: 20 additions & 7 deletions responsibleai_vision/tests/test_feature_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
load_fridge_object_detection_dataset,
load_imagenet_dataset)

from responsibleai.feature_metadata import FeatureMetadata
from responsibleai_vision.common.constants import (ExtractedFeatures,
ImageColumns, ImageModes)
from responsibleai_vision.utils.feature_extractors import extract_features
Expand All @@ -18,17 +19,23 @@


def validate_extracted_features(extracted_features, feature_names,
                                expected_feature_names, data,
                                feature_metadata=None):
    """Assert that extracted features match the expected feature names.

    :param extracted_features: One list of feature values per row.
    :param feature_names: Feature names returned by the extraction.
    :param expected_feature_names: Names the extraction should return.
        The first entry has to match by position, the remaining ones
        only by membership and total count.
    :param data: The dataset the features were extracted from; only its
        length is used.
    :param feature_metadata: Optional object with a
        ``categorical_features`` list. When given, every categorical
        feature has to be a known feature name and be listed only once.
    :raises AssertionError: If any of the checks fails.
    """
    assert len(extracted_features) == len(data)
    assert feature_names[0] == expected_feature_names[0]
    for feature_name in feature_names[1:]:
        assert feature_name in expected_feature_names
    assert len(feature_names) == len(expected_feature_names)
    assert len(extracted_features[0]) == len(feature_names)
    if feature_metadata is not None:
        categorical_features = feature_metadata.categorical_features
        assert len(categorical_features) <= len(feature_names)
        # the length bound alone lets a short duplicated list through
        assert len(set(categorical_features)) == len(categorical_features)
        for categorical_feature in categorical_features:
            assert categorical_feature in feature_names


def extract_dataset_features(data, feature_metadata=None):
    """Call extract_features on data with the label column and RGB mode.

    :param data: The dataset handed to ``extract_features``.
    :param feature_metadata: Optional feature metadata forwarded
        unchanged to ``extract_features``.
    :return: Whatever ``extract_features`` returns.
    """
    target_column = ImageColumns.LABEL
    image_mode = ImageModes.RGB
    return extract_features(data, target_column, image_mode,
                            feature_metadata=feature_metadata)


class TestFeatureExtractors(object):
Expand All @@ -55,15 +62,21 @@ def test_extract_features_imagenet_metadata(self):

def test_extract_features_flowers_metadata(self):
data = load_flowers_dataset(upscale=False)
extracted_features, feature_names = extract_dataset_features(data)
feature_metadata = FeatureMetadata()
extracted_features, feature_names = extract_dataset_features(
data, feature_metadata=feature_metadata)
expected_feature_names = [MEAN_PIXEL_VALUE]
validate_extracted_features(extracted_features, feature_names,
expected_feature_names, data)
expected_feature_names, data,
feature_metadata)

def test_extract_features_mixed_exif_XPComment_metadata(self):
data = load_fridge_dataset(add_extra_mixed_metadata=True)
extracted_features, feature_names = extract_dataset_features(data)
feature_metadata = FeatureMetadata()
extracted_features, feature_names = extract_dataset_features(
data, feature_metadata=feature_metadata)
expected_feature_names = [MEAN_PIXEL_VALUE, 'XPComment']
expected_feature_names += FRIDGE_METADATA_FEATURES
validate_extracted_features(extracted_features, feature_names,
expected_feature_names, data)
expected_feature_names, data,
feature_metadata)
Loading