From c79ecb91e2ebc4d25e5947352261b88c97a12099 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Mon, 19 Apr 2021 17:09:53 +0300 Subject: [PATCH 1/4] add classification --- datumaro/plugins/lfw_format.py | 242 +++++++++++++++++------ tests/assets/lfw_dataset/test/people.txt | 3 + tests/test_lfw_format.py | 176 ++++++++--------- 3 files changed, 268 insertions(+), 153 deletions(-) create mode 100644 tests/assets/lfw_dataset/test/people.txt diff --git a/datumaro/plugins/lfw_format.py b/datumaro/plugins/lfw_format.py index 5799ad87e5..8033450120 100644 --- a/datumaro/plugins/lfw_format.py +++ b/datumaro/plugins/lfw_format.py @@ -8,7 +8,7 @@ from datumaro.components.converter import Converter from datumaro.components.extractor import (AnnotationType, DatasetItem, - Importer, Points, SourceExtractor) + Importer, Label, LabelCategories, Points, SourceExtractor) from datumaro.util.image import find_images @@ -16,8 +16,9 @@ class LfwPath: IMAGES_DIR = 'images' LANDMARKS_FILE = 'landmarks.txt' PAIRS_FILE = 'pairs.txt' + PEOPLE_FILE = 'people.txt' IMAGE_EXT = '.jpg' - PATTERN = re.compile(r'([\w]+)_([-\d]+)') + PATTERN = re.compile(r'([\w-]+)_([-\d]+)') class LfwExtractor(SourceExtractor): def __init__(self, path, subset=None): @@ -29,14 +30,29 @@ def __init__(self, path, subset=None): super().__init__(subset=subset) self._dataset_dir = osp.dirname(osp.dirname(path)) + + people_file = osp.join(osp.dirname(path), LfwPath.PEOPLE_FILE) + self._categories = self._load_categories(people_file) + self._items = list(self._load_items(path).values()) + def _load_categories(self, path): + label_cat = LabelCategories() + if osp.isfile(path): + with open(path, encoding='utf-8') as labels_file: + for line in labels_file: + objects = line.strip().split('\t') + if len(objects) == 2: + label_cat.add(objects[0]) + return { AnnotationType.label: label_cat } + def _load_items(self, path): items = {} + label_categories = self._categories.get(AnnotationType.label) images_dir = osp.join(self._dataset_dir, self._subset, LfwPath.IMAGES_DIR) if osp.isdir(images_dir): - images = { osp.splitext(osp.relpath(p, images_dir))[0]: p + images = { osp.splitext(osp.relpath(p, images_dir))[0].replace('\\', '/'): p for p in find_images(images_dir, recursive=True) } else: images = {} @@ -44,44 +60,71 @@ def _load_items(self, path): with open(path, encoding='utf-8') as f: for line in f: pair = line.strip().split('\t') - if len(pair) == 3: - if pair[0] == '-': - image1 = pair[1] - image2 = pair[2] - else: - image1 = self.get_image_name(pair[0], pair[1]) - image2 = self.get_image_name(pair[0], pair[2]) - if image1 not in items: - items[image1] = DatasetItem(id=image1, subset=self._subset, - image=images.get(image1), - attributes={'positive_pairs': [], 'negative_pairs': []}) - if image2 not in items: - items[image2] = DatasetItem(id=image2, subset=self._subset, - image=images.get(image2), - attributes={'positive_pairs': [], 'negative_pairs': []}) + if len(pair) == 1 and pair[0] != '': + annotations = [] + image = pair[0] + item_id = pair[0] + objects = item_id.split('/') + if 1 < len(objects): + label_name = objects[0] + label = label_categories.find(label_name)[0] + if label != None: + annotations.append(Label(label)) + item_id = item_id[len(label_name) + 1:] + if item_id not in items: + items[item_id] = DatasetItem(id=item_id, subset=self._subset, + image=images.get(image), annotations=annotations) + elif len(pair) == 3: + image1, id1 = self.get_image_name(pair[0], pair[1]) + image2, id2 = self.get_image_name(pair[0], pair[2]) + label = label_categories.find(pair[0])[0] + if label == None: + raise Exception("Line %s: people file doesn't " + "contain person %s " % (line, pair[0])) + if id1 not in items: + annotations = [] + annotations.append(Label(label)) + items[id1] = DatasetItem(id=id1, subset=self._subset, + image=images.get(image1), annotations=annotations) + if id2 not in items: + annotations = [] + annotations.append(Label(label)) + items[id2] = DatasetItem(id=id2, subset=self._subset, + image=images.get(image2), annotations=annotations) # pairs form a directed graph - items[image1].attributes['positive_pairs'].append(image2) + if not items[id1].annotations[0].attributes.get('positive_pairs'): + items[id1].annotations[0].attributes['positive_pairs'] = [] + items[id1].annotations[0].attributes['positive_pairs'].append(image2) + elif len(pair) == 4: - if pair[0] == '-': - image1 = pair[1] - else: - image1 = self.get_image_name(pair[0], pair[1]) + image1, id1 = self.get_image_name(pair[0], pair[1]) if pair[2] == '-': image2 = pair[3] + id2 = pair[3] else: - image2 = self.get_image_name(pair[2], pair[3]) - if image1 not in items: - items[image1] = DatasetItem(id=image1, subset=self._subset, - image=images.get(image1), - attributes={'positive_pairs': [], 'negative_pairs': []}) - if image2 not in items: - items[image2] = DatasetItem(id=image2, subset=self._subset, - image=images.get(image2), - attributes={'positive_pairs': [], 'negative_pairs': []}) + image2, id2 = self.get_image_name(pair[2], pair[3]) + if id1 not in items: + annotations = [] + label = label_categories.find(pair[0])[0] + if label == None: + raise Exception("Line %s: people file doesn't " + "contain person %s " % (line, pair[0])) + annotations.append(Label(label)) + items[id1] = DatasetItem(id=id1, subset=self._subset, + image=images.get(image1), annotations=annotations) + if id2 not in items: + annotations = [] + label = label_categories.find(pair[2])[0] + if label != None: + annotations.append(Label(label)) + items[id2] = DatasetItem(id=id2, subset=self._subset, + image=images.get(image2), annotations=annotations) # pairs form a directed graph - items[image1].attributes['negative_pairs'].append(image2) + if not items[id1].annotations[0].attributes.get('negative_pairs'): + items[id1].annotations[0].attributes['negative_pairs'] = [] + items[id1].annotations[0].attributes['negative_pairs'].append(image2) landmarks_file = osp.join(self._dataset_dir, self._subset, LfwPath.LANDMARKS_FILE) @@ -91,10 +134,15 @@ def _load_items(self, path): line = line.split('\t') item_id = osp.splitext(line[0])[0] + objects = item_id.split('/') + if 1 < len(objects): + label_name = objects[0] + label = label_categories.find(label_name)[0] + if label != None: + item_id = item_id[len(label_name) + 1:] if item_id not in items: items[item_id] = DatasetItem(id=item_id, subset=self._subset, - image=osp.join(images_dir, line[0]), - attributes={'positive_pairs': [], 'negative_pairs': []}) + image=osp.join(images_dir, line[0])) annotations = items[item_id].annotations annotations.append(Points([float(p) for p in line[1:]])) @@ -103,7 +151,15 @@ def _load_items(self, path): @staticmethod def get_image_name(person, image_id): - return '{}/{}_{:04d}'.format(person, person, int(image_id)) + image, item_id = '', '' + try: + image_id = int(image_id) + image = '{}/{}_{:04d}'.format(person, person, image_id) + item_id = '{}_{:04d}'.format(person, image_id) + except ValueError: + image = '{}/{}'.format(person, image_id) + item_id = image_id + return image, item_id class LfwImporter(Importer): @classmethod @@ -115,42 +171,90 @@ class LfwConverter(Converter): def apply(self): for subset_name, subset in self._extractor.subsets().items(): + label_categories = self._extractor.categories()[AnnotationType.label] + labels = {} + for label in label_categories: + f = label.name + labels[label.name] = 0 + positive_pairs = [] negative_pairs = [] + neutral_items = [] landmarks = [] + included_items = [] for item in subset: + anns = [ann for ann in item.annotations + if ann.type == AnnotationType.label] + label, label_name = None, None + if anns: + label = anns[0] + label_name = label_categories[anns[0].label].name + labels[label_name] += 1 + if self._save_images and item.has_image: - self._save_image(item, - subdir=osp.join(subset_name, LfwPath.IMAGES_DIR)) - - search = LfwPath.PATTERN.search(item.id) - if search: - person1, num1 = search.groups() - num1 = int(num1) - else: - person1 = '-' + subdir=osp.join(subset_name, LfwPath.IMAGES_DIR) + if label_name: + subdir=osp.join(subdir, label_name) + self._save_image(item, subdir=subdir) + + if label != None: + person1 = label_name num1 = item.id - if 'positive_pairs' in item.attributes: - for pair in item.attributes['positive_pairs']: - search = LfwPath.PATTERN.search(pair) - if search: - num2 = search.groups()[1] - num2 = int(num2) - else: - num2 = pair - positive_pairs.append('%s\t%s\t%s' % (person1, num1, num2)) - if 'negative_pairs' in item.attributes: - for pair in item.attributes['negative_pairs']: - search = LfwPath.PATTERN.search(pair) - if search: - person2, num2 = search.groups() - num2 = int(num2) - else: - person2 = '-' - num2 = pair - negative_pairs.append('%s\t%s\t%s\t%s' % \ - (person1, num1, person2, num2)) + if num1.startswith(person1): + num1 = int(num1.replace(person1, '')[1:]) + curr_item = person1 + '/' + str(num1) + + if 'positive_pairs' in label.attributes: + if curr_item not in included_items: + included_items.append(curr_item) + for pair in label.attributes['positive_pairs']: + search = LfwPath.PATTERN.search(pair) + if search: + num2 = search.groups()[1] + num2 = int(num2) + else: + num2 = pair + if num2.startswith(person1): + num2 = num2.replace(person1, '')[1:] + curr_item = person1 + '/' + str(num2) + if curr_item not in included_items: + included_items.append(curr_item) + positive_pairs.append('%s\t%s\t%s' % (person1, num1, num2)) + + if 'negative_pairs' in label.attributes: + if curr_item not in included_items: + included_items.append(curr_item) + for pair in label.attributes['negative_pairs']: + search = LfwPath.PATTERN.search(pair) + curr_item = '' + if search: + person2, num2 = search.groups() + num2 = int(num2) + curr_item += person2 + '/' + else: + person2 = '-' + num2 = pair + objects = pair.split('/') + if 1 < len(objects) and objects[0] in labels: + person2 = objects[0] + num2 = pair.replace(person2, '')[1:] + curr_item += person2 + '/' + curr_item += str(num2) + if curr_item not in included_items: + included_items.append(curr_item) + negative_pairs.append('%s\t%s\t%s\t%s' % \ + (person1, num1, person2, num2)) + + if 'positive_pairs' not in label.attributes and \ + 'negative_pairs' not in label.attributes and \ + curr_item not in included_items: + neutral_items.append('%s/%s' % (person1, item.id)) + included_items.append(curr_item) + + elif item.id not in included_items: + neutral_items.append(item.id) + included_items.append(item.id) item_landmarks = [p for p in item.annotations if p.type == AnnotationType.points] @@ -163,9 +267,17 @@ def apply(self): with open(pairs_file, 'w', encoding='utf-8') as f: f.writelines(['%s\n' % pair for pair in positive_pairs]) f.writelines(['%s\n' % pair for pair in negative_pairs]) + f.writelines(['%s\n' % item for item in neutral_items]) if landmarks: landmarks_file = osp.join(self._save_dir, subset_name, LfwPath.LANDMARKS_FILE) with open(landmarks_file, 'w', encoding='utf-8') as f: f.writelines(['%s\n' % landmark for landmark in landmarks]) + + if labels: + people_file = osp.join(self._save_dir, subset_name, + LfwPath.PEOPLE_FILE) + with open(people_file, 'w', encoding='utf-8') as f: + f.writelines(['%s\t%d\n' % (label, labels[label]) + for label in labels]) diff --git a/tests/assets/lfw_dataset/test/people.txt b/tests/assets/lfw_dataset/test/people.txt new file mode 100644 index 0000000000..015b83c6af --- /dev/null +++ b/tests/assets/lfw_dataset/test/people.txt @@ -0,0 +1,3 @@ +2 +name0 2 +name1 2 \ No newline at end of file diff --git a/tests/test_lfw_format.py b/tests/test_lfw_format.py index 3aa64365d1..2adaae52a4 100644 --- a/tests/test_lfw_format.py +++ b/tests/test_lfw_format.py @@ -3,7 +3,7 @@ import numpy as np from datumaro.components.dataset import Dataset -from datumaro.components.extractor import DatasetItem, Points +from datumaro.components.extractor import DatasetItem, Label, Points from datumaro.plugins.lfw_format import LfwConverter, LfwImporter from datumaro.util.image import Image from datumaro.util.test_utils import TestDir, compare_datasets @@ -12,35 +12,33 @@ class LfwFormatTest(TestCase): def test_can_save_and_load(self): source_dataset = Dataset.from_iterable([ - DatasetItem(id='name0/name0_0001', - subset='test', image=np.ones((2, 5, 3)), - attributes={ - 'positive_pairs': ['name0/name0_0002'], - 'negative_pairs': [] - } + DatasetItem(id='name0_0001', subset='test', + image=np.ones((2, 5, 3)), + annotations=[Label(0, attributes={ + 'positive_pairs': ['name0/name0_0002'] + })] ), - DatasetItem(id='name0/name0_0002', - subset='test', image=np.ones((2, 5, 3)), - attributes={ + DatasetItem(id='name0_0002', subset='test', + image=np.ones((2, 5, 3)), + annotations=[Label(0, attributes={ 'positive_pairs': ['name0/name0_0001'], 'negative_pairs': ['name1/name1_0001'] - } + })] ), - DatasetItem(id='name1/name1_0001', - subset='test', image=np.ones((2, 5, 3)), - attributes={ - 'positive_pairs': ['name1/name1_0002'], - 'negative_pairs': [] - } + DatasetItem(id='name1_0001', subset='test', + image=np.ones((2, 5, 3)), + annotations=[Label(1, attributes={ + 'positive_pairs': ['name1/name1_0002'] + })] ), - DatasetItem(id='name1/name1_0002', - subset='test', image=np.ones((2, 5, 3)), - attributes={ + DatasetItem(id='name1_0002', subset='test', + image=np.ones((2, 5, 3)), + annotations=[Label(1, attributes={ 'positive_pairs': ['name1/name1_0002'], 'negative_pairs': ['name0/name0_0001'] - } + })] ), - ]) + ], categories=['name0', 'name1']) with TestDir() as test_dir: LfwConverter.convert(source_dataset, test_dir, save_images=True) @@ -50,27 +48,23 @@ def test_can_save_and_load(self): def test_can_save_and_load_with_landmarks(self): source_dataset = Dataset.from_iterable([ - DatasetItem(id='name0/name0_0001', + DatasetItem(id='name0_0001', subset='test', image=np.ones((2, 5, 3)), - attributes={ - 'positive_pairs': ['name0/name0_0002'], - 'negative_pairs': [] - }, annotations=[ + Label(0, attributes={ + 'positive_pairs': ['name0/name0_0002'] + }), Points([0, 4, 3, 3, 2, 2, 1, 0, 3, 0]), ] ), - DatasetItem(id='name0/name0_0002', + DatasetItem(id='name0_0002', subset='test', image=np.ones((2, 5, 3)), - attributes={ - 'positive_pairs': [], - 'negative_pairs': [] - }, annotations=[ + Label(0), Points([0, 5, 3, 5, 2, 2, 1, 0, 3, 0]), ] ), - ]) + ], categories=['name0']) with TestDir() as test_dir: LfwConverter.convert(source_dataset, test_dir, save_images=True) @@ -80,21 +74,45 @@ def test_can_save_and_load_with_landmarks(self): def test_can_save_and_load_with_no_subsets(self): source_dataset = Dataset.from_iterable([ - DatasetItem(id='name0/name0_0001', + DatasetItem(id='name0_0001', image=np.ones((2, 5, 3)), - attributes={ - 'positive_pairs': ['name0/name0_0002'], - 'negative_pairs': [] - }, + annotations=[Label(0, attributes={ + 'positive_pairs': ['name0/name0_0002'] + })], ), - DatasetItem(id='name0/name0_0002', + DatasetItem(id='name0_0002', image=np.ones((2, 5, 3)), - attributes={ - 'positive_pairs': [], - 'negative_pairs': [] - }, + annotations=[Label(0)] ), - ]) + ], categories=['name0']) + + with TestDir() as test_dir: + LfwConverter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'lfw') + + compare_datasets(self, source_dataset, parsed_dataset) + + def test_can_save_and_load_with_no_format_names(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='a/1', + image=np.ones((2, 5, 3)), + annotations=[Label(0, attributes={ + 'positive_pairs': ['name0/b/2'], + 'negative_pairs': ['d/4'] + })], + ), + DatasetItem(id='b/2', + image=np.ones((2, 5, 3)), + annotations=[Label(0)] + ), + DatasetItem(id='c/3', + image=np.ones((2, 5, 3)), + annotations=[Label(1)] + ), + DatasetItem(id='d/4', + image=np.ones((2, 5, 3)), + ), + ], categories=['name0', 'name1']) with TestDir() as test_dir: LfwConverter.convert(source_dataset, test_dir, save_images=True) @@ -105,20 +123,15 @@ def test_can_save_and_load_with_no_subsets(self): def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): dataset = Dataset.from_iterable([ DatasetItem(id='кириллица с пробелом', - image=np.ones((2, 5, 3)), - attributes = { - 'positive_pairs': [], - 'negative_pairs': [] - }, + image=np.ones((2, 5, 3)) ), - DatasetItem(id='name0/name0_0002', + DatasetItem(id='name0_0002', image=np.ones((2, 5, 3)), - attributes = { - 'positive_pairs': [], + annotations=[Label(0, attributes={ 'negative_pairs': ['кириллица с пробелом'] - }, + })] ), - ]) + ], categories=['name0']) with TestDir() as test_dir: LfwConverter.convert(dataset, test_dir, save_images=True) @@ -128,21 +141,13 @@ def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): def test_can_save_and_load_image_with_arbitrary_extension(self): dataset = Dataset.from_iterable([ - DatasetItem(id='name0/name0_0001', image=Image( - path='name0/name0_0001.JPEG', data=np.zeros((4, 3, 3))), - attributes={ - 'positive_pairs': [], - 'negative_pairs': [] - }, - ), - DatasetItem(id='name0/name0_0002', image=Image( - path='name0/name0_0002.bmp', data=np.zeros((3, 4, 3))), - attributes={ - 'positive_pairs': ['name0/name0_0001'], - 'negative_pairs': [] - }, + DatasetItem(id='a/1', image=Image( + path='a/1.JPEG', data=np.zeros((4, 3, 3))), ), - ]) + DatasetItem(id='b/c/d/2', image=Image( + path='b/c/d/2.bmp', data=np.zeros((3, 4, 3))), + ), + ], categories=[]) with TestDir() as test_dir: LfwConverter.convert(dataset, test_dir, save_images=True) @@ -158,38 +163,33 @@ def test_can_detect(self): def test_can_import(self): expected_dataset = Dataset.from_iterable([ - DatasetItem(id='name0/name0_0001', - subset='test', image=np.ones((2, 5, 3)), - attributes={ - 'positive_pairs': [], - 'negative_pairs': ['name1/name1_0001', - 'name1/name1_0002'] - }, + DatasetItem(id='name0_0001', subset='test', + image=np.ones((2, 5, 3)), annotations=[ + Label(0, attributes={ + 'negative_pairs': ['name1/name1_0001', + 'name1/name1_0002'] + }), Points([0, 4, 3, 3, 2, 2, 1, 0, 3, 0]), ] ), - DatasetItem(id='name1/name1_0001', - subset='test', image=np.ones((2, 5, 3)), - attributes={ - 'positive_pairs': ['name1/name1_0002'], - 'negative_pairs': [] - }, + DatasetItem(id='name1_0001', subset='test', + image=np.ones((2, 5, 3)), annotations=[ + Label(1, attributes={ + 'positive_pairs': ['name1/name1_0002'], + }), Points([1, 6, 4, 6, 3, 3, 2, 1, 4, 1]), ] ), - DatasetItem(id='name1/name1_0002', - subset='test', image=np.ones((2, 5, 3)), - attributes={ - 'positive_pairs': [], - 'negative_pairs': [] - }, + DatasetItem(id='name1_0002', subset='test', + image=np.ones((2, 5, 3)), annotations=[ + Label(1), Points([0, 5, 3, 5, 2, 2, 1, 0, 3, 0]), ] ), - ]) + ], categories=['name0', 'name1']) dataset = Dataset.import_from(DUMMY_DATASET_DIR, 'lfw') From 14c51739637c46aed80d024966136b2c46fbffb2 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Mon, 19 Apr 2021 17:38:31 +0300 Subject: [PATCH 2/4] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f0e966c5fc..42e250314e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - LabelMe format saves dataset items with their relative paths by subsets without changing names () - Allowed arbitrary subset count and names in classification and detection splitters () - Annotation-less dataset elements are now participate in subset splitting () +- Classification task in LFW dataset format () ### Deprecated - From 06f2fdce317dfae6084e5e8edcb9619387be0c26 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Mon, 19 Apr 2021 18:27:59 +0300 Subject: [PATCH 3/4] remove whitespace --- datumaro/plugins/lfw_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datumaro/plugins/lfw_format.py b/datumaro/plugins/lfw_format.py index 8033450120..c4806647cb 100644 --- a/datumaro/plugins/lfw_format.py +++ b/datumaro/plugins/lfw_format.py @@ -197,7 +197,7 @@ def apply(self): if label_name: subdir=osp.join(subdir, label_name) self._save_image(item, subdir=subdir) - + if label != None: person1 = label_name num1 = item.id From 3a7d18b18cfb3ae52b9f6b54603f3afb99308327 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Mon, 19 Apr 2021 18:31:27 +0300 Subject: [PATCH 4/4] update documentation --- README.md | 2 +- docs/user_manual.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 99ebac3664..3fba9e10fb 100644 --- a/README.md +++ b/README.md @@ -138,7 +138,7 @@ CVAT annotations ---> Publication, statistics etc. - [LabelMe](http://labelme.csail.mit.edu/Release3.0) - [ICDAR13/15](https://rrc.cvc.uab.es/?ch=2) (`word_recognition`, `text_localization`, `text_segmentation`) - [Market-1501](https://www.aitribune.com/dataset/2018051063) (`person re-identification`) - - [LFW](http://vis-www.cs.umass.edu/lfw/) (`person re-identification`, `landmarks`) + - [LFW](http://vis-www.cs.umass.edu/lfw/) (`classification`, `person re-identification`, `landmarks`) - Dataset building - Merging multiple datasets into one - Dataset filtering by a custom criteria: diff --git a/docs/user_manual.md b/docs/user_manual.md index 06585d36fd..40f2f061b0 100644 --- a/docs/user_manual.md +++ b/docs/user_manual.md @@ -128,7 +128,7 @@ List of supported formats: - Market-1501 (`person re-identification`) - [Format specification](https://www.aitribune.com/dataset/2018051063) - [Dataset example](../tests/assets/market1501_dataset) -- LFW (`person re-identification`, `landmarks`) +- LFW (`classification`, `person re-identification`, `landmarks`) - [Format specification](http://vis-www.cs.umass.edu/lfw/) - [Dataset example](../tests/assets/lfw_dataset)