Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Classification task in LFW dataset format #222

Merged
merged 4 commits into from
Apr 20, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- LabelMe format saves dataset items with their relative paths by subsets without changing names (<https://github.com/openvinotoolkit/datumaro/pull/200>)
- Allowed arbitrary subset count and names in classification and detection splitters (<https://github.com/openvinotoolkit/datumaro/pull/207>)
- Annotation-less dataset elements now participate in subset splitting (<https://github.com/openvinotoolkit/datumaro/pull/211>)
- Classification task in LFW dataset format (<https://github.com/openvinotoolkit/datumaro/pull/222>)

### Deprecated
-
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ CVAT annotations ---> Publication, statistics etc.
- [LabelMe](http://labelme.csail.mit.edu/Release3.0)
- [ICDAR13/15](https://rrc.cvc.uab.es/?ch=2) (`word_recognition`, `text_localization`, `text_segmentation`)
- [Market-1501](https://www.aitribune.com/dataset/2018051063) (`person re-identification`)
- [LFW](http://vis-www.cs.umass.edu/lfw/) (`person re-identification`, `landmarks`)
- [LFW](http://vis-www.cs.umass.edu/lfw/) (`classification`, `person re-identification`, `landmarks`)
- Dataset building
- Merging multiple datasets into one
- Dataset filtering by a custom criteria:
Expand Down
242 changes: 177 additions & 65 deletions datumaro/plugins/lfw_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,17 @@

from datumaro.components.converter import Converter
from datumaro.components.extractor import (AnnotationType, DatasetItem,
Importer, Points, SourceExtractor)
Importer, Label, LabelCategories, Points, SourceExtractor)
from datumaro.util.image import find_images


class LfwPath:
    """Constants describing the on-disk layout of an LFW-format dataset."""
    IMAGES_DIR = 'images'
    LANDMARKS_FILE = 'landmarks.txt'
    PAIRS_FILE = 'pairs.txt'
    PEOPLE_FILE = 'people.txt'
    IMAGE_EXT = '.jpg'
    # Matches '<person>_<numeric id>'; the '[\w-]+' class (unlike a plain
    # '[\w]+') also accepts hyphenated person names such as 'Jean-Luc'.
    PATTERN = re.compile(r'([\w-]+)_([-\d]+)')

class LfwExtractor(SourceExtractor):
def __init__(self, path, subset=None):
Expand All @@ -29,59 +30,101 @@ def __init__(self, path, subset=None):
super().__init__(subset=subset)

self._dataset_dir = osp.dirname(osp.dirname(path))

people_file = osp.join(osp.dirname(path), LfwPath.PEOPLE_FILE)
self._categories = self._load_categories(people_file)

self._items = list(self._load_items(path).values())

def _load_categories(self, path):
    """Build the label categories from an LFW ``people.txt`` file.

    Each useful record is ``<name>\t<count>``; rows that do not split
    into exactly two tab-separated fields (e.g. the leading total-count
    header) are ignored.  A missing file yields an empty label set.
    """
    categories = LabelCategories()
    if osp.isfile(path):
        with open(path, encoding='utf-8') as people_file:
            for record in people_file:
                fields = record.strip().split('\t')
                if len(fields) != 2:
                    continue  # header or malformed row
                categories.add(fields[0])
    return {AnnotationType.label: categories}

def _load_items(self, path):
items = {}
label_categories = self._categories.get(AnnotationType.label)

images_dir = osp.join(self._dataset_dir, self._subset, LfwPath.IMAGES_DIR)
if osp.isdir(images_dir):
images = { osp.splitext(osp.relpath(p, images_dir))[0]: p
images = { osp.splitext(osp.relpath(p, images_dir))[0].replace('\\', '/'): p
for p in find_images(images_dir, recursive=True) }
else:
images = {}

with open(path, encoding='utf-8') as f:
for line in f:
pair = line.strip().split('\t')
if len(pair) == 3:
if pair[0] == '-':
image1 = pair[1]
image2 = pair[2]
else:
image1 = self.get_image_name(pair[0], pair[1])
image2 = self.get_image_name(pair[0], pair[2])
if image1 not in items:
items[image1] = DatasetItem(id=image1, subset=self._subset,
image=images.get(image1),
attributes={'positive_pairs': [], 'negative_pairs': []})
if image2 not in items:
items[image2] = DatasetItem(id=image2, subset=self._subset,
image=images.get(image2),
attributes={'positive_pairs': [], 'negative_pairs': []})
if len(pair) == 1 and pair[0] != '':
annotations = []
image = pair[0]
item_id = pair[0]
objects = item_id.split('/')
if 1 < len(objects):
label_name = objects[0]
label = label_categories.find(label_name)[0]
if label != None:
annotations.append(Label(label))
item_id = item_id[len(label_name) + 1:]
if item_id not in items:
items[item_id] = DatasetItem(id=item_id, subset=self._subset,
image=images.get(image), annotations=annotations)
elif len(pair) == 3:
image1, id1 = self.get_image_name(pair[0], pair[1])
image2, id2 = self.get_image_name(pair[0], pair[2])
label = label_categories.find(pair[0])[0]
if label == None:
raise Exception("Line %s: people file doesn't "
"contain person %s " % (line, pair[0]))
if id1 not in items:
annotations = []
annotations.append(Label(label))
items[id1] = DatasetItem(id=id1, subset=self._subset,
image=images.get(image1), annotations=annotations)
if id2 not in items:
annotations = []
annotations.append(Label(label))
items[id2] = DatasetItem(id=id2, subset=self._subset,
image=images.get(image2), annotations=annotations)

# pairs form a directed graph
items[image1].attributes['positive_pairs'].append(image2)
if not items[id1].annotations[0].attributes.get('positive_pairs'):
items[id1].annotations[0].attributes['positive_pairs'] = []
items[id1].annotations[0].attributes['positive_pairs'].append(image2)

elif len(pair) == 4:
if pair[0] == '-':
image1 = pair[1]
else:
image1 = self.get_image_name(pair[0], pair[1])
image1, id1 = self.get_image_name(pair[0], pair[1])
if pair[2] == '-':
image2 = pair[3]
id2 = pair[3]
else:
image2 = self.get_image_name(pair[2], pair[3])
if image1 not in items:
items[image1] = DatasetItem(id=image1, subset=self._subset,
image=images.get(image1),
attributes={'positive_pairs': [], 'negative_pairs': []})
if image2 not in items:
items[image2] = DatasetItem(id=image2, subset=self._subset,
image=images.get(image2),
attributes={'positive_pairs': [], 'negative_pairs': []})
image2, id2 = self.get_image_name(pair[2], pair[3])
if id1 not in items:
annotations = []
label = label_categories.find(pair[0])[0]
if label == None:
raise Exception("Line %s: people file doesn't "
"contain person %s " % (line, pair[0]))
annotations.append(Label(label))
items[id1] = DatasetItem(id=id1, subset=self._subset,
image=images.get(image1), annotations=annotations)
if id2 not in items:
annotations = []
label = label_categories.find(pair[2])[0]
if label != None:
annotations.append(Label(label))
items[id2] = DatasetItem(id=id2, subset=self._subset,
image=images.get(image2), annotations=annotations)

# pairs form a directed graph
items[image1].attributes['negative_pairs'].append(image2)
if not items[id1].annotations[0].attributes.get('negative_pairs'):
items[id1].annotations[0].attributes['negative_pairs'] = []
items[id1].annotations[0].attributes['negative_pairs'].append(image2)

landmarks_file = osp.join(self._dataset_dir, self._subset,
LfwPath.LANDMARKS_FILE)
Expand All @@ -91,10 +134,15 @@ def _load_items(self, path):
line = line.split('\t')

item_id = osp.splitext(line[0])[0]
objects = item_id.split('/')
if 1 < len(objects):
label_name = objects[0]
label = label_categories.find(label_name)[0]
if label != None:
item_id = item_id[len(label_name) + 1:]
if item_id not in items:
items[item_id] = DatasetItem(id=item_id, subset=self._subset,
image=osp.join(images_dir, line[0]),
attributes={'positive_pairs': [], 'negative_pairs': []})
image=osp.join(images_dir, line[0]))

annotations = items[item_id].annotations
annotations.append(Points([float(p) for p in line[1:]]))
Expand All @@ -103,7 +151,15 @@ def _load_items(self, path):

@staticmethod
def get_image_name(person, image_id):
return '{}/{}_{:04d}'.format(person, person, int(image_id))
image, item_id = '', ''
try:
image_id = int(image_id)
image = '{}/{}_{:04d}'.format(person, person, image_id)
item_id = '{}_{:04d}'.format(person, image_id)
except ValueError:
image = '{}/{}'.format(person, image_id)
item_id = image_id
return image, item_id

class LfwImporter(Importer):
@classmethod
Expand All @@ -115,42 +171,90 @@ class LfwConverter(Converter):

def apply(self):
for subset_name, subset in self._extractor.subsets().items():
label_categories = self._extractor.categories()[AnnotationType.label]
labels = {}
for label in label_categories:
f = label.name
labels[label.name] = 0

positive_pairs = []
negative_pairs = []
neutral_items = []
landmarks = []
included_items = []

for item in subset:
anns = [ann for ann in item.annotations
if ann.type == AnnotationType.label]
label, label_name = None, None
if anns:
label = anns[0]
label_name = label_categories[anns[0].label].name
labels[label_name] += 1

if self._save_images and item.has_image:
self._save_image(item,
subdir=osp.join(subset_name, LfwPath.IMAGES_DIR))

search = LfwPath.PATTERN.search(item.id)
if search:
person1, num1 = search.groups()
num1 = int(num1)
else:
person1 = '-'
subdir=osp.join(subset_name, LfwPath.IMAGES_DIR)
if label_name:
subdir=osp.join(subdir, label_name)
self._save_image(item, subdir=subdir)

if label != None:
person1 = label_name
num1 = item.id
if 'positive_pairs' in item.attributes:
for pair in item.attributes['positive_pairs']:
search = LfwPath.PATTERN.search(pair)
if search:
num2 = search.groups()[1]
num2 = int(num2)
else:
num2 = pair
positive_pairs.append('%s\t%s\t%s' % (person1, num1, num2))
if 'negative_pairs' in item.attributes:
for pair in item.attributes['negative_pairs']:
search = LfwPath.PATTERN.search(pair)
if search:
person2, num2 = search.groups()
num2 = int(num2)
else:
person2 = '-'
num2 = pair
negative_pairs.append('%s\t%s\t%s\t%s' % \
(person1, num1, person2, num2))
if num1.startswith(person1):
num1 = int(num1.replace(person1, '')[1:])
curr_item = person1 + '/' + str(num1)

if 'positive_pairs' in label.attributes:
if curr_item not in included_items:
included_items.append(curr_item)
for pair in label.attributes['positive_pairs']:
search = LfwPath.PATTERN.search(pair)
if search:
num2 = search.groups()[1]
num2 = int(num2)
else:
num2 = pair
if num2.startswith(person1):
num2 = num2.replace(person1, '')[1:]
curr_item = person1 + '/' + str(num2)
if curr_item not in included_items:
included_items.append(curr_item)
positive_pairs.append('%s\t%s\t%s' % (person1, num1, num2))

if 'negative_pairs' in label.attributes:
if curr_item not in included_items:
included_items.append(curr_item)
for pair in label.attributes['negative_pairs']:
search = LfwPath.PATTERN.search(pair)
curr_item = ''
if search:
person2, num2 = search.groups()
num2 = int(num2)
curr_item += person2 + '/'
else:
person2 = '-'
num2 = pair
objects = pair.split('/')
if 1 < len(objects) and objects[0] in labels:
person2 = objects[0]
num2 = pair.replace(person2, '')[1:]
curr_item += person2 + '/'
curr_item += str(num2)
if curr_item not in included_items:
included_items.append(curr_item)
negative_pairs.append('%s\t%s\t%s\t%s' % \
(person1, num1, person2, num2))

if 'positive_pairs' not in label.attributes and \
'negative_pairs' not in label.attributes and \
curr_item not in included_items:
neutral_items.append('%s/%s' % (person1, item.id))
included_items.append(curr_item)

elif item.id not in included_items:
neutral_items.append(item.id)
included_items.append(item.id)

item_landmarks = [p for p in item.annotations
if p.type == AnnotationType.points]
Expand All @@ -163,9 +267,17 @@ def apply(self):
with open(pairs_file, 'w', encoding='utf-8') as f:
f.writelines(['%s\n' % pair for pair in positive_pairs])
f.writelines(['%s\n' % pair for pair in negative_pairs])
f.writelines(['%s\n' % item for item in neutral_items])

if landmarks:
landmarks_file = osp.join(self._save_dir, subset_name,
LfwPath.LANDMARKS_FILE)
with open(landmarks_file, 'w', encoding='utf-8') as f:
f.writelines(['%s\n' % landmark for landmark in landmarks])

if labels:
people_file = osp.join(self._save_dir, subset_name,
LfwPath.PEOPLE_FILE)
with open(people_file, 'w', encoding='utf-8') as f:
f.writelines(['%s\t%d\n' % (label, labels[label])
for label in labels])
2 changes: 1 addition & 1 deletion docs/user_manual.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ List of supported formats:
- Market-1501 (`person re-identification`)
- [Format specification](https://www.aitribune.com/dataset/2018051063)
- [Dataset example](../tests/assets/market1501_dataset)
- LFW (`person re-identification`, `landmarks`)
- LFW (`classification`, `person re-identification`, `landmarks`)
- [Format specification](http://vis-www.cs.umass.edu/lfw/)
- [Dataset example](../tests/assets/lfw_dataset)

Expand Down
3 changes: 3 additions & 0 deletions tests/assets/lfw_dataset/test/people.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
2
name0 2
name1 2
Loading