Source code for imantics.dataset

import random
import numpy as np

from .annotation import Annotation
from .category import Category
from .basic import Semantic
from .image import Image


class Dataset(Semantic):
    """
    Collection of images together with their annotations and categories.

    The dataset keeps three dictionaries: ``images``, ``annotations`` and
    ``categories``. This class never writes to them directly; objects are
    registered by calling their own ``index(dataset)`` method (see
    :meth:`add`).
    """

    @classmethod
    def from_xml(cls, xml_folder, name="XML Dataset"):
        """
        Generates a dataset from a folder with XML and corresponding images

        Every file directly inside ``xml_folder`` with the extension ``jpg``,
        ``JPG`` or ``png`` is treated as an image, and an XML file with the
        same stem is read from next to it (``a.jpg`` -> ``a.xml``). Each
        ``object`` entry of the XML ``annotation`` element becomes one
        annotation whose bbox is ``[xmin, ymin, xmax, ymax]``.

        Image ids are the position of the image in the file listing,
        annotation ids count up from 0 across the whole dataset and category
        ids count up from 1 in order of first appearance.

        :param xml_folder: folder containing the images and their XML files
        :type xml_folder: pathlib.Path, str
        :param name: name of the created dataset
        :type name: str
        :returns: the populated dataset
        :rtype: Dataset
        :raise ImportError: Raised if `xmljson` cannot be imported
        :raise FileNotFoundError: Raised if an image has no matching XML file
        """
        from pathlib import Path
        from xml.etree.ElementTree import fromstring

        from xmljson import badgerfish as bf

        extensions = ("jpg", "JPG", "png")
        xml_folder = Path(xml_folder)

        image_paths = []
        for ext in extensions:
            image_paths += xml_folder.glob(f"*.{ext}")
        # On case-insensitive filesystems the same file can match both "jpg"
        # and "JPG"; keep only the first occurrence of every path.
        image_paths = list(dict.fromkeys(image_paths))

        # Parse every XML file exactly once and remember its objects, so the
        # category pass and the annotation pass share the same data.
        parsed = []
        for image_path in image_paths:
            with open(image_path.with_suffix(".xml"), "r") as xml_file:
                xml = bf.data(fromstring(xml_file.read()))

            annotation_node = xml["annotation"]
            objects = annotation_node["object"] if "object" in annotation_node else []
            # A single <object> is parsed as a mapping rather than a list.
            if not isinstance(objects, list):
                objects = [objects]
            parsed.append((image_path, objects))

        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # category ids do not depend on set iteration order between runs.
        category_names = list(dict.fromkeys(
            obj["name"]["$"] for _, objects in parsed for obj in objects
        ))
        xml_categories = {
            cat: Category(cat, id=idx + 1)
            for idx, cat in enumerate(category_names)
        }

        dataset = cls(name)
        id_counter = 0

        for idx, (image_path, objects) in enumerate(parsed):
            image = Image.from_path(str(image_path))
            image.id = idx
            image.dataset = name

            for obj in objects:
                box = obj["bndbox"]
                bbox = [
                    int(box[corner]["$"])
                    for corner in ("xmin", "ymin", "xmax", "ymax")
                ]

                fin_ann = Annotation(id=id_counter, image=image, bbox=bbox,
                                     category=xml_categories[obj["name"]["$"]])
                id_counter += 1

                image.add(fin_ann)
            dataset.add(image)
        return dataset

    @classmethod
    def from_coco(cls, coco_obj, name="COCO Datset"):
        """
        Generates a dataset from a COCO object or python dict

        For a dict, the ``categories``, ``images`` and ``annotations`` lists
        are read (each defaults to an empty list). A
        ``pycocotools.coco.COCO`` object is converted through its ``dataset``
        attribute, which holds the same dict structure.

        :param coco_obj: COCO formatted data
        :type coco_obj: dict, pycocotools.coco.COCO
        :param name: name of the created dataset
        :type name: str
        :returns: the populated dataset, or ``None`` if ``coco_obj`` is
                  neither a dict nor a ``pycocotools.coco.COCO`` object
        :rtype: Dataset, None
        :raise ImportError: Raised if coco_obj is not a dict and
                            `pycocotools` cannot be imported
        :raise KeyError: Raised if an annotation refers to an image or
                         category id that is not part of ``coco_obj``
        """
        if not isinstance(coco_obj, dict):
            # Imported lazily so that pycocotools stays optional for dicts.
            from pycocotools.coco import COCO

            if isinstance(coco_obj, COCO):
                return cls.from_coco(coco_obj.dataset, name=name)
            return None

        dataset = cls(name)

        coco_annotations = coco_obj.get('annotations', [])
        coco_images = coco_obj.get('images', [])
        coco_categories = coco_obj.get('categories', [])

        index_categories = {}
        for coco_category in coco_categories:
            category = Category.from_coco(coco_category)
            index_categories[category.id] = category

        for coco_image in coco_images:
            dataset.add(Image.from_coco(coco_image, dataset=dataset))

        for coco_annotation in coco_annotations:
            image = dataset.images[coco_annotation.get('image_id')]
            category = index_categories[coco_annotation.get('category_id')]
            segmentation = coco_annotation.get('segmentation')
            metadata = coco_annotation.get('metadata', {})

            # color can be stored in the metadata
            color = coco_annotation.get('color', metadata.get('color'))

            # NOTE(review): the COCO 'id' and 'bbox' fields are not forwarded
            # to the Annotation; only the segmentation is used.
            annotation = Annotation(image, category, polygons=segmentation,
                                    color=color, metadata=metadata)
            dataset.add(annotation)

        return dataset

    def __init__(self, name, images=None, id=0, metadata=None):
        """
        Creates a dataset and indexes the given images into it

        :param name: name of the dataset
        :type name: str
        :param images: images to index into the new dataset (default: none)
        :type images: list, tuple
        :param id: id passed on to the base class
        :param metadata: metadata passed on to the base class; a new empty
                         dict is used when omitted
        :type metadata: dict
        """
        self.annotations = {}
        self.categories = {}
        self.images = {}
        self.name = name
        self._max_ann_id = None
        self._max_img_id = None

        # ``None`` replaces the former mutable defaults so that instances
        # never share one list/dict object.
        if images is not None:
            for image in images:
                image.index(self)

        super().__init__(id, {} if metadata is None else metadata)

    def add(self, image):
        """
        Adds image(s) to the current dataset

        * list / tuple: ``index(self)`` is called on every element
        * :class:`Annotation`: the annotation is indexed and then added to
          the dataset image that has the id of ``annotation.image``
        * str: treated as a path and loaded with ``Image.from_path``
        * anything else: expected to provide ``index(dataset)``

        :param image: list, object or path to add to dataset
        :type image: :class:`Image` :class:`Annotation`, list, tuple, path
        """

        if isinstance(image, (list, tuple)):
            for img in image:
                img.index(self)
            return

        if isinstance(image, Annotation):
            annotation = image
            image = self.images.get(annotation.image.id)

            annotation.index(self)
            image.add(annotation)
            return

        if isinstance(image, str):
            image = Image.from_path(image)

        image.index(self)

    def iter_images(self):
        """
        Generator to iterate over all images
        """
        yield from self.images.values()

    def iter_annotations(self):
        """
        Generator to iterate over all annotations

        Only entries stored under an integer key are yielded.
        """
        for key, annotation in self.annotations.items():
            if isinstance(key, int):
                yield annotation

    def iter_categories(self):
        """
        Generator to iterate over all categories
        """
        yield from self.categories.values()

    def split(self, ratios, random=False):
        """
        Splits dataset images into mutiple sub datasets of the given ratios

        If a tuple of (1, 1, 2) was passed in the result would return 3 dataset
        objects of 25%, 25% and 50% of the images.

        .. code-block:: python

            percents = ratios / ratios.sum()

        The new datasets are named ``split0``, ``split1``, ... and the images
        are indexed into them; they are not copied.

        :param ratios: ratios to split dataset into
        :type ratios: tuple, list
        :param random: randomize the images before spliting
        :type random: bool
        :returns: list of datasets with length of the number of ratios
        :rtype: list
        :raise ValueError: Raised if there are at least as many ratios as
                           images, or if the ratios are negative or do not
                           sum to a positive value
        """
        # The ``random`` flag shadows the ``random`` module inside this
        # method, so the module is imported under another name.
        import random as random_module

        if len(ratios) >= len(self.images):
            raise ValueError("Too many values in ratio array compared to dataset size")

        ratios = np.asarray(ratios, dtype=float)
        total = ratios.sum()

        # ``not total > 0`` also rejects NaN.
        if (ratios < 0).any() or not total > 0:
            raise ValueError("Ratios must be non-negative and sum to a positive value")

        percents = ratios / total

        # Compare with a tolerance: e.g. six times 1/6 does not add up to
        # exactly 1.0 in floating point.
        if not np.isclose(percents.sum(), 1):
            raise ValueError("Percents don't add up to 100%")

        keys = list(self.images.keys())
        if random:
            keys = random_module.sample(keys, len(keys))

        # Cumulative end index of every split except the last one, which
        # simply takes what is left.
        boundaries = np.cumsum(percents)[:-1] * len(keys)
        boundaries = boundaries.round().astype(int).tolist()

        datasets = []
        start = 0
        for idx, stop in enumerate(boundaries + [len(keys)]):
            # Slice the plain key list so the keys keep their original type.
            tmp_images = [self.images[key] for key in keys[start:stop]]
            datasets.append(Dataset("split" + str(idx), images=tmp_images))
            start = stop

        return datasets

    def coco(self):
        """
        Generates a COCO formatted dictionary of the dataset

        :returns: dict with an empty ``info`` dict and the ``categories``,
                  ``images`` and ``annotations`` lists, each built by calling
                  ``coco(include=False)`` on the respective objects
        :rtype: dict
        """
        return {
            'info': {},
            'categories': [c.coco(include=False) for c in self.iter_categories()],
            'images': [i.coco(include=False) for i in self.iter_images()],
            'annotations': [a.coco(include=False) for a in self.iter_annotations()],
        }

    def yolo(self):
        """
        Generates the YOLO representation of every image

        :returns: dict mapping each image's ``path`` to the result of its
                  ``yolo()`` method
        :rtype: dict
        """
        return {image.path: image.yolo() for image in self.iter_images()}


# Public API of this module: only the Dataset class is exported.
__all__ = ["Dataset"]