# Source code for uval.stages.dataset_specification

# -*- coding: utf-8 -*-
"""A StageData object acts like a data container that manages results generated by stages and read by subsequent
stages.
"""

from os import path

from uval.stages.stage_data import DatasetSpecificationData
from uval.utils.label_naming import label_short_to_long
from uval.utils.log import logger
from uval.utils.yaml_io import load_yaml_data


# @uval_stage
def load_datasplit(file_path: str, subsets=None, output=None) -> DatasetSpecificationData:
    """Load a data split file in YAML format and collect its entries per subset.

    The YAML file is read as a top-level ``split`` mapping of the form
    ``subset name -> class name -> volume id -> label id(s)``. Every
    (volume, label) pair of the selected subsets becomes one row of the result.
    A volume whose label entry is empty is stored under its volume id alone.

    Args:
        file_path (str): path to the yaml file.
        subsets ([str], optional): which subsets of the yaml file are to be loaded;
            if set to None, all subsets are used. Defaults to None.
        output: optional object providing ``PATH`` and ``DATASET_OVERVIEW_FILE``;
            when given, an overview of the loaded data is written to that location
            as ``html`` or ``csv``, chosen by the file extension.

    Returns:
        DatasetSpecificationData: the loaded dataset specification; it is left
        empty when the file contains no ``split`` data.

    Raises:
        ValueError: if a requested subset is not present in the YAML file.
        NotImplementedError: if the overview file extension is neither ``html``
            nor ``csv``.
    """

    # Set up result data store
    stage_results = DatasetSpecificationData()
    logger.debug("Reading YAML file started")
    data_split_dict = load_yaml_data(file_path)
    logger.debug("Reading YAML file finished.")

    split = data_split_dict.get("split") if data_split_dict else None
    if not split:
        logger.error("No data could be read from data split file '{}'".format(file_path))
        return stage_results

    available_subsets = list(split)
    if subsets is None:
        subsets = available_subsets
    else:
        for subset in subsets:
            if subset not in available_subsets:
                raise ValueError(f"The requested split subset {subset} not found in the YAML file.")
    logger.debug(subsets)

    # One row per (volume, label) pair, keyed by the long label name
    # (or by the volume id when the volume carries no label).
    rows = {}

    for subset_name, subset_data in split.items():
        if subset_name not in subsets:
            continue
        # A section left empty in the YAML file is parsed as None; treat it as
        # "no entries" instead of failing on None.items() / len(None).
        for class_name, image_list in (subset_data or {}).items():
            image_list = image_list or {}
            is_negative = class_name == "negative"
            logger.debug("Reading list of '{}' images for split {}/{}".format(len(image_list), subset_name, class_name))
            for volume_id, labels in image_list.items():
                labels_list = labels if isinstance(labels, list) else [labels]
                for label_id in labels_list:
                    # Only the key construction is guarded; building the row
                    # itself cannot raise KeyError.
                    try:
                        item_key = label_short_to_long(volume_id, label_id) if label_id else volume_id
                    except KeyError:
                        logger.warning(f"Unable to find proper keys in the YAML file: '{file_path}'")
                        continue
                    rows[item_key] = {
                        "volume_id": volume_id,
                        "label_id": label_id,
                        "is_negative": is_negative,
                        "subset": subset_name,
                        "class_name": class_name,
                    }

    stage_results.from_dict_as_rows(rows)
    if output:
        extension = output.DATASET_OVERVIEW_FILE.split(".")[-1]
        if extension not in ("html", "csv"):
            raise NotImplementedError("Other output formats rather than 'html' and 'csv' are not implemented yet!")
        overview_path = path.join(output.PATH, output.DATASET_OVERVIEW_FILE)
        if extension == "html":
            stage_results.to_html(overview_path)
        else:
            stage_results.to_csv(overview_path)
        logger.info(f"dataset overview saved to {overview_path}.")

    return stage_results