Source code for uval.utils.hdf5_io

# -*- coding: utf-8 -*-
"""This module provides functions to read and write uval HDF5 files.
For a format specification, have a look at `hdf5_format.py`
"""

import getpass  # type: ignore
import os  # type: ignore
import socket  # type: ignore
from abc import ABC  # type: ignore
from datetime import datetime, timezone  # type: ignore
from typing import Optional, Union  # type: ignore

import h5py  # type: ignore
from h5py import Group as H5Group  # type: ignore

from uval.utils.hdf5_format import *  # type: ignore
from uval.utils.log import logger  # type: ignore


[docs]class UvalHdfFile(ABC):
    def __init__(self, filepath: str, mode: str = "r"):
        self.filepath = filepath
        self.h5 = None
        self.file_mode = mode


[docs]class UvalHdfFileInput(UvalHdfFile):
    """A class to manage, read from a single uval-specific HDF5 file.
    The file on disk will not be held open by default.
    Every operation will open and close the file.
    If a bunch of operations is executed in a row and the file shall be held open, use
    a with-context on the UvalHdfFile object.
    """

    def __init__(self, filepath: str):
        super().__init__(filepath, "r")

        """Keeps track of nested with-statements.
           File will only be closed after exiting the last with-block"""
        self.context_counter = 0

    def __enter__(self):
        """
            Called on `with uval_file_obj:` to ensure the file is open.
            This is also used internally by the other member functions.
        Returns:
            The self object
        """
        self.context_counter += 1
        if self.h5 is None:
            try:
                self.h5 = h5py.File(self.filepath, self.file_mode)
            except OSError as err:
                logger.error(f"OS error '{err}' occurred while reading file: {self.filepath}")
                self.h5 = None
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Ensure the h5 file on disk is closed when we leave the with-context"""
        self.context_counter -= 1

        if self.context_counter == 0 and self.h5 is not None:
            self.h5.close()
            self.h5 = None

[docs]    def is_closed(self):
        """Checks if the file is closed or not initialized"""
        return not self.h5.__bool__()

    def _join_path(self, *args) -> str:
        """
            joins multiple elements of an HDF5 group or dataset path to one string.
        Examples:
            ["root", "some/other", "field"] would map to "root/some/other/field"
        Args:
            *args: The elements to be joined

        Returns:
            The joined input elements as a string
        """

        return "/".join(args)

    def _convert_read(self, item: object):
        """
            Takes a data field read from an HDF5 file and makes sure it's in the right format,
            to pass it to python. Bytes for example will be decoded to unicode strings.
        Args:
            item: the object to be decoded after reading

        Returns:
            The decoded object to pass to python
        """

        if isinstance(item, bytes):
            return item.decode()
        else:
            return item

    def _read_listgroup_to_dict(
        self, listgroup: str, listitem_fields: Union[dict, List[str], str], start_path: str = ""
    ) -> Optional[list]:
        try:
            list_keys = self.h5[self._join_path(start_path, listgroup)].keys()  # type: ignore
        except (TypeError, KeyError):
            return None

        return [
            self._read_to_dict(listitem_fields, start_path=self._join_path(start_path, listgroup, list_key))
            for list_key in list_keys
        ]

    def _read_dictgroup_to_dict(
        self, dictgroup: str, dictitem_fields: Union[dict, List[str], str], start_path: str = ""
    ) -> Optional[dict]:
        try:
            dict_keys = self.h5[self._join_path(start_path, dictgroup)].keys()  # type: ignore
        except (TypeError, KeyError):
            return None

        return {
            dict_key: self._read_to_dict(dictitem_fields, start_path=self._join_path(start_path, dictgroup, dict_key))
            for dict_key in dict_keys
        }

    def _read_to_dict(self, fields: Union[dict, List[str], str], start_path: str = "") -> Optional[dict]:
        """
            Extract (copy) all the data for the given fields from the HDF5 file, and return them in a dictionary.
            This is done recursively.

        Args:
            fields: The fields in HDF5 file to extract data from
            start_path: The start path to start extracting requested data from
        Returns:
            A dictionary of data in the requested filed
        """
        result = {}
        with self:
            if start_path:
                try:
                    # Let's check if the start_path exists within the HDF5 file.
                    # If not, we return None
                    self.h5[start_path]  # type: ignore
                except (TypeError, KeyError):
                    return None

            if isinstance(fields, str):
                # Single string (reading one data set) will be transformed to a list with one element
                fields = [fields]

            if isinstance(fields, list):
                # To not replicate reading code, transform list into dict with None-values
                fields = {k: None for k in fields}

            if isinstance(fields, dict):
                for key, subfields in fields.items():
                    if subfields is None:
                        # In this case, the key is the name of the dataset to read
                        try:
                            result[key] = self._convert_read(
                                self.h5[self._join_path(start_path, key)][()]  # type: ignore
                            )  # type: ignore
                        except KeyError:
                            # Don't set if not available
                            # Don't throw an error either
                            pass
                    else:
                        cur_result = self._read_to_dict(subfields, start_path=self._join_path(start_path, key))
                        if cur_result is not None:
                            result[key] = cur_result

        return result

    def _read_to_dict_outer(self, fields: Union[dict, List[str], str], start_path: str = "") -> Optional[dict]:
        """
            A wrapper around _read_to_dict, which leaves away the first hierarchy level of the returned dict.
            That means, if you request fields like {'root': {'A': None, 'B': None}}, it would directly return the
            inner dict with A and B and it's values, but not the root node.

        Args:
            fields: The requested fields to be read from the HDF5 file
            start_path: The field to start reading from it

        Returns:
            The results read from the HDF5 file in a dictionary
        """
        if isinstance(fields, dict):
            assert len(fields.keys()) == 1

        result = self._read_to_dict(fields, start_path)
        if result:
            return result[next(iter(fields.keys()))]  # type: ignore

        return result

[docs]    def file_meta(self):
        return self._read_to_dict_outer(
            {
                GROUP_FILE_META: {
                    DSET_HOST_NAME: None,
                    DSET_USER_NAME: None,
                    DSET_DT_GENERATED: None,
                    DSET_DET_VERSION: None,
                }
            }
        )

[docs]    def volume_meta(self, include_caches=False):
        """
            The metadata information regarding the ct 3d image.
            Please refer to UVal hdf5 format.
        Args:
            include_caches: Whether to include the cached data or not

        Returns:
            None
        """
        if include_caches:
            return self._read_to_dict_outer(
                {
                    GROUP_VOLUME_META: {
                        DSET_ID: None,
                        DSET_FILE_MD5: None,
                        DSET_FULL_SHAPE: None,
                        DSET_IS_CROPPED: None,
                        DSET_ROI_SHAPE: None,
                        DSET_ROI_START: None,
                        GROUP_CACHE: {DSET_PROJECTION_X, DSET_PROJECTION_Y, DSET_PROJECTION_Z},
                    }
                }
            )
        else:
            return self._read_to_dict_outer(
                {
                    GROUP_VOLUME_META: {
                        DSET_ID: None,
                        DSET_FILE_MD5: None,
                        DSET_FULL_SHAPE: None,
                        DSET_IS_CROPPED: None,
                        DSET_ROI_SHAPE: None,
                        DSET_ROI_START: None,
                    }
                }
            )

[docs]    def volume(self):
        """
            The ct 3d volume stored in hdf5 file
            Please refer to UVal hdf5 format.
        Returns:
            None
        """
        return self._read_to_dict_outer(DSET_VOLUME)

[docs]    def ground_truth(self, include_masks=False, include_caches=False):
        """
            A list of 3d groundtruth data belonging to a ct image
            Please refer to UVal hdf5 format.
        Args:
            include_masks: To include 3d masks while reading or not
            include_caches: To include cached data while reading ot not

        Returns:
            None
        """
        item_descriptor = {
            DSET_CLASS_NAME: None,
            DSET_TARGET_ID: None,
            DSET_ROI_START: None,
            DSET_ROI_SHAPE: None,
        }
        if include_masks:
            item_descriptor.update({DSET_MASK: None})

        if include_caches:
            item_descriptor.update({GROUP_CACHE: {DSET_PROJECTION_X, DSET_PROJECTION_Y, DSET_PROJECTION_Z}})

        return self._read_dictgroup_to_dict(DICTGROUP_GROUNDTRUTH, item_descriptor)

[docs]    def detections(self, include_masks=False, include_caches=False):
        """
            A list of detected areas in the ct image
            Please refer to UVal hdf5 format.
        Args:
            include_masks: To include 3d masks while reading or not
            include_caches: To include cached data while reading ot not

        Returns:
            None
        """
        item_descriptor = {
            DSET_CLASS_NAME: None,
            DSET_ROI_START: None,
            DSET_ROI_SHAPE: None,
            DSET_SCORE: None,
        }
        if include_masks:
            item_descriptor.update({DSET_MASK: None})

        if include_caches:
            item_descriptor.update({GROUP_CACHE: {DSET_PROJECTION_X, DSET_PROJECTION_Y, DSET_PROJECTION_Z}})

        return self._read_listgroup_to_dict(LISTGROUP_DETECTIONS, item_descriptor)

[docs]    def read_all_fields(self) -> dict:
        """
            Reads all the existing fields in h5 file
        Returns:
            All the groups in a dictionary
        """
        return {
            GROUP_FILE_META: self.file_meta(),
            GROUP_VOLUME_META: self.volume_meta(),
            DSET_VOLUME: self.volume(),
            DICTGROUP_GROUNDTRUTH: self.ground_truth(),
            LISTGROUP_DETECTIONS: self.detections(),
        }


[docs]class UvalHdfFileOutput(UvalHdfFile):
    """A class to manage, write to a single uval-specific HDF5 file.
    The file on disk will not be held open by default.
    Every operation will open and close the file.
    If a bunch of operations is executed in a row and the file shall be held open, use
    a with-context on the UvalHdfFile object.
    """

    _volume: Optional[np.ndarray]
    _volume_meta: Optional[dict]
    _groundtruth: Optional[dict]
    _detections: Optional[list]
    _file_meta: Optional[dict]

    # Always keeps all things in memory before writing
    def __init__(self, filepath: str, copy_from_input: Optional[UvalHdfFileInput] = None):
        super().__init__(filepath, "w")
        if copy_from_input:
            self.read_all_from(copy_from_input)
        else:
            self._volume = None
            self._volume_meta = None
            self._groundtruth = None
            self._detections = None
            self._file_meta = None

    def __enter__(self):
        """
            For the output class, we only actually write when done, not at this point.
        Returns:
            Self object of type UvalHdfFileOutput
        """
        if not os.access(os.path.abspath(os.path.dirname(self.filepath)), os.W_OK):
            raise IOError(f"Cannot write to file '{self.filepath}'")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """We are done, we can write the file now, if no exception occurred."""
        if exc_type is None:
            # Do not write if an exception occurred
            self.write()

    def _on_close(self):
        """Before closing, write the changes"""
        self.write()

    def _write_dict_to_group(self, h5group: H5Group, fields: dict):
        """In the HDF5 file, create groups and field at given start_path, containing the data from fields.

        Args:
            h5group: The HDF5 group to write the filed into
            fields: A nested dictionary representing the data to be written
        Returns:
            None
        """

        # The file needs to be writable
        assert self.file_mode == "w"

        for k, v in fields.items():
            if isinstance(v, dict):
                h5group.create_group(k)
                self._write_dict_to_group(h5group[k], v)
            else:
                h5group.create_dataset(k, data=v)

[docs]    def write(self):
        """
            Explicitly call this method to write the file. However, changes will automatically
            be written when the file is closed.
        Returns:
            None
        """

        # Check for meta data
        if self._volume_meta is None:
            raise ValueError(
                "No volume metadata was set before writing. " "Uval files always must contain volume metadata."
            )

        with h5py.File(self.filepath, "w") as f:
            # Write file meta data (This will always be generated here, cannot be set manually)
            group_file_meta = f.create_group(GROUP_FILE_META)
            group_file_meta.create_dataset(DSET_HOST_NAME, data=socket.gethostname())
            group_file_meta.create_dataset(DSET_USER_NAME, data=getpass.getuser())
            group_file_meta.create_dataset(
                DSET_DT_GENERATED, data=str(datetime.now(timezone.utc).astimezone().isoformat())
            )

            # As stated above, file_meta cannot be set manually - with one exception: det_version
            try:
                group_file_meta.create_dataset(DSET_DET_VERSION, data=self._file_meta[DSET_DET_VERSION])
            except (TypeError, KeyError):
                group_file_meta.create_dataset(DSET_DET_VERSION, data="not specified")

            # Write volume meta data
            check_volume_meta_fields(self._volume_meta, self._volume)
            group_volume_meta = f.create_group(GROUP_VOLUME_META)
            self._write_dict_to_group(group_volume_meta, self._volume_meta)

            # Write volume data if available
            if self._volume is not None:
                f.create_dataset(DSET_VOLUME, data=self._volume)

            # Write detections
            if self._detections:
                check_detection_fields(self._detections)
                listgroup_detections = f.create_group(LISTGROUP_DETECTIONS)
                for idx, detection in enumerate(self._detections):
                    group_detection = listgroup_detections.create_group(str(idx))
                    self._write_dict_to_group(group_detection, detection)

            # Write groundtruth:
            if self._groundtruth:
                check_groundtruth_fields(self._groundtruth)
                listgroup_groundtruth = f.create_group(DICTGROUP_GROUNDTRUTH)
                for label_name, gt in self._groundtruth.items():
                    group_gt = listgroup_groundtruth.create_group(str(label_name))
                    self._write_dict_to_group(group_gt, gt)

[docs]    def is_closed(self):
        """Checks if the file is closed or not initialized"""
        return not self.h5.__bool__()

[docs]    def read_all_from(self, input_file: UvalHdfFileInput) -> None:
        """Reading all the meta data included in another HDF5 file
        Args:
            input_file: HDF file to read from

        Returns:
            None
        """

        self._groundtruth = input_file.ground_truth()
        self._detections = input_file.detections()
        self._file_meta = input_file.file_meta()
        self._volume_meta = input_file.volume_meta()
        self._volume = input_file.volume()

    # File_meta
    @property
    def file_meta(self):
        raise IOError("This class should only be used to write, not to read a uval file")

    @file_meta.setter
    def file_meta(self, value: dict):
        self._file_meta = value

    @file_meta.deleter
    def file_meta(self):
        self._file_meta = None

    # Volume
    @property
    def volume(self):
        raise IOError("This class should only be used to write, not to read a uval file")

    @volume.setter
    def volume(self, value: np.ndarray):
        self._volume = value

    @volume.deleter
    def volume(self):
        self._volume = None

    # Volume meta
    @property
    def volume_meta(self):
        raise IOError("This class should only be used to write, not to read a uval file")

    @volume_meta.setter
    def volume_meta(self, value: dict):
        self._volume_meta = value

    @volume_meta.deleter
    def volume_meta(self):
        self._volume_meta = None

    # Detections
    @property
    def detections(self):
        raise IOError("This class should only be used to write, not to read a uval file")

    @detections.setter
    def detections(self, value: list):
        self._detections = value

    @detections.deleter
    def detections(self):
        self._detections = None

    # Groundtruth
    @property
    def groundtruth(self):
        raise IOError("This class should only be used to write, not to read a uval file")

    @groundtruth.setter
    def groundtruth(self, value: list):
        self._groundtruth = value  # type: ignore

    @groundtruth.deleter
    def groundtruth(self):
        self._groundtruth = None