Source code for uval.utils.hdf5_io

# -*- coding: utf-8 -*-
"""This module provides functions to read and write uval HDF5 files.
For a format specification, have a look at `hdf5_format.py`
"""

import getpass  # type: ignore
import os  # type: ignore
import socket  # type: ignore
from abc import ABC  # type: ignore
from datetime import datetime, timezone  # type: ignore
from typing import Optional, Union  # type: ignore

import h5py  # type: ignore
from h5py import Group as H5Group  # type: ignore

from uval.utils.hdf5_format import *  # type: ignore
from uval.utils.log import logger  # type: ignore


class UvalHdfFile(ABC):
    """Common state shared by the uval HDF5 reader and writer classes.

    Attributes:
        filepath: Location of the HDF5 file on disk.
        file_mode: Mode string stored for later use when the file is opened.
        h5: The open file handle, or None while no file is open.
    """

    def __init__(self, filepath: str, mode: str = "r") -> None:
        """Store the file location and mode; no file is opened here.

        Args:
            filepath: Path of the HDF5 file this object refers to.
            mode: File mode to remember for later opening (defaults to "r").
        """
        # The handle starts out empty; nothing in this constructor touches the disk.
        self.h5 = None
        self.file_mode = mode
        self.filepath = filepath
class UvalHdfFileInput(UvalHdfFile):
    """A class to manage and read from a single uval-specific HDF5 file.

    The file on disk is not held open by default: every read operation opens
    the file and closes it again. If several operations are executed in a row
    and the file shall stay open in between, use a with-context on the object.
    """

    def __init__(self, filepath: str):
        """Remember the file path; the file itself is opened lazily.

        Args:
            filepath: Path of the HDF5 file to read from.
        """
        super().__init__(filepath, "r")
        # Keeps track of nested with-statements. The file is only closed
        # after exiting the outermost with-block.
        self.context_counter = 0

    def __enter__(self):
        """Ensure the file is open; called on `with uval_file_obj:`.

        This is also used internally by the reading member functions. If the
        file cannot be opened, the OSError is logged and `self.h5` stays None.

        Returns:
            The self object
        """
        self.context_counter += 1
        if self.h5 is None:
            try:
                self.h5 = h5py.File(self.filepath, self.file_mode)
            except OSError as err:
                logger.error(f"OS error '{err}' occurred while reading file: {self.filepath}")
                self.h5 = None
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the h5 file on disk when the outermost with-context is left."""
        self.context_counter -= 1
        if self.context_counter == 0 and self.h5 is not None:
            self.h5.close()
            self.h5 = None

    def is_closed(self):
        """Checks if the file is closed or not initialized.

        Returns:
            True if there is no file handle or the handle evaluates as false.
        """
        return not self.h5

    def _join_path(self, *args) -> str:
        """Joins multiple elements of an HDF5 group or dataset path to one string.

        Examples:
            ["root", "some/other", "field"] would map to "root/some/other/field"

        Args:
            *args: The elements to be joined

        Returns:
            The joined input elements as a string
        """
        return "/".join(args)

    def _convert_read(self, item: object):
        """Convert a value read from the HDF5 file into its python representation.

        Bytes are decoded to unicode strings; everything else is passed through.

        Args:
            item: the object to be decoded after reading

        Returns:
            The decoded object to pass to python
        """
        if isinstance(item, bytes):
            return item.decode()
        return item

    def _child_keys(self, group_path: str) -> Optional[list]:
        """Return the child names of the group at `group_path`, or None if unavailable.

        Must be called while the file is open. The names are copied into a
        list so that they no longer depend on the open file handle.
        """
        try:
            return list(self.h5[group_path].keys())  # type: ignore
        except (TypeError, KeyError):
            # TypeError: the file could not be opened (self.h5 is None).
            # KeyError: the group does not exist in this file.
            return None

    def _read_listgroup_to_dict(
        self, listgroup: str, listitem_fields: Union[dict, List[str], str], start_path: str = ""
    ) -> Optional[list]:
        """Read every child of a list-like group into a list of dictionaries.

        Args:
            listgroup: Name of the group whose children are the list items
            listitem_fields: The fields to read from each list item
            start_path: The path below which `listgroup` is located

        Returns:
            One entry per child of the group, or None if the group is missing
            or the file could not be opened.
        """
        group_path = self._join_path(start_path, listgroup)
        # Open the file here as well, so the method also works outside of a
        # user-provided with-block.
        with self:
            list_keys = self._child_keys(group_path)
            if list_keys is None:
                return None
            # If every child is named by a decimal index (as the writer does),
            # sort numerically so that "10" does not end up in front of "2"
            # when children are returned ordered by name.
            if all(key.isdigit() for key in list_keys):
                list_keys.sort(key=int)
            return [
                self._read_to_dict(listitem_fields, start_path=self._join_path(group_path, list_key))
                for list_key in list_keys
            ]

    def _read_dictgroup_to_dict(
        self, dictgroup: str, dictitem_fields: Union[dict, List[str], str], start_path: str = ""
    ) -> Optional[dict]:
        """Read every child of a dict-like group into a dictionary keyed by child name.

        Args:
            dictgroup: Name of the group whose children are the dict items
            dictitem_fields: The fields to read from each dict item
            start_path: The path below which `dictgroup` is located

        Returns:
            A mapping from child name to its data, or None if the group is
            missing or the file could not be opened.
        """
        group_path = self._join_path(start_path, dictgroup)
        # Open the file here as well, so the method also works outside of a
        # user-provided with-block.
        with self:
            dict_keys = self._child_keys(group_path)
            if dict_keys is None:
                return None
            return {
                dict_key: self._read_to_dict(dictitem_fields, start_path=self._join_path(group_path, dict_key))
                for dict_key in dict_keys
            }

    def _read_to_dict(self, fields: Union[dict, List[str], set, str], start_path: str = "") -> Optional[dict]:
        """Extract (copy) the data for the given fields from the HDF5 file.

        This is done recursively: a dict value of None means "read the dataset
        named by the key", any other value describes the fields of a sub-group.

        Args:
            fields: The fields in the HDF5 file to extract data from. A single
                name, a list/tuple/set of names, or a nested dict.
            start_path: The start path to start extracting requested data from

        Returns:
            A dictionary of data for the requested fields (fields that do not
            exist are left out), or None if `start_path` does not exist.
        """
        result = {}
        with self:
            if start_path:
                try:
                    # Let's check if the start_path exists within the HDF5 file.
                    # If not, we return None
                    self.h5[start_path]  # type: ignore
                except (TypeError, KeyError):
                    return None

            if isinstance(fields, str):
                # Single string (reading one data set) becomes a one-element list
                fields = [fields]
            elif isinstance(fields, (set, frozenset)):
                # Sets carry no order; sort to keep the result order deterministic
                fields = sorted(fields)

            if isinstance(fields, (list, tuple)):
                # To not replicate reading code, transform into dict with None-values
                fields = dict.fromkeys(fields)

            if isinstance(fields, dict):
                for key, subfields in fields.items():
                    if subfields is None:
                        # In this case, the key is the name of the dataset to read
                        try:
                            result[key] = self._convert_read(
                                self.h5[self._join_path(start_path, key)][()]  # type: ignore
                            )
                        except KeyError:
                            # Missing datasets are deliberately skipped without an error
                            pass
                    else:
                        cur_result = self._read_to_dict(subfields, start_path=self._join_path(start_path, key))
                        if cur_result is not None:
                            result[key] = cur_result
        return result

    def _read_to_dict_outer(self, fields: Union[dict, List[str], str], start_path: str = ""):
        """A wrapper around _read_to_dict which leaves away the first hierarchy level.

        That means, if you request fields like {'root': {'A': None, 'B': None}},
        it directly returns the inner dict with A and B and their values, but
        not the root node. For a single dataset name, the dataset's value is
        returned directly.

        Args:
            fields: The requested fields to be read from the HDF5 file; must
                describe exactly one top-level entry.
            start_path: The path to start reading from

        Returns:
            The content of the single top-level entry. If nothing could be
            read, the empty (or None) result of _read_to_dict is returned.
        """
        if isinstance(fields, str):
            outer_key = fields
        else:
            # Exactly one top-level entry is required to be able to strip it
            assert len(fields) == 1
            outer_key = next(iter(fields))
        result = self._read_to_dict(fields, start_path)
        if result:
            return result[outer_key]
        return result

    def file_meta(self):
        """The metadata describing how the file itself was generated.

        Returns:
            A dictionary with the available file metadata fields; empty if the
            group could not be read.
        """
        return self._read_to_dict_outer(
            {
                GROUP_FILE_META: {
                    DSET_HOST_NAME: None,
                    DSET_USER_NAME: None,
                    DSET_DT_GENERATED: None,
                    DSET_DET_VERSION: None,
                }
            }
        )

    def volume_meta(self, include_caches=False):
        """The metadata information regarding the ct 3d image.

        Please refer to UVal hdf5 format.

        Args:
            include_caches: Whether to include the cached data or not

        Returns:
            A dictionary with the available volume metadata fields; empty if
            the group could not be read.
        """
        descriptor = {
            DSET_ID: None,
            DSET_FILE_MD5: None,
            DSET_FULL_SHAPE: None,
            DSET_IS_CROPPED: None,
            DSET_ROI_SHAPE: None,
            DSET_ROI_START: None,
        }
        if include_caches:
            descriptor[GROUP_CACHE] = [DSET_PROJECTION_X, DSET_PROJECTION_Y, DSET_PROJECTION_Z]
        return self._read_to_dict_outer({GROUP_VOLUME_META: descriptor})

    def volume(self):
        """The ct 3d volume stored in the hdf5 file.

        Please refer to UVal hdf5 format.

        Returns:
            The content of the volume dataset, or an empty dict if the file
            contains no volume.
        """
        return self._read_to_dict_outer(DSET_VOLUME)

    def ground_truth(self, include_masks=False, include_caches=False):
        """The 3d groundtruth data belonging to a ct image.

        Please refer to UVal hdf5 format.

        Args:
            include_masks: To include 3d masks while reading or not
            include_caches: To include cached data while reading or not

        Returns:
            A dictionary mapping each groundtruth entry name to its data, or
            None if the file contains no groundtruth group.
        """
        item_descriptor = {
            DSET_CLASS_NAME: None,
            DSET_TARGET_ID: None,
            DSET_ROI_START: None,
            DSET_ROI_SHAPE: None,
        }
        if include_masks:
            item_descriptor[DSET_MASK] = None
        if include_caches:
            item_descriptor[GROUP_CACHE] = [DSET_PROJECTION_X, DSET_PROJECTION_Y, DSET_PROJECTION_Z]
        return self._read_dictgroup_to_dict(DICTGROUP_GROUNDTRUTH, item_descriptor)

    def detections(self, include_masks=False, include_caches=False):
        """A list of detected areas in the ct image.

        Please refer to UVal hdf5 format.

        Args:
            include_masks: To include 3d masks while reading or not
            include_caches: To include cached data while reading or not

        Returns:
            A list with one dictionary per detection, or None if the file
            contains no detections group.
        """
        item_descriptor = {
            DSET_CLASS_NAME: None,
            DSET_ROI_START: None,
            DSET_ROI_SHAPE: None,
            DSET_SCORE: None,
        }
        if include_masks:
            item_descriptor[DSET_MASK] = None
        if include_caches:
            item_descriptor[GROUP_CACHE] = [DSET_PROJECTION_X, DSET_PROJECTION_Y, DSET_PROJECTION_Z]
        return self._read_listgroup_to_dict(LISTGROUP_DETECTIONS, item_descriptor)

    def read_all_fields(self) -> dict:
        """Reads all the existing fields in the h5 file (without masks and caches).

        Returns:
            All the groups in a dictionary
        """
        # Keep the file open across the individual reads instead of
        # re-opening it for each of them.
        with self:
            return {
                GROUP_FILE_META: self.file_meta(),
                GROUP_VOLUME_META: self.volume_meta(),
                DSET_VOLUME: self.volume(),
                DICTGROUP_GROUNDTRUTH: self.ground_truth(),
                LISTGROUP_DETECTIONS: self.detections(),
            }
class UvalHdfFileOutput(UvalHdfFile):
    """A class to manage and write to a single uval-specific HDF5 file.

    All data is kept in memory until `write()` is called, either explicitly
    or automatically when a with-context on the object is left without an
    exception. The file on disk is only opened for the duration of `write()`.
    """

    _volume: Optional[np.ndarray]
    _volume_meta: Optional[dict]
    _groundtruth: Optional[dict]
    _detections: Optional[list]
    _file_meta: Optional[dict]

    # Always keeps all things in memory before writing
    def __init__(self, filepath: str, copy_from_input: Optional[UvalHdfFileInput] = None):
        """Create an output object, optionally pre-filled from an input file.

        Args:
            filepath: Path of the HDF5 file to be written.
            copy_from_input: If given, all fields are initialised with the
                data read from this input file.
        """
        super().__init__(filepath, "w")
        self._volume = None
        self._volume_meta = None
        self._groundtruth = None
        self._detections = None
        self._file_meta = None
        if copy_from_input is not None:
            self.read_all_from(copy_from_input)

    def __enter__(self):
        """Check that the target directory is writable.

        For the output class, we only actually write when done, not at this point.

        Returns:
            Self object of type UvalHdfFileOutput

        Raises:
            IOError: If the directory containing `filepath` is not writable.
        """
        target_dir = os.path.abspath(os.path.dirname(self.filepath))
        if not os.access(target_dir, os.W_OK):
            raise IOError(f"Cannot write to file '{self.filepath}'")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """We are done, we can write the file now, if no exception occurred."""
        if exc_type is None:
            # Do not write if an exception occurred
            self.write()

    def _on_close(self):
        """Before closing, write the changes"""
        self.write()

    def _write_dict_to_group(self, h5group: H5Group, fields: dict):
        """Create groups and datasets below `h5group` containing the data from `fields`.

        Nested dictionaries become sub-groups, every other value becomes a dataset.

        Args:
            h5group: The HDF5 group to write the fields into
            fields: A nested dictionary representing the data to be written

        Returns:
            None
        """
        # The file needs to be writable
        assert self.file_mode == "w"
        for name, value in fields.items():
            if isinstance(value, dict):
                self._write_dict_to_group(h5group.create_group(name), value)
            else:
                h5group.create_dataset(name, data=value)

    def write(self):
        """Explicitly call this method to write the file.

        Changes are also written automatically when a with-context on this
        object is left without an exception.

        Returns:
            None

        Raises:
            ValueError: If no volume metadata was set before writing.
        """
        # Check for meta data
        if self._volume_meta is None:
            raise ValueError(
                "No volume metadata was set before writing. " "Uval files always must contain volume metadata."
            )

        # Validate everything BEFORE the file is opened: opening with "w"
        # truncates an existing file, so a validation failure afterwards would
        # leave a clobbered, partially written file behind.
        check_volume_meta_fields(self._volume_meta, self._volume)
        if self._detections:
            check_detection_fields(self._detections)
        if self._groundtruth:
            check_groundtruth_fields(self._groundtruth)

        with h5py.File(self.filepath, "w") as f:
            # Write file meta data (This will always be generated here, cannot be set manually)
            group_file_meta = f.create_group(GROUP_FILE_META)
            group_file_meta.create_dataset(DSET_HOST_NAME, data=socket.gethostname())
            group_file_meta.create_dataset(DSET_USER_NAME, data=getpass.getuser())
            group_file_meta.create_dataset(
                DSET_DT_GENERATED, data=str(datetime.now(timezone.utc).astimezone().isoformat())
            )

            # As stated above, file_meta cannot be set manually - with one exception: det_version
            try:
                group_file_meta.create_dataset(DSET_DET_VERSION, data=self._file_meta[DSET_DET_VERSION])
            except (TypeError, KeyError):
                # No file meta set (None) or no detector version in it
                group_file_meta.create_dataset(DSET_DET_VERSION, data="not specified")

            # Write volume meta data
            group_volume_meta = f.create_group(GROUP_VOLUME_META)
            self._write_dict_to_group(group_volume_meta, self._volume_meta)

            # Write volume data if available
            if self._volume is not None:
                f.create_dataset(DSET_VOLUME, data=self._volume)

            # Write detections, one sub-group per detection named by its index
            if self._detections:
                listgroup_detections = f.create_group(LISTGROUP_DETECTIONS)
                for idx, detection in enumerate(self._detections):
                    group_detection = listgroup_detections.create_group(str(idx))
                    self._write_dict_to_group(group_detection, detection)

            # Write groundtruth, one sub-group per entry named by its key
            if self._groundtruth:
                dictgroup_groundtruth = f.create_group(DICTGROUP_GROUNDTRUTH)
                for label_name, gt in self._groundtruth.items():
                    group_gt = dictgroup_groundtruth.create_group(str(label_name))
                    self._write_dict_to_group(group_gt, gt)

    def is_closed(self):
        """Checks if the file is closed or not initialized.

        Returns:
            True if there is no file handle or the handle evaluates as false.
        """
        return not self.h5

    def read_all_from(self, input_file: UvalHdfFileInput) -> None:
        """Take over all data that can be read from another HDF5 file.

        Masks and caches are not requested from the input file.

        Args:
            input_file: HDF file to read from

        Returns:
            None
        """
        # Hold the input file open for all reads instead of re-opening it
        # for every single field.
        with input_file:
            self._groundtruth = input_file.ground_truth()
            self._detections = input_file.detections()
            self._file_meta = input_file.file_meta()
            self._volume_meta = input_file.volume_meta()
            self._volume = input_file.volume()

    # File_meta
    @property
    def file_meta(self):
        """Write-only field: set or delete the file metadata; reading raises IOError."""
        raise IOError("This class should only be used to write, not to read a uval file")

    @file_meta.setter
    def file_meta(self, value: dict):
        self._file_meta = value

    @file_meta.deleter
    def file_meta(self):
        self._file_meta = None

    # Volume
    @property
    def volume(self):
        """Write-only field: set or delete the volume data; reading raises IOError."""
        raise IOError("This class should only be used to write, not to read a uval file")

    @volume.setter
    def volume(self, value: np.ndarray):
        self._volume = value

    @volume.deleter
    def volume(self):
        self._volume = None

    # Volume meta
    @property
    def volume_meta(self):
        """Write-only field: set or delete the volume metadata; reading raises IOError."""
        raise IOError("This class should only be used to write, not to read a uval file")

    @volume_meta.setter
    def volume_meta(self, value: dict):
        self._volume_meta = value

    @volume_meta.deleter
    def volume_meta(self):
        self._volume_meta = None

    # Detections
    @property
    def detections(self):
        """Write-only field: set or delete the list of detections; reading raises IOError."""
        raise IOError("This class should only be used to write, not to read a uval file")

    @detections.setter
    def detections(self, value: list):
        self._detections = value

    @detections.deleter
    def detections(self):
        self._detections = None

    # Groundtruth
    @property
    def groundtruth(self):
        """Write-only field: set or delete the groundtruth mapping; reading raises IOError."""
        raise IOError("This class should only be used to write, not to read a uval file")

    @groundtruth.setter
    def groundtruth(self, value: dict):
        # Stored as a mapping: write() iterates it with .items()
        self._groundtruth = value

    @groundtruth.deleter
    def groundtruth(self):
        self._groundtruth = None