# Source code for uval.stages.combine_files

# -*- coding: utf-8 -*-
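"""Stages that combine ground truth, detection and SOC HDF5 file listings into a supported dataset specification.

The results are returned as StageData objects, which act like data containers that manage results generated by
stages and read by subsequent stages.
"""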

from uval.stages.stage import uval_stage  # type: ignore
from uval.stages.stage_data import Hdf5FilesData, SupportedDatasetSpecificationData  # type: ignore
from uval.utils.log import logger  # type: ignore


@uval_stage
def create_supported_dataset_from_intersection(gt_files: Hdf5FilesData, det_files: Hdf5FilesData):
    """Join ground truth and detection files on their shared volume ids.

    One row is written to the result table for every ground truth label of
    every volume that is present in both ``gt_files`` and ``det_files``.

    Args:
        gt_files: Ground truth files. Its ``table`` must expose a
            ``"volume_id"`` column and support ``.loc[volume_id]`` row lookup;
            each row's ``["hdf5_meta"]["groundtruth"]`` must be a mapping from
            label id to an entry holding ``"target_id"`` and ``"class_name"``.
        det_files: Detection files. Its ``table`` must expose a
            ``"volume_id"`` column and support ``.loc[volume_id]`` row lookup.

    Returns:
        A ``SupportedDatasetSpecificationData`` whose table holds one row per
        ground truth label. It is left untouched (as freshly constructed) when
        the two inputs share no volume id; an error is logged in that case.
    """
    # Let GT be the set of ground truth volume ids and DET the set of
    # detection volume ids. Only GT & DET is used to build the result; the two
    # difference sets are computed purely for the log statistics below.
    gt_ids = set(gt_files.table["volume_id"])
    det_ids = set(det_files.table["volume_id"])

    set_intersection = gt_ids & det_ids
    set_gt_minus_det = gt_ids - det_ids
    set_det_minus_gt = det_ids - gt_ids

    if not set_intersection:
        logger.error("Did not find any matching files between groundtruth and detections")
        return SupportedDatasetSpecificationData()  # Nothing to join

    # This is important feedback for the user to see if this matches their expectations
    logger.info(
        f"Dataset intersection statistics:\n"
        f" GT & DET = {len(set_intersection)}\n"
        f" GT - DET = {len(set_gt_minus_det)}\n"
        f" DET - GT = {len(set_det_minus_gt)}\n"
    )

    result = SupportedDatasetSpecificationData()

    # Rows are numbered consecutively across all volumes. Volumes are visited
    # in sorted order so the row numbering is deterministic.
    idx = 0
    for vol_id in sorted(set_intersection):
        cur_gt_file = gt_files.table.loc[vol_id]
        cur_det_file = det_files.table.loc[vol_id]

        # One result row per ground truth label of this volume
        for gt_label, gt in cur_gt_file["hdf5_meta"]["groundtruth"].items():
            result.table.loc[idx] = {
                "label_id": gt_label,
                "volume_id": vol_id,
                "is_negative": False,  # For now, no SOC data used here
                "target_id": gt["target_id"],
                "subset": None,  # Since this is not based on a data split, we don't have a subset
                "class_name": gt["class_name"],
                "hdf5_detection": cur_det_file,
                "hdf5_groundtruth": cur_gt_file,
            }
            idx += 1

    return result
@uval_stage
def support_dataset_with_file_paths(gt_files: Hdf5FilesData, det_files: Hdf5FilesData, soc_files: Hdf5FilesData):
    """Combine ground truth, detection and SOC files into a supported dataset.

    The result table receives one positive row (``is_negative=False``) for
    every volume present in both ``gt_files`` and ``det_files``, followed by
    one negative row (``is_negative=True``, no ground truth) for every volume
    in ``soc_files``.

    Args:
        gt_files: Ground truth files. Its ``table`` must expose a
            ``"volume_id"`` column and support ``.loc[volume_id]`` row lookup;
            each row's ``["hdf5_meta"]["groundtruth"]`` is stored in the result.
        det_files: Detection files, with the same table layout;
            ``["hdf5_meta"]["detections"]`` of each row is stored in the result.
        soc_files: SOC files, with the same table layout;
            ``["hdf5_meta"]["detections"]`` of each row is stored in the result.

    Returns:
        A ``SupportedDatasetSpecificationData`` holding the combined rows. It
        is left untouched (as freshly constructed) when none of the three
        inputs contains any volume id; an error is logged in that case.
    """
    # Let GT, DET and SOC be the sets of ground truth, detection and SOC
    # volume ids. Rows are built from GT & DET and from SOC; the difference
    # sets and the union only feed the log statistics and the empty check.
    gt_ids = set(gt_files.table["volume_id"])
    det_ids = set(det_files.table["volume_id"])
    soc_ids = set(soc_files.table["volume_id"])

    set_union = gt_ids | det_ids | soc_ids
    set_intersection = gt_ids & det_ids
    set_gt_minus_det = gt_ids - det_ids
    set_det_minus_gt = det_ids - gt_ids

    # NOTE(review): this guard tests the union, so it only fires when no file
    # of any kind was found, although the message speaks of GT/DET matches.
    # With a non-empty union but empty GT & DET and empty SOC, an empty result
    # is returned further below without any error being logged -- confirm
    # that this is intended.
    if not set_union:
        logger.error("Did not find any matching files between groundtruth and detections")
        return SupportedDatasetSpecificationData()  # Nothing to combine

    # This is important feedback for the user to see if this matches their expectations
    logger.info(
        f"Dataset union statistics:\n"
        f" GT & DET = {len(set_intersection)}\n"
        f" GT - DET = {len(set_gt_minus_det)}\n"
        f" DET - GT = {len(set_det_minus_gt)}\n"
        f" DET + GT + SOC = {len(set_union)}\n"
    )

    result = SupportedDatasetSpecificationData()

    # Rows are numbered consecutively: positives first, then negatives. Both
    # groups are visited in sorted order so the numbering is deterministic.
    idx = 0

    # One positive row per volume that has both ground truth and detections
    for vol_id in sorted(set_intersection):
        result.table.loc[idx] = {
            "volume_id": vol_id,
            "is_negative": False,
            "hdf5_detection": det_files.table.loc[vol_id]["hdf5_meta"]["detections"],
            "hdf5_groundtruth": gt_files.table.loc[vol_id]["hdf5_meta"]["groundtruth"],
        }
        idx += 1

    # One negative row per SOC volume; these carry no ground truth.
    # NOTE(review): a volume id that is in SOC and also in GT & DET ends up in
    # the table twice (once positive, once negative) -- confirm this is wanted.
    for vol_id in sorted(soc_ids):
        result.table.loc[idx] = {
            "volume_id": vol_id,
            "is_negative": True,
            "hdf5_detection": soc_files.table.loc[vol_id]["hdf5_meta"]["detections"],
            "hdf5_groundtruth": None,
        }
        idx += 1

    return result