DatasetManager

Interfaces with the node component database.

Attributes¶

Classes¶

DatasetManager ¶

DatasetManager(db)

Interfaces with the node component database.

Facility for storing data, retrieving data and getting data info for the node. Currently uses TinyDB.

Parameters:

Name	Type	Description	Default
`db`	`str`	Path to the database file	required

Source code in fedbiomed/node/dataset_manager.py

def __init__(self, db: str):
    """Open the node database and bind the tables used by the manager.

    Args:
        db: Path to the database file
    """
    database = TinyDB(db)
    self._db = database
    self._database = Query()

    # Both tables are created with `cache_size=0`, ie without DB read cache,
    # to ensure coherence (eg when mixing CLI commands with a GUI session)
    storage = database.storage
    self._dataset_table = DBTable(storage, name='Datasets', cache_size=0)
    self._dlp_table = DBTable(storage, name='Data_Loading_Plans', cache_size=0)

Functions¶

add_database ¶

add_database(name, data_type, tags, description, path=None, dataset_id=None, dataset_parameters=None, data_loading_plan=None, save_dlp=True)

Adds a new dataset contained in a file to node's database.

Parameters:

Name	Type	Description	Default
`name`	`str`	Name of the dataset	required
`data_type`	`str`	File extension/format of the dataset (*.csv, images, ...)	required
`tags`	`Union[tuple, list]`	Tags of the dataset.	required
`description`	`str`	Human readable description of the dataset.	required
`path`	`Optional[str]`	Path to the dataset. Defaults to None.	`None`
`dataset_id`	`Optional[str]`	Id of the dataset. Defaults to None.	`None`
`dataset_parameters`	`Optional[dict]`	a dictionary of additional (customized) parameters, or None	`None`
`data_loading_plan`	`Optional[DataLoadingPlan]`	a DataLoadingPlan to be linked to this dataset, or None	`None`
`save_dlp`	`bool`	if True, save the `data_loading_plan`	`True`

Returns:

Name	Type	Description
`dataset_id`		id of the dataset stored in database. Returns `dataset_id` if provided (non-None) or a new id if not.

Raises:

Type	Description
`NotImplementedError`	`data_type` is not supported.
`FedbiomedDatasetManagerError`	path does not exist or dataset was not saved properly.

Source code in fedbiomed/node/dataset_manager.py

def add_database(self,
                 name: str,
                 data_type: str,
                 tags: Union[tuple, list],
                 description: str,
                 path: Optional[str] = None,
                 dataset_id: Optional[str] = None,
                 dataset_parameters: Optional[dict] = None,
                 data_loading_plan: Optional[DataLoadingPlan] = None,
                 save_dlp: bool = True):
    """Adds a new dataset contained in a file to node's database.

    The dataset is loaded once (to compute its shape and, for CSV files, its
    column types) before the entry is inserted in the dataset table.

    Args:
        name: Name of the dataset
        data_type: File extension/format of the
            dataset (*.csv, images, ...)
        tags: Tags of the dataset.
        description: Human readable description of the dataset.
        path: Path to the dataset. Defaults to None.
        dataset_id: Id of the dataset. Defaults to None.
        dataset_parameters: a dictionary of additional (customized) parameters, or None.
            For a `medical-folder` dataset, None is handled as an empty dictionary.
        data_loading_plan: a DataLoadingPlan to be linked to this dataset, or None
        save_dlp: if True, save the `data_loading_plan`

    Returns:
        dataset_id: id of the dataset stored in database. Returns `dataset_id`
            if provided (non-None) or a new id if not.

    Raises:
        NotImplementedError: `data_type` is not supported.
        FedbiomedDatasetManagerError: conflicting tags with a registered dataset, missing or
            incomplete data loading plan for a FLamby dataset, medical folder path or tabular
            file does not exist, index column not provided, or dataset could not be created
            or read.
        AssertionError: path does not exist for `default`, `mednist`, `csv` or `images` datasets.
    """
    # Accept tilde as home folder
    if path is not None:
        path = os.path.expanduser(path)

    # Check that there are not existing dataset with conflicting tags
    conflicting = self.search_conflicting_tags(tags)
    if len(conflicting) > 0:
        msg = f"{ErrorNumbers.FB322.value}, one or more registered dataset has conflicting tags: " \
            f" {' '.join([ c['name'] for c in conflicting ])}"
        logger.critical(msg)
        raise FedbiomedDatasetManagerError(msg)

    dtypes = []  # empty list for Image datasets
    data_types = ['csv', 'default', 'mednist', 'images', 'medical-folder', 'flamby']

    if data_type not in data_types:
        raise NotImplementedError(f'Data type {data_type} is not'
                                  ' a compatible data type. '
                                  f'Compatible data types are: {data_types}')

    elif data_type == 'flamby':
        from fedbiomed.common.data.flamby_dataset import FlambyLoadingBlockTypes, FlambyDataset
        # check that data loading plan is present and well formed
        if data_loading_plan is None or \
                FlambyLoadingBlockTypes.FLAMBY_DATASET_METADATA not in data_loading_plan:
            msg = f"{ErrorNumbers.FB316.value}. A DataLoadingPlan containing " \
                  f"{FlambyLoadingBlockTypes.FLAMBY_DATASET_METADATA.value} is required for adding a FLamby dataset " \
                  f"to the database."
            logger.critical(msg)
            raise FedbiomedDatasetManagerError(msg)

        # initialize a dataset and link to the flamby data. If all goes well, compute shape.
        try:
            dataset = FlambyDataset()
            dataset.set_dlp(data_loading_plan)  # initializes fed_class as a side effect
        except FedbiomedError as e:
            raise FedbiomedDatasetManagerError(f"Can not create FLamby dataset. {e}") from e
        else:
            shape = dataset.shape()

    # NOTE(review): the `assert` path checks below are skipped when Python runs with `-O`.
    # They are kept unchanged so that callers still receive an AssertionError.
    if data_type == 'default':
        assert os.path.isdir(path), f'Folder {path} for Default Dataset does not exist.'
        shape = self.load_default_database(name, path)

    elif data_type == 'mednist':
        assert os.path.isdir(path), f'Folder {path} for MedNIST Dataset does not exist.'
        # the loader returns the folder actually holding the data, which replaces `path`
        shape, path = self.load_mednist_database(path)

    elif data_type == 'csv':
        assert os.path.isfile(path), f'Path provided ({path}) does not correspond to a CSV file.'
        dataset = self.load_csv_dataset(path)
        shape = dataset.shape
        dtypes = self.get_csv_data_types(dataset)

    elif data_type == 'images':
        assert os.path.isdir(path), f'Folder {path} for Images Dataset does not exist.'
        shape = self.load_images_dataset(path)

    elif data_type == 'medical-folder':
        if not os.path.isdir(path):
            raise FedbiomedDatasetManagerError(f'Folder {path} for Medical Folder Dataset does not exist.')

        # `dataset_parameters` defaults to None: handle it as "no additional parameter"
        # instead of failing on the membership tests and `.get()` calls below
        if dataset_parameters is None:
            dataset_parameters = {}

        if "tabular_file" not in dataset_parameters:
            logger.info("Medical Folder Dataset will be loaded without reference/demographics data.")
        else:
            if not os.path.isfile(dataset_parameters['tabular_file']):
                raise FedbiomedDatasetManagerError(f'Path {dataset_parameters["tabular_file"]} does not '
                                                   f'correspond a file.')
            if "index_col" not in dataset_parameters:
                raise FedbiomedDatasetManagerError('Index column is not provided')

        try:
            # load using the MedicalFolderController to ensure all available modalities are inspected
            controller = MedicalFolderController(root=path)
            if data_loading_plan is not None:
                controller.set_dlp(data_loading_plan)
            dataset = controller.load_MedicalFolder(tabular_file=dataset_parameters.get('tabular_file', None),
                                                    index_col=dataset_parameters.get('index_col', None))

        except FedbiomedError as e:
            raise FedbiomedDatasetManagerError(f"Can not create Medical Folder dataset. {e}") from e
        else:
            shape = dataset.shape()

        # try to read one sample and raise if it doesn't work
        try:
            _ = dataset.get_nontransformed_item(0)
        except Exception as e:
            raise FedbiomedDatasetManagerError(f'Medical Folder Dataset was not saved properly and '
                                               f'cannot be read. {e}') from e

    if not dataset_id:
        dataset_id = 'dataset_' + str(uuid.uuid4())

    new_database = dict(name=name, data_type=data_type, tags=tags,
                        description=description, shape=shape,
                        path=path, dataset_id=dataset_id, dtypes=dtypes,
                        dataset_parameters=dataset_parameters)
    if save_dlp:
        dlp_id = self.save_data_loading_plan(data_loading_plan)
    elif isinstance(data_loading_plan, DataLoadingPlan):
        dlp_id = data_loading_plan.dlp_id
    else:
        dlp_id = None
    # the entry only references a data loading plan when there is one
    if dlp_id is not None:
        new_database['dlp_id'] = dlp_id

    self._dataset_table.insert(new_database)

    return dataset_id

get_by_id ¶

get_by_id(dataset_id)

Searches for a dataset with given dataset_id.

Parameters:

Name	Type	Description	Default
`dataset_id`	`str`	A dataset id	required

Returns:

Type	Description
`Union[dict, None]`	A `dict` containing the dataset's description if a dataset with this `dataset_id`
`Union[dict, None]`	exists in the database. `None` if no such dataset exists in the database.

Source code in fedbiomed/node/dataset_manager.py

def get_by_id(self, dataset_id: str) -> Union[dict, None]:
    """Looks up the database entry of the dataset with the given id.

    Args:
        dataset_id: id of the dataset to look for

    Returns:
        The `dict` describing the dataset when an entry with this `dataset_id`
        is found in the dataset table, `None` otherwise.
    """
    matches_id = self._database.dataset_id == dataset_id
    return self._dataset_table.get(matches_id)

get_csv_data_types ¶

get_csv_data_types(dataset)

Gets data types of each variable in dataset.

Parameters:

Name	Type	Description	Default
`dataset`	`DataFrame`	A Pandas dataset.	required

Returns:

Type	Description
`List[str]`	A list of strings containing data types.

Source code in fedbiomed/node/dataset_manager.py

def get_csv_data_types(self, dataset: pd.DataFrame) -> List[str]:
    """Describes the type of every column of a tabular dataset.

    Args:
        dataset: A Pandas dataset.

    Returns:
        One string per column, the string form of the column's dtype,
        in column order.
    """
    return list(map(str, dataset.dtypes))

get_data_loading_blocks_by_ids ¶

get_data_loading_blocks_by_ids(dlb_ids)

Search for a list of DataLoadingBlockTypes, each corresponding to one given id.

Note that in case of conflicting ids (which should not happen), this function will silently return a random one with the sought id.

DataLoadingBlock IDs always start with 'serialized_data_loading_block_' and should be unique in the database.

Parameters:

Name	Type	Description	Default
`dlb_ids`	`Union[str, List[str]]`	(List[str]) a list of DataLoadingBlock IDs	required

Returns:

Type	Description
`List[dict]`	A list of dictionaries, each one containing the DataLoadingBlock metadata corresponding to one given id.

Source code in fedbiomed/node/dataset_manager.py

def get_data_loading_blocks_by_ids(self, dlb_ids: Union[str, List[str]]) -> List[dict]:
    """Search for a list of DataLoadingBlockTypes, each corresponding to one given id.

    Note that in case of conflicting ids (which should not happen), this function will silently return a random
    one with the sought id.

    DataLoadingBlock IDs always start with 'serialized_data_loading_block_' and should be unique in the database.

    Args:
        dlb_ids: a list of DataLoadingBlock IDs, or a single DataLoadingBlock ID as a `str`

    Returns:
        A list of dictionaries, each one containing the DataLoadingBlock metadata corresponding to one given id.
    """
    # A bare string must be wrapped: membership in a `str` is a substring test,
    # so passing it as-is would match on parts of the id instead of the whole id.
    if isinstance(dlb_ids, str):
        dlb_ids = [dlb_ids]
    return self._dlp_table.search(self._database.dlb_id.one_of(dlb_ids))

get_dlp_by_id ¶

get_dlp_by_id(dlp_id)

Search for a DataLoadingPlan with a given id.

Note that in case of conflicting ids (which should not happen), this function will silently return a random one with the sought id.

DataLoadingPlan IDs always start with 'dlp_' and should be unique in the database.

Parameters:

Name	Type	Description	Default
`dlp_id`	`str`	(str) the DataLoadingPlan id	required

Returns:

Type	Description
`Tuple[dict, List[dict]]`	A Tuple containing a dictionary with the DataLoadingPlan metadata corresponding to the given id.

Source code in fedbiomed/node/dataset_manager.py

def get_dlp_by_id(self, dlp_id: str) -> Tuple[dict, List[dict]]:
    """Search for a DataLoadingPlan with a given id.

    Note that in case of conflicting ids (which should not happen), this function will silently return a random
    one with the sought id.

    DataLoadingPlan IDs always start with 'dlp_' and should be unique in the database.

    Args:
        dlp_id: (str) the DataLoadingPlan id

    Returns:
        A Tuple of 2 items: the dictionary with the DataLoadingPlan metadata corresponding to the given id,
        then the list of entries whose `dlb_id` is one of the values of the plan's `loading_blocks`.

    Raises:
        FedbiomedDatasetManagerError: no DataLoadingPlan with this id exists in the database.
    """
    plan = self._dlp_table.get(self._database.dlp_id == dlp_id)

    # TODO: This exception should be removed once non-existing DLP situation is
    # handled by higher layers in Round or Node classes
    if plan is None:
        raise FedbiomedDatasetManagerError(
            f"{ErrorNumbers.FB315.value}: Non-existing DLP for the dataset."
        )

    block_ids = plan['loading_blocks'].values()
    blocks = self._dlp_table.search(self._database.dlb_id.one_of(block_ids))
    return plan, blocks

get_torch_dataset_shape ¶

get_torch_dataset_shape(dataset)

Gets info about dataset shape.

Parameters:

Name	Type	Description	Default
`dataset`	`Dataset`	A Pytorch dataset	required

Returns:

Type	Description
`List[int]`	A list of int containing `[<nb_of_data>, <dimension_of_first_input_data>]`. Example for MNIST: [60000, 1, 28, 28], where `<nb_of_data>`=60000 and `<dimension_of_first_input_data>`=1, 28, 28

Source code in fedbiomed/node/dataset_manager.py

def get_torch_dataset_shape(self, dataset: torch.utils.data.Dataset) -> List[int]:
    """Summarizes the shape of a dataset from its length and its first sample.

    Args:
        dataset: A Pytorch dataset

    Returns:
        A list of int containing
            [<nb_of_data>, <dimension_of_first_input_data>].
            Example for MNIST: [60000, 1, 28, 28], where <nb_of_data>=60000
            and <dimension_of_first_input_data>=1, 28, 28
    """
    sample_count = len(dataset)
    # first element of the first sample is taken as the input data
    first_input = dataset[0][0]
    return [sample_count, *first_input.shape]

list_dlp ¶

list_dlp(target_dataset_type=None)

Return all existing DataLoadingPlans.

Parameters:

Name	Type	Description	Default
`target_dataset_type`	`Optional[str]`	(str or None) if specified, return only dlps matching the requested target type.	`None`

Returns:

Type	Description
`List[dict]`	An array of dict, each dict is a DataLoadingPlan

Source code in fedbiomed/node/dataset_manager.py

def list_dlp(self, target_dataset_type: Optional[str] = None) -> List[dict]:
    """Return all existing DataLoadingPlans.

    An entry of the table is considered a DataLoadingPlan when it has both
    a `dlp_id` and a `dlp_name` field.

    Args:
        target_dataset_type: (str or None) if specified, return only dlps matching the requested target type.

    Returns:
        An array of dict, each dict is a DataLoadingPlan

    Raises:
        FedbiomedDatasetManagerError: `target_dataset_type` is neither None nor one of the
            values of `DatasetTypes`.
    """
    # No filter requested: every plan is returned
    if target_dataset_type is None:
        return self._dlp_table.search(
            (self._database.dlp_id.exists()) & (self._database.dlp_name.exists()))

    if not isinstance(target_dataset_type, str):
        raise FedbiomedDatasetManagerError(f"Wrong input type for target_dataset_type. "
                                           f"Expected str, got {type(target_dataset_type)} instead.")

    known_types = [t.value for t in DatasetTypes]
    if target_dataset_type not in known_types:
        raise FedbiomedDatasetManagerError("target_dataset_type should be of the values defined in "
                                           "fedbiomed.common.constants.DatasetTypes")

    is_plan = (self._database.dlp_id.exists()) & (self._database.dlp_name.exists())
    has_target_type = self._database.target_dataset_type == target_dataset_type
    return self._dlp_table.search(is_plan & has_target_type)

list_my_data ¶

list_my_data(verbose=True)

Lists all datasets on the node.

Parameters:

Name	Type	Description	Default
`verbose`	`bool`	Give verbose output. Defaults to True.	`True`

Returns:

Type	Description
`List[dict]`	All datasets in the node's database.

Source code in fedbiomed/node/dataset_manager.py

def list_my_data(self, verbose: bool = True) -> List[dict]:
    """Lists all datasets on the node.

    The `dtypes` field is removed from every returned entry.

    Args:
        verbose: Give verbose output (prints a table of the datasets). Defaults to True.

    Returns:
        All datasets in the node's database.
    """
    my_data = self._dataset_table.all()

    # Do not display dtypes. An entry without a `dtypes` field is left as is
    # rather than making the whole listing fail.
    for doc in my_data:
        doc.pop('dtypes', None)

    if verbose:
        print(tabulate(my_data, headers='keys'))

    return my_data

load_as_dataloader ¶

load_as_dataloader(dataset)

Loads content of an image dataset.

Parameters:

Name	Type	Description	Default
`dataset`	`dict`	Description of the dataset.	required

Returns:

Type	Description
`Dataset`	Content of the dataset.

Source code in fedbiomed/node/dataset_manager.py

def load_as_dataloader(self, dataset: dict) -> torch.utils.data.Dataset:
    """Loads the content of a `default` or `images` dataset.

    Args:
        dataset: Description of the dataset. Its `data_type` entry selects the loader.

    Returns:
        Content of the dataset. `None` when `data_type` is neither `default` nor `images`.
    """
    kind = dataset['data_type']

    if kind == 'default':
        return self.load_default_database(name=dataset['name'],
                                          path=dataset['path'],
                                          as_dataset=True)

    if kind == 'images':
        return self.load_images_dataset(folder_path=dataset['path'],
                                        as_dataset=True)

    return None

load_csv_dataset ¶

load_csv_dataset(path)

Loads a CSV dataset.

Parameters:

Name	Type	Description	Default
`path`	`str`	Path to the CSV file.	required

Returns:

Type	Description
`DataFrame`	Pandas DataFrame with the content of the file.

Source code in fedbiomed/node/dataset_manager.py

def load_csv_dataset(self, path: str) -> pd.DataFrame:
    """Reads a CSV dataset from disk.

    Thin wrapper delegating to `read_csv` with its default arguments.

    Args:
        path: Path to the CSV file.

    Returns:
        Pandas DataFrame with the content of the file.
    """
    frame = self.read_csv(path)
    return frame

load_default_database ¶

load_default_database(name, path, as_dataset=False)

Loads a default dataset.

Currently, only MNIST dataset is used as the default dataset.

Parameters:

Name	Type	Description	Default
`name`	`str`	Name of the default dataset. Currently, only MNIST is accepted.	required
`path`	`str`	Pathfile to MNIST dataset.	required
`as_dataset`	`bool`	Whether to return the complete dataset (True) or dataset dimensions (False). Defaults to False.	`False`

Raises:

Type	Description
`NotImplementedError`	Name is not matching with the name of a default dataset.

Returns:

Type	Description
`Union[List[int], Dataset]`	Depends on the value of the parameter `as_dataset`: If
`Union[List[int], Dataset]`	set to True, returns dataset (type: torch.utils.data.Dataset).
`Union[List[int], Dataset]`	If set to False, returns the size of the dataset stored inside
`Union[List[int], Dataset]`	a list (type: List[int]).

Source code in fedbiomed/node/dataset_manager.py

def load_default_database(self,
                          name: str,
                          path: str,
                          as_dataset: bool = False) -> Union[List[int],
                                                             torch.utils.data.Dataset]:
    """Loads a default dataset.

    Currently, only MNIST dataset is used as the default dataset.
    The dataset is requested with `download=True`.

    Args:
        name: Name of the default dataset. Currently,
            only names containing `mnist` (case insensitive) are accepted.
        path: Pathfile to MNIST dataset.
        as_dataset: Whether to return
            the complete dataset (True) or dataset dimensions (False).
            Defaults to False.

    Raises:
        NotImplementedError: Name is not matching with
            the name of a default dataset.

    Returns:
        Depends on the value of the parameter `as_dataset`: If
        set to True,  returns dataset (type: torch.utils.data.Dataset).
        If set to False, returns the size of the dataset stored inside
        a list (type: List[int]).
    """
    # Reject unknown names first, before any loading argument is built
    if 'mnist' not in name.lower():
        raise NotImplementedError(f'Default dataset `{name}` has '
                                  'not been implemented.')

    dataset = datasets.MNIST(root=path, download=True, transform=transforms.ToTensor())

    if as_dataset:
        return dataset
    return self.get_torch_dataset_shape(dataset)

load_images_dataset ¶

load_images_dataset(folder_path, as_dataset=False)

Loads an image dataset.

Parameters:

Name	Type	Description	Default
`folder_path`	`str`	Path to the directory containing the images.	required
`as_dataset`	`bool`	Whether to return the complete dataset (True) or dataset dimensions (False). Defaults to False.	`False`

Returns:

Type	Description
`Union[List[int], Dataset]`	Depends on the value of the parameter `as_dataset`: If
`Union[List[int], Dataset]`	set to True, returns dataset (type: torch.utils.data.Dataset).
`Union[List[int], Dataset]`	If set to False, returns the size of the dataset stored inside
`Union[List[int], Dataset]`	a list (type: List[int])

Source code in fedbiomed/node/dataset_manager.py

def load_images_dataset(self,
                        folder_path: str,
                        as_dataset: bool = False) -> Union[List[int],
                                                           torch.utils.data.Dataset]:
    """Loads an image dataset.

    Args:
        folder_path: Path to the directory containing the images.
        as_dataset: Whether to return
            the complete dataset (True) or dataset dimensions (False).
            Defaults to False.

    Raises:
        FedbiomedDatasetManagerError: the dataset could not be loaded from `folder_path`.

    Returns:
        Depends on the value of the parameter `as_dataset`: If
        set to True,  returns dataset (type: torch.utils.data.Dataset).
        If set to False, returns the size of the dataset stored inside
        a list (type: List[int])
    """
    try:
        dataset = datasets.ImageFolder(folder_path,
                                       transform=transforms.ToTensor())
    except Exception as e:
        # Message is built from adjacent literals only: a backslash continuation
        # inside a literal would embed the next line's indentation in the message.
        _msg = ErrorNumbers.FB315.value + \
            "\nThe following error was raised while loading dataset from the selected" \
            " path:  " + str(e) + "\nPlease make sure that the selected folder is not empty " \
            "and doesn't have any empty class folder"
        logger.error(_msg)
        # keep the original error attached as the cause
        raise FedbiomedDatasetManagerError(_msg) from e

    if as_dataset:
        return dataset
    else:
        return self.get_torch_dataset_shape(dataset)

load_mednist_database ¶

load_mednist_database(path, as_dataset=False)

Loads the MedNist dataset.

Parameters:

Name	Type	Description	Default
`path`	`str`	Pathfile to save a local copy of the MedNist dataset.	required
`as_dataset`	`bool`	Whether to return the complete dataset (True) or dataset dimensions (False). Defaults to False.	`False`

Raises:

Type	Description
`FedbiomedDatasetManagerError`	One of the following cases: the tarfile cannot be downloaded; the downloaded tarfile cannot be extracted; the MedNIST path is empty; one of the classes path is empty.

Returns:

Type	Description
`Tuple[Union[List[int], Dataset], str]`	Tuple of 2 items:
`Tuple[Union[List[int], Dataset], str]`	First item depends on the value of the parameter `as_dataset`: If
`Tuple[Union[List[int], Dataset], str]`	set to True, returns dataset (type: torch.utils.data.Dataset).
`Tuple[Union[List[int], Dataset], str]`	If set to False, returns the size of the dataset stored inside
`Tuple[Union[List[int], Dataset], str]`	a list (type: List[int])
`Tuple[Union[List[int], Dataset], str]`	Second item is the path used to download the MedNIST dataset, that needs to be saved as an
`Tuple[Union[List[int], Dataset], str]`	entry in the dataset

Source code in fedbiomed/node/dataset_manager.py

def load_mednist_database(self,
                          path: str,
                          as_dataset: bool = False) -> Tuple[Union[List[int],
                                                        torch.utils.data.Dataset], str]:
    """Loads the MedNist dataset.

    When `path` has no `MedNIST` sub-folder yet, the archive is downloaded into `path`,
    extracted there, and the archive file is removed after a successful extraction.

    Args:
        path: Pathfile to save a local copy of the MedNist dataset.
        as_dataset: Whether to return
            the complete dataset (True) or dataset dimensions (False).
            Defaults to False.

    Raises:
        FedbiomedDatasetManagerError: One of the following cases:

            - tarfile cannot be downloaded
            - downloaded tarfile cannot
                be extracted
            - MedNIST path is empty
            - one of the classes path is empty

    Returns:
        Tuple of 2 items:
        First item Depends on the value of the parameter `as_dataset`: If
        set to True,  returns dataset (type: torch.utils.data.Dataset).
        If set to False, returns the size of the dataset stored inside
        a list (type: List[int])
        Second item is the path used to download the MedNIST dataset, that needs to be saved as an
        entry in the dataset
    """
    download_path = os.path.join(path, 'MedNIST')
    if not os.path.isdir(download_path):
        url = "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/MedNIST.tar.gz"
        filepath = os.path.join(path, 'MedNIST.tar.gz')
        try:
            logger.info("Now downloading MEDNIST...")
            urlretrieve(url, filepath)
            with tarfile.open(filepath) as tar_file:
                logger.info("Now extracting MEDNIST...")
                # The archive comes from the network: when the running Python supports
                # extraction filters, use the 'data' filter so that members cannot be
                # written outside of `path`.
                if hasattr(tarfile, 'data_filter'):
                    tar_file.extractall(path, filter='data')
                else:
                    tar_file.extractall(path)
            os.remove(filepath)

        except (URLError, HTTPError, ContentTooShortError, OSError, tarfile.TarError,
                MemoryError) as e:
            _msg = ErrorNumbers.FB315.value + \
                "\nThe following error was raised while downloading MedNIST dataset " \
                "from the MONAI repo:  " + str(e)
            logger.error(_msg)
            raise FedbiomedDatasetManagerError(_msg) from e

    try:
        dataset = datasets.ImageFolder(download_path,
                                       transform=transforms.ToTensor())

    except (FileNotFoundError, RuntimeError) as e:
        _msg = ErrorNumbers.FB315.value + \
            "\nThe following error was raised while loading MedNIST dataset from " \
            "the selected path:  " + str(e) + \
            "\nPlease make sure that the selected MedNIST folder is not empty " \
            "or choose another path."
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg) from e

    except Exception as e:
        _msg = ErrorNumbers.FB315.value + \
            "\nThe following error was raised while loading MedNIST dataset: " + str(e)
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg) from e

    if as_dataset:
        return dataset, download_path
    else:
        return self.get_torch_dataset_shape(dataset), download_path

modify_database_info ¶

modify_database_info(dataset_id, modified_dataset)

Modifies a dataset in the database.

Parameters:

Name	Type	Description	Default
`dataset_id`	`str`	ID of the dataset to modify.	required
`modified_dataset`	`dict`	New dataset description to replace the existing one.	required

Raises:

Type	Description
`FedbiomedDatasetManagerError`	conflicting tags with existing dataset

Source code in fedbiomed/node/dataset_manager.py

def modify_database_info(self,
                         dataset_id: str,
                         modified_dataset: dict):
    """Updates the database entry of a dataset.

    Args:
        dataset_id: ID of the dataset to modify.
        modified_dataset: New dataset description to replace the existing one.

    Raises:
        FedbiomedDatasetManagerError: conflicting tags with existing dataset
    """
    # Tags only need to be checked when the update touches them
    if 'tags' in modified_dataset:
        clashes = self.search_conflicting_tags(modified_dataset['tags'])

        other_ids = [entry['dataset_id'] for entry in clashes]
        # the dataset to modify is ignored (can conflict with its previous tags)
        try:
            other_ids.remove(dataset_id)
        except ValueError:
            pass

        if other_ids:
            names = ' '.join([entry['name'] for entry in clashes if entry['dataset_id'] != dataset_id])
            msg = f"{ErrorNumbers.FB322.value}, one or more registered dataset has conflicting tags: " \
                f" {names}"
            logger.critical(msg)
            raise FedbiomedDatasetManagerError(msg)

    same_dataset = self._database.dataset_id == dataset_id
    self._dataset_table.update(modified_dataset, same_dataset)

obfuscate_private_information `staticmethod` ¶

obfuscate_private_information(database_metadata)

Remove privacy-sensitive information, to prepare for sharing with a researcher.

Removes any information that could be considered privacy-sensitive by the node. The typical use-case is to prevent sharing this information with a researcher through a reply message.

Parameters:

Name	Type	Description	Default
`database_metadata`	`Iterable[dict]`	an iterable of metadata information objects, one per dataset. Each metadata object should be in the format of key-value pairs, such as e.g. a dict.	required

Source code in fedbiomed/node/dataset_manager.py

@staticmethod
def obfuscate_private_information(database_metadata: Iterable[dict]) -> Iterable[dict]:
    """Remove privacy-sensitive information, to prepare for sharing with a researcher.

    Removes any information that could be considered privacy-sensitive by the node. The typical use-case is to
    prevent sharing this information with a researcher through a reply message.

    Args:
        database_metadata: an iterable of metadata information objects, one per dataset. Each metadata object
            should be in the format af key-value pairs, such as e.g. a dict.
    Returns:
         the updated iterable of metadata information objects without privacy-sensitive information
    """
    for d in database_metadata:
        try:
            # common obfuscations
            d.pop('path', None)
            # obfuscations specific for each data type
            if 'data_type' in d:
                if d['data_type'] == 'medical-folder':
                    if 'dataset_parameters' in d:
                        d['dataset_parameters'].pop('tabular_file', None)
        except AttributeError:
            raise FedbiomedDatasetManagerError(f"Object of type {type(d)} does not support pop or getitem method "
                                               f"in obfuscate_private_information.")
    return database_metadata

read_csv ¶

read_csv(csv_file, index_col=None)

Gets content of a CSV file.

Reads a *.csv file and outputs its data into a pandas DataFrame. Finds automatically the CSV delimiter by parsing the first line.

Parameters:

Name	Type	Description	Default
`csv_file`	`str`	File name / path	required
`index_col`	`Union[int, None]`	Column that contains CSV file index. Defaults to None.	`None`

Returns:

Type	Description
`DataFrame`	Pandas DataFrame with data contained in CSV file.

Source code in fedbiomed/node/dataset_manager.py

def read_csv(self, csv_file: str, index_col: Union[int, None] = None) -> pd.DataFrame:
    """Reads a *.csv file into a pandas DataFrame.

    The delimiter is guessed from the first line of the file, and the
    presence of a header row is guessed from the whole file content.

    Args:
        csv_file: File name / path
        index_col: Column that contains CSV file index.
            Defaults to None.

    Returns:
        Pandas DataFrame with data contained in CSV file.
    """
    detector = csv.Sniffer()

    with open(csv_file, 'r') as handle:
        # separator: sniffed on the first line only
        first_line = handle.readline()
        separator = detector.sniff(first_line).delimiter

        # header: sniffed on the full content, so rewind first
        handle.seek(0)
        content = handle.read()
        if detector.has_header(content):
            header_row = 0
        else:
            header_row = None

    return pd.read_csv(csv_file, index_col=index_col, sep=separator, header=header_row)

remove_database ¶

remove_database(dataset_id)

Removes a dataset from database.

Only the dataset matching the dataset_id should be removed.

Parameters:

Name	Type	Description	Default
`dataset_id`	`str`	Dataset unique ID.	required

Source code in fedbiomed/node/dataset_manager.py

def remove_database(self, dataset_id: str):
    """Deletes the dataset entry identified by `dataset_id` from the database.

    Only the dataset matching the `dataset_id` should be removed.

    Args:
        dataset_id: Dataset unique ID.

    Raises:
        FedbiomedDatasetManagerError: no dataset with this `dataset_id` was found.
    """
    # TODO: check that there is no more than one dataset with `dataset_id` (consistency, should not happen)
    matching_id = self._database.dataset_id == dataset_id
    _, found_document = self._dataset_table.get(matching_id, add_docs=True)

    # nothing to delete: report it as an error
    if not found_document:
        error_message = ErrorNumbers.FB322.value + f": No dataset found with id {dataset_id}"
        logger.error(error_message)
        raise FedbiomedDatasetManagerError(error_message)

    self._dataset_table.remove(doc_ids=[found_document.doc_id])

remove_dlp_by_id ¶

remove_dlp_by_id(dlp_id)

Removes a data loading plan (DLP) from the database.

Only DLP with matching ID is removed from the database. There should be at most one.

The data loading blocks (DLBs) attached to the DLP are also removed. You should ensure they are not used by another DLP; no verification is made.

Parameters:

Name	Type	Description	Default
`dlp_id`	`str`	the DataLoadingPlan id	required

Source code in fedbiomed/node/dataset_manager.py

def remove_dlp_by_id(self, dlp_id: str):
    """Removes a data loading plan (DLP) from the database.

    Only DLP with matching ID is removed from the database. There should be at most one.

    The data loading blocks (DLBs) attached to the DLP are removed as well. You should
    ensure they are not used by another DLP, no verification is made.

    Args:
        dlp_id: the DataLoadingPlan id

    Raises:
        FedbiomedDatasetManagerError: `dlp_id` is not a str, `dlp_id` is an empty str,
            or an error occurred while removing entries from the database.
    """
    if not isinstance(dlp_id, str):
        _msg = ErrorNumbers.FB316.value + f": Bad type for dlp '{type(dlp_id)}', expecting str"
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)
    # test the argument itself: `not str` tests the builtin type and is never true
    if not dlp_id:
        _msg = ErrorNumbers.FB316.value + ": Bad value for dlp, expecting non empty str"
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)

    # look up the attached DLBs before deleting the DLP entry itself
    _, dlbs = self.get_dlp_by_id(dlp_id)
    try:
        self._dlp_table.remove(self._database.dlp_id == dlp_id)
        for dlb in dlbs:
            self._dlp_table.remove(self._database.dlb_id == dlb['dlb_id'])
    except Exception as e:
        _msg = ErrorNumbers.FB316.value + f": Error during remove of DLP {dlp_id}: {e}"
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg) from e

save_data_loading_block ¶

save_data_loading_block(dlb)

Source code in fedbiomed/node/dataset_manager.py

def save_data_loading_block(self, dlb: DataLoadingBlock) -> None:
    """Stores a single data loading block in the data loading plan table.

    Args:
        dlb: the DataLoadingBlock to save; its serialized form is inserted.
    """
    # seems unused
    serialized_block = dlb.serialize()
    self._dlp_table.insert(serialized_block)

save_data_loading_plan ¶

save_data_loading_plan(data_loading_plan)

Save a DataLoadingPlan to the database.

This function saves a DataLoadingPlan to the database, and returns its ID.

Raises:

Type	Description
`FedbiomedDatasetManagerError`	bad data loading plan name (size, not unique)

Parameters:

Name	Type	Description	Default
`data_loading_plan`	`Optional[DataLoadingPlan]`	the DataLoadingPlan to be saved, or None.	required

Returns:

Type	Description
`Union[str, None]`	The `dlp_id` if a DLP was saved, or None

Source code in fedbiomed/node/dataset_manager.py

def save_data_loading_plan(self,
                           data_loading_plan: Optional[DataLoadingPlan]
                           ) -> Union[str, None]:
    """Stores a DataLoadingPlan and its loading blocks in the database.

    Nothing is stored when `data_loading_plan` is None. Otherwise the plan name
    is validated, the serialized plan and its serialized loading blocks are
    inserted in the database, and the plan ID is returned.

    Raises:
        FedbiomedDatasetManagerError: bad data loading plan name (size, not unique)

    Args:
        data_loading_plan: the DataLoadingPlan to be saved, or None.

    Returns:
        The `dlp_id` if a DLP was saved, or None
    """
    if data_loading_plan is None:
        return None

    # the name must be at least 4 characters long
    if len(data_loading_plan.desc) < 4:
        too_short_msg = (ErrorNumbers.FB316.value
                         + ": Cannot save data loading plan, "
                         + "DLP name needs to have at least 4 characters.")
        logger.error(too_short_msg)
        raise FedbiomedDatasetManagerError(too_short_msg)

    # only match entries carrying both a `dlp_id` and a `dlp_name` field
    is_dlp_entry = (self._database.dlp_id.exists()) & (self._database.dlp_name.exists())
    same_name_query = is_dlp_entry & (self._database.dlp_name == data_loading_plan.desc)
    if self._dlp_table.search(same_name_query):
        duplicate_msg = (ErrorNumbers.FB316.value
                         + ": Cannot save data loading plan, "
                         + "DLP name needs to be unique.")
        logger.error(duplicate_msg)
        raise FedbiomedDatasetManagerError(duplicate_msg)

    plan_document, block_documents = data_loading_plan.serialize()
    self._dlp_table.insert(plan_document)
    self._dlp_table.insert_multiple(block_documents)
    return data_loading_plan.dlp_id

search_by_tags ¶

search_by_tags(tags)

Searches for data with given tags.

Parameters:

Name	Type	Description	Default
`tags`	`Union[tuple, list]`	List of tags	required

Returns:

Type	Description
`list`	The list of matching datasets

Source code in fedbiomed/node/dataset_manager.py

def search_by_tags(self, tags: Union[tuple, list]) -> list:
    """Looks up the datasets matching the given tags.

    Args:
        tags:  List of tags

    Returns:
        The list of matching datasets
    """
    # build the query on the `tags` field, then run it on the dataset table
    tags_query = self._database.tags.all(tags)
    matching_datasets = self._dataset_table.search(tags_query)
    return matching_datasets

search_conflicting_tags ¶

search_conflicting_tags(tags)

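Searches for registered datasets whose tags conflict with the given tags, i.e. the given tags are all contained in the dataset's tags, or the dataset's tags are all contained in the given tags.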

Parameters:

Name	Type	Description	Default
`tags`	`Union[tuple, list]`	List of tags	required

Returns:

Type	Description
`list`	The list of conflicting datasets

Source code in fedbiomed/node/dataset_manager.py

def search_conflicting_tags(self, tags: Union[tuple, list]) -> list:
    """Looks up the registered datasets whose tags conflict with the given tags.

    A dataset conflicts when every given tag is among its tags, or when every
    one of its tags is among the given tags.

    Args:
        tags:  List of tags

    Returns:
        The list of conflicting datasets
    """
    def _is_conflict(stored_tags):
        # the given tags are all contained in the stored tags ...
        if all(tag in stored_tags for tag in tags):
            return True
        # ... or the stored tags are all contained in the given tags
        return all(tag in tags for tag in stored_tags)

    conflict_query = self._database.tags.test(_is_conflict)
    return self._dataset_table.search(conflict_query)

DatasetManager

Attributes¶

Classes¶

DatasetManager ¶

Functions¶

add_database ¶

get_by_id ¶

get_csv_data_types ¶

get_data_loading_blocks_by_ids ¶

get_dlp_by_id ¶

get_torch_dataset_shape ¶

list_dlp ¶

list_my_data ¶

load_as_dataloader ¶

load_csv_dataset ¶

load_default_database ¶

load_images_dataset ¶

load_mednist_database ¶

modify_database_info ¶

obfuscate_private_information staticmethod ¶

read_csv ¶

remove_database ¶

remove_dlp_by_id ¶

save_data_loading_block ¶

save_data_loading_plan ¶

search_by_tags ¶

search_conflicting_tags ¶

obfuscate_private_information `staticmethod` ¶