Interfaces with the node component database.
Attributes¶
Classes¶
DatasetManager ¶
DatasetManager(db)
Interfaces with the node component database.
Facility for storing data, retrieving data and getting data info for the node. Currently uses TinyDB.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
db | str | Path to the database file | required |
Source code in fedbiomed/node/dataset_manager.py
def __init__(self, db: str):
    """Open the node database and bind the tables used by the manager.

    Args:
        db: Path to the database file
    """
    self._db = TinyDB(db)
    self._database = Query()

    # Read caching is switched off (cache_size=0) on both tables so that
    # results stay coherent when the same file is modified from elsewhere
    # (eg when mixing CLI commands with a GUI session).
    storage = self._db.storage
    no_cache = dict(cache_size=0)
    self._dataset_table = DBTable(storage, name='Datasets', **no_cache)
    self._dlp_table = DBTable(storage, name='Data_Loading_Plans', **no_cache)
Functions¶
add_database ¶
add_database(name, data_type, tags, description, path=None, dataset_id=None, dataset_parameters=None, data_loading_plan=None, save_dlp=True)
Adds a new dataset contained in a file to node's database.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
name | str | Name of the dataset | required |
data_type | str | File extension/format of the dataset (*.csv, images, ...) | required |
tags | Union[tuple, list] | Tags of the dataset. | required |
description | str | Human readable description of the dataset. | required |
path | Optional[str] | Path to the dataset. Defaults to None. | None |
dataset_id | Optional[str] | Id of the dataset. Defaults to None. | None |
dataset_parameters | Optional[dict] | a dictionary of additional (customized) parameters, or None | None |
data_loading_plan | Optional[DataLoadingPlan] | a DataLoadingPlan to be linked to this dataset, or None | None |
save_dlp | bool | if True, save the `data_loading_plan` | True |
Returns:
| Name | Type | Description |
|---|---|---|
dataset_id | id of the dataset stored in database. Returns `dataset_id` if provided (non-None) or a new id if not. |
Raises:
| Type | Description |
|---|---|
NotImplementedError | `data_type` is not supported. |
FedbiomedDatasetManagerError | path does not exist or dataset was not saved properly. |
Source code in fedbiomed/node/dataset_manager.py
def add_database(self,
                 name: str,
                 data_type: str,
                 tags: Union[tuple, list],
                 description: str,
                 path: Optional[str] = None,
                 dataset_id: Optional[str] = None,
                 dataset_parameters: Optional[dict] = None,
                 data_loading_plan: Optional[DataLoadingPlan] = None,
                 save_dlp: bool = True) -> str:
    """Adds a new dataset contained in a file to node's database.

    The dataset is loaded once (according to `data_type`) to compute its shape
    before the entry is inserted in the datasets table.

    Args:
        name: Name of the dataset
        data_type: File extension/format of the
            dataset (*.csv, images, ...)
        tags: Tags of the dataset.
        description: Human readable description of the dataset.
        path: Path to the dataset. Defaults to None.
        dataset_id: Id of the dataset. Defaults to None.
        dataset_parameters: a dictionary of additional (customized) parameters, or None
        data_loading_plan: a DataLoadingPlan to be linked to this dataset, or None
        save_dlp: if True, save the `data_loading_plan`

    Returns:
        dataset_id: id of the dataset stored in database. Returns `dataset_id`
            if provided (non-None) or a new id if not.

    Raises:
        NotImplementedError: `data_type` is not supported.
        FedbiomedDatasetManagerError: tags conflict with a registered dataset,
            path does not exist or dataset was not saved properly.
    """
    # Accept tilde as home folder
    if path is not None:
        path = os.path.expanduser(path)

    # Check that there are not existing dataset with conflicting tags
    conflicting = self.search_conflicting_tags(tags)
    if len(conflicting) > 0:
        msg = f"{ErrorNumbers.FB322.value}, one or more registered dataset has conflicting tags: " \
            f" {' '.join([ c['name'] for c in conflicting ])}"
        logger.critical(msg)
        raise FedbiomedDatasetManagerError(msg)

    dtypes = []  # empty list for Image datasets
    data_types = ['csv', 'default', 'mednist', 'images', 'medical-folder', 'flamby']

    if data_type not in data_types:
        raise NotImplementedError(f'Data type {data_type} is not'
                                  ' a compatible data type. '
                                  f'Compatible data types are: {data_types}')
    elif data_type == 'flamby':
        from fedbiomed.common.data.flamby_dataset import FlambyLoadingBlockTypes, FlambyDataset
        # check that data loading plan is present and well formed
        if data_loading_plan is None or \
                FlambyLoadingBlockTypes.FLAMBY_DATASET_METADATA not in data_loading_plan:
            msg = f"{ErrorNumbers.FB316.value}. A DataLoadingPlan containing " \
                  f"{FlambyLoadingBlockTypes.FLAMBY_DATASET_METADATA.value} is required for adding a FLamby dataset " \
                  f"to the database."
            logger.critical(msg)
            raise FedbiomedDatasetManagerError(msg)

        # initialize a dataset and link to the flamby data. If all goes well, compute shape.
        try:
            dataset = FlambyDataset()
            dataset.set_dlp(data_loading_plan)  # initializes fed_class as a side effect
        except FedbiomedError as e:
            raise FedbiomedDatasetManagerError(f"Can not create FLamby dataset. {e}") from e
        else:
            shape = dataset.shape()

    # Path checks raise explicitly instead of using `assert`: assertions are
    # stripped when Python runs with -O, and the documented error type for a
    # missing path is FedbiomedDatasetManagerError. A missing (None) path is
    # reported the same way instead of surfacing as a TypeError.
    if data_type == 'default':
        if path is None or not os.path.isdir(path):
            raise FedbiomedDatasetManagerError(f'Folder {path} for Default Dataset does not exist.')
        shape = self.load_default_database(name, path)

    elif data_type == 'mednist':
        if path is None or not os.path.isdir(path):
            raise FedbiomedDatasetManagerError(f'Folder {path} for MedNIST Dataset does not exist.')
        # the MedNIST loader returns the effective dataset folder, which is what gets stored
        shape, path = self.load_mednist_database(path)

    elif data_type == 'csv':
        if path is None or not os.path.isfile(path):
            raise FedbiomedDatasetManagerError(f'Path provided ({path}) does not correspond to a CSV file.')
        dataset = self.load_csv_dataset(path)
        shape = dataset.shape
        dtypes = self.get_csv_data_types(dataset)

    elif data_type == 'images':
        if path is None or not os.path.isdir(path):
            raise FedbiomedDatasetManagerError(f'Folder {path} for Images Dataset does not exist.')
        shape = self.load_images_dataset(path)

    elif data_type == 'medical-folder':
        if path is None or not os.path.isdir(path):
            raise FedbiomedDatasetManagerError(f'Folder {path} for Medical Folder Dataset does not exist.')

        # `dataset_parameters` is optional: use an empty mapping for the lookups
        # below. The caller's original value is what gets stored in the database.
        parameters = dataset_parameters if dataset_parameters is not None else {}

        if "tabular_file" not in parameters:
            logger.info("Medical Folder Dataset will be loaded without reference/demographics data.")
        else:
            if not os.path.isfile(parameters['tabular_file']):
                raise FedbiomedDatasetManagerError(f'Path {parameters["tabular_file"]} does not '
                                                   f'correspond a file.')
            if "index_col" not in parameters:
                raise FedbiomedDatasetManagerError('Index column is not provided')

        try:
            # load using the MedicalFolderController to ensure all available modalities are inspected
            controller = MedicalFolderController(root=path)
            if data_loading_plan is not None:
                controller.set_dlp(data_loading_plan)
            dataset = controller.load_MedicalFolder(tabular_file=parameters.get('tabular_file', None),
                                                    index_col=parameters.get('index_col', None))
        except FedbiomedError as e:
            raise FedbiomedDatasetManagerError(f"Can not create Medical Folder dataset. {e}") from e
        else:
            shape = dataset.shape()

        # try to read one sample and raise if it doesn't work
        try:
            _ = dataset.get_nontransformed_item(0)
        except Exception as e:
            raise FedbiomedDatasetManagerError(f'Medical Folder Dataset was not saved properly and '
                                               f'cannot be read. {e}') from e

    if not dataset_id:
        dataset_id = 'dataset_' + str(uuid.uuid4())

    new_database = dict(name=name, data_type=data_type, tags=tags,
                        description=description, shape=shape,
                        path=path, dataset_id=dataset_id, dtypes=dtypes,
                        dataset_parameters=dataset_parameters)

    # Resolve the id of the linked data loading plan, saving the plan if requested
    if save_dlp:
        dlp_id = self.save_data_loading_plan(data_loading_plan)
    elif isinstance(data_loading_plan, DataLoadingPlan):
        dlp_id = data_loading_plan.dlp_id
    else:
        dlp_id = None

    if dlp_id is not None:
        new_database['dlp_id'] = dlp_id

    self._dataset_table.insert(new_database)

    return dataset_id
get_by_id ¶
get_by_id(dataset_id)
Searches for a dataset with given dataset_id.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
dataset_id | str | A dataset id | required |
Returns:
| Type | Description |
|---|---|
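Union[dict, None] | A `dict` containing the dataset's description if a dataset with this `dataset_id` exists in the database. `None` if no such dataset exists in the database. |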
Source code in fedbiomed/node/dataset_manager.py
def get_by_id(self, dataset_id: str) -> Union[dict, None]:
    """Looks up the dataset entry registered under `dataset_id`.

    Args:
        dataset_id: A dataset id

    Returns:
        Whatever the datasets table returns for the query: a `dict` describing
        the dataset when an entry with this `dataset_id` exists, `None` otherwise.
    """
    has_requested_id = self._database.dataset_id == dataset_id
    return self._dataset_table.get(has_requested_id)
get_csv_data_types ¶
get_csv_data_types(dataset)
Gets data types of each variable in dataset.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
dataset | DataFrame | A Pandas dataset. | required |
Returns:
| Type | Description |
|---|---|
List[str] | A list of strings containing data types. |
Source code in fedbiomed/node/dataset_manager.py
def get_csv_data_types(self, dataset: pd.DataFrame) -> List[str]:
    """Gets the data type of each column of a tabular dataset.

    Args:
        dataset: A Pandas dataset.

    Returns:
        One string per entry of `dataset.dtypes`, in the same order.
    """
    return list(map(str, dataset.dtypes))
get_data_loading_blocks_by_ids ¶
get_data_loading_blocks_by_ids(dlb_ids)
Search for a list of DataLoadingBlockTypes, each corresponding to one given id.
Note that in case of conflicting ids (which should not happen), this function will silently return a random one with the sought id.
DataLoadingBlock IDs always start with 'serialized_data_loading_block_' and should be unique in the database.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
dlb_ids | Union[str, List[str]] | (List[str]) a list of DataLoadingBlock IDs | required |
Returns:
| Type | Description |
|---|---|
List[dict] | A list of dictionaries, each one containing the DataLoadingBlock metadata corresponding to one given id. |
Source code in fedbiomed/node/dataset_manager.py
def get_data_loading_blocks_by_ids(self, dlb_ids: Union[str, List[str]]) -> List[dict]:
    """Search for a list of DataLoadingBlocks, each corresponding to one given id.

    Note that in case of conflicting ids (which should not happen), this function will silently return a random
    one with the sought id.

    DataLoadingBlock IDs always start with 'serialized_data_loading_block_' and should be unique in the database.

    Args:
        dlb_ids: a list of DataLoadingBlock IDs, or a single DataLoadingBlock ID

    Returns:
        A list of dictionaries, each one containing the DataLoadingBlock metadata corresponding to one given id.
    """
    # A bare string is itself a container of characters: handing it directly to
    # `one_of` would match by substring instead of by equality. Wrap it so that
    # a single id is compared as a whole.
    if isinstance(dlb_ids, str):
        dlb_ids = [dlb_ids]
    return self._dlp_table.search(self._database.dlb_id.one_of(dlb_ids))
get_dlp_by_id ¶
get_dlp_by_id(dlp_id)
Search for a DataLoadingPlan with a given id.
Note that in case of conflicting ids (which should not happen), this function will silently return a random one with the sought id.
DataLoadingPlan IDs always start with 'dlp_' and should be unique in the database.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
dlp_id | str | (str) the DataLoadingPlan id | required |
Returns:
| Type | Description |
|---|---|
Tuple[dict, List[dict]] | A Tuple containing a dictionary with the DataLoadingPlan metadata corresponding to the given id. |
Source code in fedbiomed/node/dataset_manager.py
def get_dlp_by_id(self, dlp_id: str) -> Tuple[dict, List[dict]]:
    """Fetches a DataLoadingPlan and its loading blocks from the database.

    Note that in case of conflicting ids (which should not happen), this function will silently return a random
    one with the sought id.

    DataLoadingPlan IDs always start with 'dlp_' and should be unique in the database.

    Args:
        dlp_id: the DataLoadingPlan id

    Returns:
        A tuple of two items: the dictionary holding the DataLoadingPlan metadata for the given id,
        and the list of entries whose `dlb_id` is one of the values of the plan's `loading_blocks`.

    Raises:
        FedbiomedDatasetManagerError: no DataLoadingPlan with this id exists in the database.
    """
    plan_record = self._dlp_table.get(self._database.dlp_id == dlp_id)

    # TODO: This exception should be removed once non-existing DLP situation is
    # handled by higher layers in Round or Node classes
    if plan_record is None:
        raise FedbiomedDatasetManagerError(
            f"{ErrorNumbers.FB315.value}: Non-existing DLP for the dataset."
        )

    block_ids = plan_record['loading_blocks'].values()
    block_records = self._dlp_table.search(self._database.dlb_id.one_of(block_ids))
    return plan_record, block_records
get_torch_dataset_shape ¶
get_torch_dataset_shape(dataset)
Gets info about dataset shape.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
dataset | Dataset | A Pytorch dataset | required |
Returns:
| Type | Description |
|---|---|
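List[int] | A list of int containing `[nb_of_data, dimension_of_first_input_data]`. Example for MNIST: `[60000, 1, 28, 28]`, where `nb_of_data` is 60000 and `dimension_of_first_input_data` is 1, 28, 28. |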
Source code in fedbiomed/node/dataset_manager.py
def get_torch_dataset_shape(self, dataset: torch.utils.data.Dataset) -> List[int]:
    """Describes a dataset by its length and the shape of its first input.

    Args:
        dataset: A Pytorch dataset

    Returns:
        A list of int made of the number of samples followed by the dimensions
        of the first element of the first sample.
        Example for MNIST: [60000, 1, 28, 28], i.e. 60000 samples whose
        first input has dimensions 1, 28, 28.
    """
    sample_count = len(dataset)
    # dataset[0] is the first sample; its element 0 is used as the input
    first_input = dataset[0][0]
    return [sample_count, *first_input.shape]
list_dlp ¶
list_dlp(target_dataset_type=None)
Return all existing DataLoadingPlans.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
target_dataset_type | Optional[str] | (str or None) if specified, return only dlps matching the requested target type. | None |
Returns:
| Type | Description |
|---|---|
List[dict] | An array of dict, each dict is a DataLoadingPlan |
Source code in fedbiomed/node/dataset_manager.py
def list_dlp(self, target_dataset_type: Optional[str] = None) -> List[dict]:
    """Lists the DataLoadingPlans stored in the database.

    Only entries having both a `dlp_id` and a `dlp_name` field are returned.

    Args:
        target_dataset_type: (str or None) if specified, return only dlps matching the requested target type.

    Returns:
        An array of dict, each dict is a DataLoadingPlan

    Raises:
        FedbiomedDatasetManagerError: `target_dataset_type` is neither None nor a string,
            or is not one of the values of `DatasetTypes`.
    """
    # No filter requested: return every named plan
    if target_dataset_type is None:
        return self._dlp_table.search(
            (self._database.dlp_id.exists()) & (self._database.dlp_name.exists()))

    if not isinstance(target_dataset_type, str):
        raise FedbiomedDatasetManagerError(f"Wrong input type for target_dataset_type. "
                                           f"Expected str, got {type(target_dataset_type)} instead.")

    known_types = [t.value for t in DatasetTypes]
    if target_dataset_type not in known_types:
        raise FedbiomedDatasetManagerError("target_dataset_type should be of the values defined in "
                                           "fedbiomed.common.constants.DatasetTypes")

    is_named_plan = (self._database.dlp_id.exists()) & (self._database.dlp_name.exists())
    return self._dlp_table.search(
        is_named_plan & (self._database.target_dataset_type == target_dataset_type))
list_my_data ¶
list_my_data(verbose=True)
Lists all datasets on the node.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
verbose | bool | Give verbose output. Defaults to True. | True |
Returns:
| Type | Description |
|---|---|
List[dict] | All datasets in the node's database. |
Source code in fedbiomed/node/dataset_manager.py
def list_my_data(self, verbose: bool = True) -> List[dict]:
    """Lists all datasets on the node.

    The `dtypes` field is removed from every returned entry.

    Args:
        verbose: Give verbose output (prints the datasets as a table). Defaults to True.

    Returns:
        All datasets in the node's database, without their `dtypes` field.
    """
    my_data = self._dataset_table.all()

    # Do not display dtypes. Entries that carry no `dtypes` field are
    # tolerated rather than aborting the whole listing with a KeyError.
    for doc in my_data:
        doc.pop('dtypes', None)

    if verbose:
        print(tabulate(my_data, headers='keys'))

    return my_data
load_as_dataloader ¶
load_as_dataloader(dataset)
Loads content of an image dataset.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
dataset | dict | Description of the dataset. | required |
Returns:
| Type | Description |
|---|---|
Dataset | Content of the dataset. |
Source code in fedbiomed/node/dataset_manager.py
def load_as_dataloader(self, dataset: dict) -> torch.utils.data.Dataset:
    """Loads the content of a registered image dataset.

    Only the `default` and `images` data types are handled.

    Args:
        dataset: Description of the dataset. The keys `data_type` and `path`
            are read, as well as `name` for a `default` dataset.

    Returns:
        Content of the dataset, or None when the data type is neither
        `default` nor `images`.
    """
    kind = dataset['data_type']

    if kind == 'default':
        return self.load_default_database(name=dataset['name'],
                                          path=dataset['path'],
                                          as_dataset=True)
    if kind == 'images':
        return self.load_images_dataset(folder_path=dataset['path'],
                                        as_dataset=True)
    return None
load_csv_dataset ¶
load_csv_dataset(path)
Loads a CSV dataset.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
path | str | Path to the CSV file. | required |
Returns:
| Type | Description |
|---|---|
DataFrame | Pandas DataFrame with the content of the file. |
Source code in fedbiomed/node/dataset_manager.py
def load_csv_dataset(self, path: str) -> pd.DataFrame:
    """Reads a CSV dataset through `read_csv`, with its default index column.

    Args:
        path: Path to the CSV file.

    Returns:
        Pandas DataFrame with the content of the file.
    """
    frame = self.read_csv(path)
    return frame
load_default_database ¶
load_default_database(name, path, as_dataset=False)
Loads a default dataset.
Currently, only MNIST dataset is used as the default dataset.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
name | str | Name of the default dataset. Currently, only MNIST is accepted. | required |
path | str | Pathfile to MNIST dataset. | required |
as_dataset | bool | Whether to return the complete dataset (True) or dataset dimensions (False). Defaults to False. | False |
Raises:
| Type | Description |
|---|---|
NotImplementedError | Name is not matching with the name of a default dataset. |
Returns:
| Type | Description |
|---|---|
Union[List[int], Dataset] | Depends on the value of the parameter `as_dataset`: if set to True, returns dataset (type: torch.utils.data.Dataset). If set to False, returns the size of the dataset stored inside a list (type: List[int]). |
Source code in fedbiomed/node/dataset_manager.py
def load_default_database(self,
                          name: str,
                          path: str,
                          as_dataset: bool = False) -> Union[List[int],
                                                             torch.utils.data.Dataset]:
    """Loads a default dataset.

    Currently, only MNIST dataset is used as the default dataset. The dataset
    is requested with `download=True` and `path` as root folder.

    Args:
        name: Name of the default dataset. Currently,
            only MNIST is accepted (any name containing `mnist`, case-insensitive).
        path: Pathfile to MNIST dataset.
        as_dataset: Whether to return
            the complete dataset (True) or dataset dimensions (False).
            Defaults to False.

    Raises:
        NotImplementedError: Name is not matching with
            the name of a default dataset.

    Returns:
        Depends on the value of the parameter `as_dataset`: If
        set to True, returns dataset (type: torch.utils.data.Dataset).
        If set to False, returns the size of the dataset stored inside
        a list (type: List[int]).
    """
    # Reject unsupported names first, before any dataset object is built
    if 'mnist' not in name.lower():
        raise NotImplementedError(f'Default dataset `{name}` has '
                                  'not been implemented.')

    dataset = datasets.MNIST(root=path, download=True, transform=transforms.ToTensor())

    if as_dataset:
        return dataset
    return self.get_torch_dataset_shape(dataset)
load_images_dataset ¶
load_images_dataset(folder_path, as_dataset=False)
Loads an image dataset.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
folder_path | str | Path to the directory containing the images. | required |
as_dataset | bool | Whether to return the complete dataset (True) or dataset dimensions (False). Defaults to False. | False |
Returns:
| Type | Description |
|---|---|
Union[List[int], Dataset] | Depends on the value of the parameter `as_dataset`: if set to True, returns dataset (type: torch.utils.data.Dataset). If set to False, returns the size of the dataset stored inside a list (type: List[int]). |
Source code in fedbiomed/node/dataset_manager.py
def load_images_dataset(self,
                        folder_path: str,
                        as_dataset: bool = False) -> Union[List[int],
                                                           torch.utils.data.Dataset]:
    """Loads an image dataset.

    The folder is opened as an `ImageFolder` whose samples are converted to tensors.

    Args:
        folder_path: Path to the directory containing the images.
        as_dataset: Whether to return
            the complete dataset (True) or dataset dimensions (False).
            Defaults to False.

    Raises:
        FedbiomedDatasetManagerError: the dataset could not be loaded from `folder_path`.

    Returns:
        Depends on the value of the parameter `as_dataset`: If
        set to True, returns dataset (type: torch.utils.data.Dataset).
        If set to False, returns the size of the dataset stored inside
        a list (type: List[int])
    """
    try:
        dataset = datasets.ImageFolder(folder_path,
                                       transform=transforms.ToTensor())
    except Exception as e:
        # Adjacent literals are used (no backslash continuation inside a string),
        # so that source indentation does not leak into the message.
        _msg = ErrorNumbers.FB315.value + \
            "\nThe following error was raised while loading dataset from the selected" \
            " path: " + str(e) + \
            "\nPlease make sure that the selected folder is not empty " \
            "and doesn't have any empty class folder"
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg) from e

    if as_dataset:
        return dataset
    return self.get_torch_dataset_shape(dataset)
load_mednist_database ¶
load_mednist_database(path, as_dataset=False)
Loads the MedNist dataset.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
path | str | Pathfile to save a local copy of the MedNist dataset. | required |
as_dataset | bool | Whether to return the complete dataset (True) or dataset dimensions (False). Defaults to False. | False |
Raises:
| Type | Description |
|---|---|
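FedbiomedDatasetManagerError | One of the following cases: tarfile cannot be downloaded; downloaded tarfile cannot be extracted; MedNIST path is empty; one of the classes path is empty. |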
Returns:
| Type | Description |
|---|---|
Tuple[Union[List[int], Dataset], str] | Tuple of 2 items. The first item depends on the value of the parameter `as_dataset`: if set to True, returns dataset (type: torch.utils.data.Dataset); if set to False, returns the size of the dataset stored inside a list (type: List[int]). The second item is the path used to download the MedNIST dataset, that needs to be saved as an entry in the dataset. |
Source code in fedbiomed/node/dataset_manager.py
def load_mednist_database(self,
                          path: str,
                          as_dataset: bool = False) -> Tuple[Union[List[int],
                                                                   torch.utils.data.Dataset], str]:
    """Loads the MedNist dataset.

    When `path` has no `MedNIST` sub-folder yet, the archive is downloaded
    from the MONAI test-data repository, extracted into `path` and then deleted.

    Args:
        path: Pathfile to save a local copy of the MedNist dataset.
        as_dataset: Whether to return
            the complete dataset (True) or dataset dimensions (False).
            Defaults to False.

    Raises:
        FedbiomedDatasetManagerError: One of the following cases:

            - tarfile cannot be downloaded
            - downloaded tarfile cannot
                be extracted
            - MedNIST path is empty
            - one of the classes path is empty

    Returns:
        Tuple of 2 items:

        First item Depends on the value of the parameter `as_dataset`: If
        set to True, returns dataset (type: torch.utils.data.Dataset).
        If set to False, returns the size of the dataset stored inside
        a list (type: List[int])

        Second item is the path used to download the MedNIST dataset, that needs to be saved as an
        entry in the dataset
    """
    download_path = os.path.join(path, 'MedNIST')

    if not os.path.isdir(download_path):
        url = "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/MedNIST.tar.gz"
        filepath = os.path.join(path, 'MedNIST.tar.gz')
        try:
            logger.info("Now downloading MEDNIST...")
            urlretrieve(url, filepath)
            with tarfile.open(filepath) as tar_file:
                logger.info("Now extracting MEDNIST...")
                # The archive comes from the network: when the running Python
                # supports extraction filters, use the 'data' filter so that
                # members cannot be written outside of `path`.
                if hasattr(tarfile, 'data_filter'):
                    tar_file.extractall(path, filter='data')
                else:
                    tar_file.extractall(path)
            os.remove(filepath)

        except (URLError, HTTPError, ContentTooShortError, OSError, tarfile.TarError,
                MemoryError) as e:
            _msg = ErrorNumbers.FB315.value + \
                "\nThe following error was raised while downloading MedNIST dataset " \
                "from the MONAI repo: " + str(e)
            logger.error(_msg)
            raise FedbiomedDatasetManagerError(_msg) from e

    try:
        dataset = datasets.ImageFolder(download_path,
                                       transform=transforms.ToTensor())

    except (FileNotFoundError, RuntimeError) as e:
        _msg = ErrorNumbers.FB315.value + \
            "\nThe following error was raised while loading MedNIST dataset from " \
            "the selected path: " + str(e) + \
            "\nPlease make sure that the selected MedNIST folder is not empty " \
            "or choose another path."
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg) from e

    except Exception as e:
        _msg = ErrorNumbers.FB315.value + \
            "\nThe following error was raised while loading MedNIST dataset: " + str(e)
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg) from e

    if as_dataset:
        return dataset, download_path
    return self.get_torch_dataset_shape(dataset), download_path
modify_database_info ¶
modify_database_info(dataset_id, modified_dataset)
Modifies a dataset in the database.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
dataset_id | str | ID of the dataset to modify. | required |
modified_dataset | dict | New dataset description to replace the existing one. | required |
Raises:
| Type | Description |
|---|---|
FedbiomedDatasetManagerError | conflicting tags with existing dataset |
Source code in fedbiomed/node/dataset_manager.py
def modify_database_info(self,
                         dataset_id: str,
                         modified_dataset: dict):
    """Updates the stored description of a dataset.

    When the new description contains `tags`, they are first checked against
    the tags of the other registered datasets.

    Args:
        dataset_id: ID of the dataset to modify.
        modified_dataset: New dataset description to replace the existing one.

    Raises:
        FedbiomedDatasetManagerError: conflicting tags with existing dataset
    """
    if 'tags' in modified_dataset:
        conflicting = self.search_conflicting_tags(modified_dataset['tags'])
        other_ids = [entry['dataset_id'] for entry in conflicting]

        # the dataset to modify is ignored (can conflict with its previous tags)
        try:
            other_ids.remove(dataset_id)
        except ValueError:
            pass

        if other_ids:
            prefix = f"{ErrorNumbers.FB322.value}, one or more registered dataset has conflicting tags: "
            other_names = [entry['name'] for entry in conflicting if entry['dataset_id'] != dataset_id]
            msg = prefix + f" {' '.join(other_names)}"
            logger.critical(msg)
            raise FedbiomedDatasetManagerError(msg)

    is_target = self._database.dataset_id == dataset_id
    self._dataset_table.update(modified_dataset, is_target)
obfuscate_private_information staticmethod ¶
obfuscate_private_information(database_metadata)
Remove privacy-sensitive information, to prepare for sharing with a researcher.
Removes any information that could be considered privacy-sensitive by the node. The typical use-case is to prevent sharing this information with a researcher through a reply message.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
database_metadata | Iterable[dict] | an iterable of metadata information objects, one per dataset. Each metadata object should be in the format of key-value pairs, such as e.g. a dict. | required |
Source code in fedbiomed/node/dataset_manager.py
@staticmethod
def obfuscate_private_information(database_metadata: Iterable[dict]) -> Iterable[dict]:
"""Remove privacy-sensitive information, to prepare for sharing with a researcher.
Removes any information that could be considered privacy-sensitive by the node. The typical use-case is to
prevent sharing this information with a researcher through a reply message.
Args:
database_metadata: an iterable of metadata information objects, one per dataset. Each metadata object
should be in the format af key-value pairs, such as e.g. a dict.
Returns:
the updated iterable of metadata information objects without privacy-sensitive information
"""
for d in database_metadata:
try:
# common obfuscations
d.pop('path', None)
# obfuscations specific for each data type
if 'data_type' in d:
if d['data_type'] == 'medical-folder':
if 'dataset_parameters' in d:
d['dataset_parameters'].pop('tabular_file', None)
except AttributeError:
raise FedbiomedDatasetManagerError(f"Object of type {type(d)} does not support pop or getitem method "
f"in obfuscate_private_information.")
return database_metadata
read_csv ¶
read_csv(csv_file, index_col=None)
Gets content of a CSV file.
Reads a *.csv file and outputs its data into a pandas DataFrame. Finds automatically the CSV delimiter by parsing the first line.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
csv_file | str | File name / path | required |
index_col | Union[int, None] | Column that contains CSV file index. Defaults to None. | None |
Returns:
| Type | Description |
|---|---|
DataFrame | Pandas DataFrame with data contained in CSV file. |
Source code in fedbiomed/node/dataset_manager.py
def read_csv(self, csv_file: str, index_col: Union[int, None] = None) -> pd.DataFrame:
    """Loads a *.csv file into a pandas DataFrame.

    The delimiter is sniffed from the first line of the file, and the presence
    of a header row is sniffed from the whole file content.

    Args:
        csv_file: File name / path
        index_col: Column that contains CSV file index.
            Defaults to None.

    Returns:
        Pandas DataFrame with data contained in CSV file.
    """
    sniffer = csv.Sniffer()

    # Automatically identify separator and header
    with open(csv_file, 'r') as handle:
        first_line = handle.readline()
        delimiter = sniffer.sniff(first_line).delimiter
        handle.seek(0)
        content = handle.read()
        has_header = sniffer.has_header(content)

    header = 0 if has_header else None
    return pd.read_csv(csv_file, index_col=index_col, sep=delimiter, header=header)
remove_database ¶
remove_database(dataset_id)
Removes a dataset from database.
Only the dataset matching the dataset_id should be removed.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
dataset_id | str | Dataset unique ID. | required |
Source code in fedbiomed/node/dataset_manager.py
def remove_database(self, dataset_id: str):
    """Deletes the dataset entry registered under `dataset_id`.

    Only the dataset matching the `dataset_id` should be removed.

    Args:
        dataset_id: Dataset unique ID.

    Raises:
        FedbiomedDatasetManagerError: no dataset with this id was found.
    """
    # TODO: check that there is no more than one dataset with `dataset_id` (consistency, should not happen)
    # With `add_docs=True` the table returns a pair; its second item is the
    # document, whose `doc_id` is needed for the removal.
    _, found = self._dataset_table.get(self._database.dataset_id == dataset_id, add_docs=True)

    if not found:
        _msg = ErrorNumbers.FB322.value + f": No dataset found with id {dataset_id}"
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)

    self._dataset_table.remove(doc_ids=[found.doc_id])
remove_dlp_by_id ¶
remove_dlp_by_id(dlp_id)
Removes a data loading plan (DLP) from the database.
Only DLP with matching ID is removed from the database. There should be at most one.
If remove_dlbs is True, also remove the attached DLBs. You should ensure they are not used by another DLP, no verification is made.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
dlp_id | str | the DataLoadingPlan id | required |
Source code in fedbiomed/node/dataset_manager.py
def remove_dlp_by_id(self, dlp_id: str):
    """Removes a data loading plan (DLP) from the database.

    Only DLP with matching ID is removed from the database. There should be at most one.

    The data loading blocks (DLBs) attached to the DLP are removed as well. You should ensure
    they are not used by another DLP, no verification is made.

    Args:
        dlp_id: the DataLoadingPlan id

    Raises:
        FedbiomedDatasetManagerError: `dlp_id` is not a non-empty string, no DLP with this id
            exists, or the removal failed.
    """
    if not isinstance(dlp_id, str):
        _msg = ErrorNumbers.FB316.value + f": Bad type for dlp '{type(dlp_id)}', expecting str"
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)
    # Test the argument itself: the builtin `str` type is always truthy, so
    # testing it would let an empty id through.
    if not dlp_id:
        _msg = ErrorNumbers.FB316.value + ": Bad value for dlp, expecting non empty str"
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)

    # Collect the attached blocks before the plan entry is deleted
    _, dlbs = self.get_dlp_by_id(dlp_id)
    try:
        self._dlp_table.remove(self._database.dlp_id == dlp_id)
        for dlb in dlbs:
            self._dlp_table.remove(self._database.dlb_id == dlb['dlb_id'])
    except Exception as e:
        _msg = ErrorNumbers.FB316.value + f": Error during remove of DLP {dlp_id}: {e}"
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg) from e
save_data_loading_block ¶
save_data_loading_block(dlb)
Source code in fedbiomed/node/dataset_manager.py
def save_data_loading_block(self, dlb: DataLoadingBlock) -> None:
    """Stores the serialized form of a DataLoadingBlock in the DLP table.

    Args:
        dlb: the DataLoadingBlock to be saved.
    """
    # seems unused
    serialized_block = dlb.serialize()
    self._dlp_table.insert(serialized_block)
save_data_loading_plan ¶
save_data_loading_plan(data_loading_plan)
Save a DataLoadingPlan to the database.
This function saves a DataLoadingPlan to the database, and returns its ID.
Raises:
| Type | Description |
|---|---|
FedbiomedDatasetManagerError | bad data loading plan name (size, not unique) |
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
data_loading_plan | Optional[DataLoadingPlan] | the DataLoadingPlan to be saved, or None. | required |
Returns:
| Type | Description |
|---|---|
Union[str, None] | The `dlp_id` if a DLP was saved, or None |
Source code in fedbiomed/node/dataset_manager.py
def save_data_loading_plan(self,
                           data_loading_plan: Optional[DataLoadingPlan]
                           ) -> Union[str, None]:
    """Stores a DataLoadingPlan and its loading blocks in the database.

    The plan is serialized; its metadata and the metadata of its loading
    blocks are then inserted in the DLP table.

    Raises:
        FedbiomedDatasetManagerError: bad data loading plan name (size, not unique)

    Args:
        data_loading_plan: the DataLoadingPlan to be saved, or None.

    Returns:
        The `dlp_id` if a DLP was saved, or None
    """
    if data_loading_plan is None:
        return None

    def _reject(reason: str):
        # Log the refusal, then raise it, with the common message prefix
        _msg = ErrorNumbers.FB316.value + ": Cannot save data loading plan, " + reason
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)

    if len(data_loading_plan.desc) < 4:
        _reject("DLP name needs to have at least 4 characters.")

    # The plan's `desc` is what is compared with the stored `dlp_name`
    same_name_plans = self._dlp_table.search(
        (self._database.dlp_id.exists()) & (self._database.dlp_name.exists()) &
        (self._database.dlp_name == data_loading_plan.desc))
    if same_name_plans:
        _reject("DLP name needs to be unique.")

    plan_metadata, blocks_metadata = data_loading_plan.serialize()
    self._dlp_table.insert(plan_metadata)
    self._dlp_table.insert_multiple(blocks_metadata)
    return data_loading_plan.dlp_id
search_by_tags ¶
search_by_tags(tags)
Searches for data with given tags.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
tags | Union[tuple, list] | List of tags | required |
Returns:
| Type | Description |
|---|---|
list | The list of matching datasets |
Source code in fedbiomed/node/dataset_manager.py
def search_by_tags(self, tags: Union[tuple, list]) -> list:
    """Finds the registered datasets matching the given tags.

    The search is delegated to the `all` test of the query on the `tags` field.

    Args:
        tags: List of tags

    Returns:
        The list of matching datasets
    """
    carries_all_tags = self._database.tags.all(tags)
    return self._dataset_table.search(carries_all_tags)
search_conflicting_tags ¶
search_conflicting_tags(tags)
Searches for registered data that have conflicting tags with the given tags
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
tags | Union[tuple, list] | List of tags | required |
Returns:
| Type | Description |
|---|---|
list | The list of conflicting datasets |
Source code in fedbiomed/node/dataset_manager.py
def search_conflicting_tags(self, tags: Union[tuple, list]) -> list:
    """Finds the registered datasets whose tags conflict with the given tags.

    Two tag lists conflict when every tag of one of them is also present in the other.

    Args:
        tags: List of tags

    Returns:
        The list of conflicting datasets
    """
    def _is_conflicting(stored_tags):
        # the given tags are all contained in the stored ones ...
        if all(tag in stored_tags for tag in tags):
            return True
        # ... or the stored tags are all contained in the given ones
        return all(tag in tags for tag in stored_tags)

    return self._dataset_table.search(self._database.tags.test(_is_conflicting))