`declearn.quickrun.parse_data_folder`

Parse the contents of a data folder into a nested dict of file paths.

This function expects the folder to abide by the following standard:

folder/
└─── data*/
    └─── client*/
    │      train_data.* - training data
    │      train_target.* - training labels
    │      valid_data.* - validation data
    │      valid_target.* - validation labels
    └─── client*/
    │    ...

Parameters:

Name	Type	Description	Default
`data_config`	`DataSourceConfig`	DataSourceConfig instance; see its documentation for details.	required
`folder`	`Optional[str]`	The main experiment folder in which to look for a `data*` folder. Overridden by `data_config.data_folder` when specified.	`None`

Returns:

Name	Type	Description
`paths`	`Dict[str, Dict[str, str]]`	Nested dictionary containing the parsed file paths, with structure `{client_name: {file_key_name: file_path}}`, where the key names are always the same: "train_data", "train_target", "valid_data" and "valid_target".

Source code in declearn/quickrun/_parser.py

def parse_data_folder(
    data_config: DataSourceConfig,
    folder: Optional[str] = None,
) -> Dict[str, Dict[str, str]]:
    """Map each client of a data folder to its four dataset file paths.

    The data folder is expected to be laid out as follows:

        folder/
        └─── data*/
            └─── client*/
            │      train_data.* - training data
            │      train_target.* - training labels
            │      valid_data.* - validation data
            │      valid_target.* - validation labels
            └─── client*/
            │    ...

    Parameters
    ----------
    data_config: DataSourceConfig
        Configuration object from which the `data_folder`, `client_names`
        and `dataset_names` attributes are read; see its documentation
        for details.
    folder: str or None
        The main experiment folder in which to look for a `data*` folder.
        Overridden by `data_config.data_folder` when specified.

    Returns
    -------
    paths:
        Nested dict of parsed file paths (as POSIX-style strings), with
        structure `{client_name: {file_key_name: file_path}}`, where the
        key names are always the same: "train_data", "train_target",
        "valid_data" and "valid_target".

    Raises
    ------
    ValueError
        If `data_config.dataset_names` is non-empty but its keys are not
        exactly the four expected ones, or if a client folder holds zero
        or more than one file matching one of the dataset names.
    """
    expected_keys = [
        "train_data",
        "train_target",
        "valid_data",
        "valid_target",
    ]

    def find_unique_file(client: str, client_dir, prefix: str) -> str:
        """Return the single file in `client_dir` named `prefix*`."""
        # Any regular file whose name starts with the prefix is a match;
        # sub-directories are ignored.
        matches = [
            entry
            for entry in client_dir.glob(f"{prefix}*")
            if entry.is_file()
        ]
        if len(matches) == 0:
            raise ValueError(
                f"Could not find a '{prefix}.*' file for client '{client}'."
            )
        if len(matches) > 1:
            raise ValueError(
                f"Found multiple '{prefix}.*' files for client '{client}'."
            )
        return matches[0].as_posix()

    # Resolve the root data folder, then the clients to look up in it.
    root = get_data_folder_path(data_config.data_folder, folder)
    unique_clients = list(
        dict.fromkeys(list_client_names(root, data_config.client_names))
    )
    # Decide which file-name prefix stands for each expected key: either
    # the user-provided mapping (validated), or the key names themselves.
    prefixes = data_config.dataset_names
    if not prefixes:
        prefixes = dict(zip(expected_keys, expected_keys))
    elif set(expected_keys) != prefixes.keys():
        raise ValueError(
            "Please provide a properly formatted dictionnary as input, "
            f"using the following keys: {expected_keys}"
        )
    # Collect the file paths, one client folder at a time.
    parsed: Dict[str, Dict[str, str]] = {}
    for client in unique_clients:
        client_dir = root.joinpath(client)
        parsed[client] = {
            key: find_unique_file(client, client_dir, prefix)
            for key, prefix in prefixes.items()
        }
    return parsed