Skip to content

declearn.quickrun.parse_data_folder

Parse the contents of a data folder into a nested dict of file paths.

This function expects the folder to abide by the following standard:

folder/
└─── data*/
    └─── client*/
    │      train_data.* - training data
    │      train_target.* - training labels
    │      valid_data.* - validation data
    │      valid_target.* - validation labels
    └─── client*/
    │    ...

Parameters:

Name Type Description Default
data_config DataSourceConfig

DataSourceConfig instance; see its documentation for details.

required
folder Optional[str]

The main experiment folder in which to look for a data* folder. Overridden by data_config.data_folder when specified.

None

Returns:

Name Type Description
paths Dict[str, Dict[str, str]]

Nested dictionary containing the parsed file paths, with structure {client_name: {file_key_name: file_path}}, where the key names are always the same: "train_data", "train_target", "valid_data" and "valid_target".

Source code in declearn/quickrun/_parser.py
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
def parse_data_folder(
    data_config: DataSourceConfig,
    folder: Optional[str] = None,
) -> Dict[str, Dict[str, str]]:
    """Map each client of a data folder to the paths of its dataset files.

    The folder is expected to be organized as follows:

        folder/
        └─── data*/
            └─── client*/
            │      train_data.* - training data
            │      train_target.* - training labels
            │      valid_data.* - validation data
            │      valid_target.* - validation labels
            └─── client*/
            │    ...

    Parameters
    ----------
    data_config: DataSourceConfig
        DataSourceConfig instance; see its documentation for details.
        Its `data_folder`, `client_names` and `dataset_names` attributes
        are read by this function.
    folder: str or None
        The main experiment folder in which to look for a `data*` folder.
        Overridden by `data_config.data_folder` when specified.

    Returns
    -------
    paths:
        Nested dictionary containing the parsed file paths, with structure
        `{client_name: {file_key_name: file_path}}`, where the key names
        are always the same: "train_data", "train_target", "valid_data"
        and "valid_target".

    Raises
    ------
    ValueError
        If `data_config.dataset_names` is non-empty but its keys differ
        from the four expected ones, or if a client's folder does not
        hold exactly one file whose name starts with an expected name.
    """
    # Resolve the root data folder, then the clients it holds.
    root = get_data_folder_path(data_config.data_folder, folder)
    names = list_client_names(root, data_config.client_names)
    # Work out which file-name prefix each expected key maps to.
    expected = [
        "train_data",
        "train_target",
        "valid_data",
        "valid_target",
    ]
    prefixes = data_config.dataset_names
    if not prefixes:
        # No custom naming: each key doubles as its own file-name prefix.
        prefixes = dict(zip(expected, expected))
    elif set(expected) != prefixes.keys():
        raise ValueError(
            "Please provide a properly formatted dictionnary as input, "
            f"using the following keys: {expected}"
        )
    # Look up, for each client, the single file matching each prefix.
    parsed: Dict[str, Dict[str, str]] = {}
    # `dict.fromkeys` drops duplicated client names, preserving order.
    for client in dict.fromkeys(names):
        client_dir = root.joinpath(client)
        located: Dict[str, str] = {}
        for key, prefix in prefixes.items():
            matches = [
                path
                for path in client_dir.glob(f"{prefix}*")
                if path.is_file()
            ]
            if len(matches) != 1:
                if matches:
                    raise ValueError(
                        f"Found multiple '{prefix}.*' files "
                        f"for client '{client}'."
                    )
                raise ValueError(
                    f"Could not find a '{prefix}.*' file "
                    f"for client '{client}'."
                )
            located[key] = matches[0].as_posix()
        parsed[client] = located
    # Return the nested dictionary of parsed file paths.
    return parsed