Parse the contents of a data folder into a nested dict of file paths.
This function expects the folder to abide by the following standard:
folder/
└─── data*/
└─── client*/
│ train_data.* - training data
│ train_target.* - training labels
│ valid_data.* - validation data
│ valid_target.* - validation labels
└─── client*/
│ ...
Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `data_config` | `DataSourceConfig` | DataSourceConfig instance; see its documentation for details. | *required* |
| `folder` | `Optional[str]` | The main experiment folder in which to look for a `data*` folder. Overridden by `data_config.data_folder` when specified. | `None` |
Returns:

| Name | Type | Description |
|------|------|-------------|
| `paths` | `Dict[str, Dict[str, str]]` | Nested dictionary containing the parsed file paths, with structure `{client_name: {file_key_name: file_path}}`, where the key names are always the same: `"train_data"`, `"train_target"`, `"valid_data"` and `"valid_target"`. |
Source code in `declearn/quickrun/_parser.py`:
def parse_data_folder(
    data_config: DataSourceConfig,
    folder: Optional[str] = None,
) -> Dict[str, Dict[str, str]]:
    """Parse the contents of a data folder into a nested dict of file paths.

    This function expects the folder to abide by the following standard:

        folder/
        └─── data*/
            └─── client*/
            │      train_data.* - training data
            │      train_target.* - training labels
            │      valid_data.* - validation data
            │      valid_target.* - validation labels
            └─── client*/
            │    ...

    Parameters
    ----------
    data_config: DataSourceConfig
        DataSourceConfig instance; see its documentation for details.
    folder: str or None
        The main experiment folder in which to look for a `data*` folder.
        Overridden by `data_config.data_folder` when specified.

    Returns
    -------
    paths:
        Nested dictionary containing the parsed file paths, with structure
        `{client_name: {file_key_name: file_path}}`, where the key names
        are always the same: "train_data", "train_target", "valid_data"
        and "valid_target".

    Raises
    ------
    ValueError
        If `data_config.dataset_names` is provided but does not use exactly
        the four expected keys, or if an expected file is missing or matched
        multiple times for a given client.
    """
    # Identify the root data folder.
    data_folder = get_data_folder_path(data_config.data_folder, folder)
    # Identify clients' data folders.
    client_names = list_client_names(data_folder, data_config.client_names)
    clients = {c: {} for c in client_names}  # type: Dict[str, Dict[str, str]]
    # Set up a mapping between expected file keys and their naming.
    data_items = [
        "train_data",
        "train_target",
        "valid_data",
        "valid_target",
    ]
    dataset_names = data_config.dataset_names
    if dataset_names:
        # Custom file names must cover exactly the four expected keys.
        if set(data_items) != dataset_names.keys():
            raise ValueError(
                "Please provide a properly formatted dictionary as input, "
                f"using the following keys: {data_items}"
            )
    else:
        # Default: files are named after the keys themselves.
        dataset_names = {name: name for name in data_items}
    # Gather client-wise file paths.
    for client, paths in clients.items():
        client_dir = data_folder.joinpath(client)
        for key, val in dataset_names.items():
            # NOTE(review): the glob pattern is `{val}*` (no dot), so e.g.
            # "train_data_old.csv" would also match and trigger the
            # multiple-files error below; the error messages display
            # `{val}.*` — confirm which form is intended.
            files = [p for p in client_dir.glob(f"{val}*") if p.is_file()]
            if not files:
                raise ValueError(
                    f"Could not find a '{val}.*' file for client '{client}'."
                )
            if len(files) > 1:
                raise ValueError(
                    f"Found multiple '{val}.*' files for client '{client}'."
                )
            paths[key] = files[0].as_posix()
    # Return the nested dictionary of parsed file paths.
    return clients
|