69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321 | class Project:
"""
Manages the setup of an AutoMIL project.
The Project class is responsible for:
- Modifying the annotation file to conform to the expected slideflow format
- Creating the project directory structure
- Creating or loading a Slideflow project instance
- Exposing project attributes to downstream processes
A Project instance must be prepared before training, evaluation,
or prediction can be performed.
"""
def __init__(
self,
project_dir: Path | str,
annotations_file: Path | str,
slide_dir: Path | str,
patient_column: str,
label_column: str,
slide_column: str | None = None,
transform_labels: bool = False,
verbose: bool = True
) -> None:
"""Initializes a Project instance.
This metod itself does not create or modify files or directories. To prepare a directory to house
a project, call :meth:`prepare_project`
Args:
project_dir (Path | str): Directory in which to set up project
annotations_file (Path | str): annotations file
slide_dir (Path | str): Slide directory
patient_column (str): column containing patient identifiers
label_column (str): column containing labels
slide_column (str | None, optional): column containing slide identifiers. Defaults to None.
transform_labels (bool, optional): Whether to transform labels to a float mapping. Defaults to False.
verbose (bool, optional): Whether to log verbose messages. Defaults to True.
"""
self.project_dir: Path = Path(project_dir)
self.annotations_file: Path = Path(annotations_file)
self.slide_dir: Path = Path(slide_dir)
self.modified_annotations_file: Path = self.project_dir / "annotations.csv"
self.patient_column = patient_column
self.label_column = label_column
self.slide_column = slide_column
self.transform_labels = transform_labels
self.vlog = get_vlog(verbose)
# === Properties === #
@cached_property
def required_columns(self) -> set[str]:
"""
Set of required columns expected in the annotation file.
Includes:
- Patient identifier column
- Label column
- Slide identifier column (if provided)
Returns:
Set of required columns
"""
required = {self.patient_column, self.label_column}
if self.slide_column:
required.add(self.slide_column)
return required
@property
def label_map(self) -> dict | list[str]:
"""
Mapping between original labels and model-ready labels.
The mapping is created during project scaffold setup.
Returns:
dict:
Mapping from label to float if ``transform_labels=True`` or a list of unique labels otherwise.
Raises:
AttributeError:
If the project scaffold has not been set up yet.
"""
if not hasattr(self, '_label_map'):
raise AttributeError(
"Label map has not been set up yet. Call setup_project_scaffold() first."
)
return self._label_map
@property
def slide_ids(self) -> list[str]:
"""List of unique slide identifiers from the modified annotations file.
Returns:
List of unique slide IDs.
"""
if not hasattr(self, 'modified_annotations'):
raise AttributeError(
"Modified annotations have not been set up yet. Call setup_project_scaffold() first."
)
return self.modified_annotations["slide"].astype(str).unique().tolist()
# === Public Methods === #
def setup_project_scaffold(self) -> None:
"""
Creates the project directory and normalizes annotations.
This method:
- Creates the project directory if it does not exist
- Normalizes the annotation file to Slideflow format
- Generates and stores the label mapping
"""
self._setup_project_folder()
self.modified_annotations = self._setup_annotations()
self._label_map = self._setup_label_map()
self.vlog(f"[{SUCCESS_CLR}]Project scaffold setup complete[/]")
def prepare_project(self) -> sf.Project:
"""
Sets up the project directory structure, modifies and stores annotations, and creates or loads
a Slideflow project.
This method:
1. Creates the project folder if necessary.
2. Normalizes and saves annotations to project_dir/annotations.csv.
3. Creates a new Slideflow project or loads an existing one.
Returns:
sf.Project: A slideflow project instance
"""
# Setup project folder and annotations
self.setup_project_scaffold()
# Load or create project
if is_project(str(self.project_dir)):
self.vlog(f"Loading existing project at [{INFO_CLR}]{self.project_dir}[/]")
self.project = sf.load_project(str(self.project_dir))
else:
self.vlog(f"Creating new project at [{INFO_CLR}]{self.project_dir}[/]")
self.project = sf.create_project(
name="AutoMIL",
root=str(self.project_dir),
slides=str(self.slide_dir),
annotations=str(self.modified_annotations_file),
)
return self.project
def summary(self) -> None:
"""Prints a simple summary of the Project Instance in a tabular format"""
vlog = self.vlog
rows = [
("Project Directory:", str(self.project_dir)),
("Slide Directory:", str(self.slide_dir)),
("Annotations File:", str(self.annotations_file)),
("Patient Column:", self.patient_column),
("Label Column:", self.label_column),
("Slide Column:", self.slide_column or "None (using patient ID)"),
("Transform Labels:", str(self.transform_labels)),
("Modified Annotations:", str(self.modified_annotations_file) or "Not yet created"),
("Slideflow Project:", "Loaded" if self.project else "Not initialized"),
]
vlog("[bold underline]Project Summary[/]")
vlog(render_kv_table(rows, width=256))
# === Internals === #
def _setup_project_folder(self) -> None:
"""
Ensures the project directory exists.
Creates the directory and parent directories if necessary.
"""
if not self.project_dir.exists():
self.project_dir.mkdir(parents=True, exist_ok=True)
self.vlog(f"Created project directory at [{INFO_CLR}]{self.project_dir}[/]")
else:
self.vlog(f"Project directory [{INFO_CLR}]{self.project_dir}[/] already exists")
def _setup_annotations(self) -> pd.DataFrame:
"""
Normalizes the input annotations file to the required format and set up label map.
This includes:
- Validating the presence of required columns.
- Renaming the patient and label columns to `patient` and `label`.
- Creating or renaming the `slide` column.
- Optionally transforming labels to float encodings.
- Creating and storing the label map for later use.
- Saving the normalized file to project_dir/annotations.csv.
AutoMIL requires the annotations file to have the following columns:
- patient | contains patient identifiers
- slide | contains slide identifiers
- label | contains labels
Raises:
ValueError:
If required columns are missing.
IOError:
If the output annotations file cannot be written.
"""
# Make sure given columns exist
if (missing := contains_columns(self.annotations_file, self.required_columns, return_missing=True)):
raise ValueError(f"Annotations file is missing required columns: {missing}")
# Load annotations
annotations = pd.read_csv(self.annotations_file, index_col=self.patient_column)
annotations.index.name = "patient"
# Renaming the slide column if provided, otherwise just use the patient column as slide identifier
if not self.slide_column:
annotations["slide"] = annotations.index
else:
annotations.rename(columns={self.slide_column: "slide"}, inplace=True)
# Rename label column
annotations.rename(columns={self.label_column: "label"}, inplace=True)
# Save modified annotations
out_path = self.modified_annotations_file
annotations.to_csv(out_path, index=True)
if not out_path.exists():
raise IOError(f"Failed to write annotations file: {out_path}")
if annotations.empty:
self.vlog("Warning: annotation file written but is empty.")
self.vlog(f"Annotations saved to [{INFO_CLR}]{out_path}[/]")
return annotations
def _setup_label_map(self) -> dict | list[str]:
"""Sets up the label map based on the modified annotations file.
Returns:
dict | list[str]: The label map (dict if transform_labels=True, else list of unique labels).
"""
annotations = self.modified_annotations
labels = annotations["label"].unique()
# Transform labels to float values and store the mapping
if self.transform_labels:
label_map = {label: float(i) for i, label in enumerate(sorted(labels))}
pretty = ", ".join(f"{k}: {v}" for k, v in label_map.items())
self.vlog(f"Transformed labels to float values: [{INFO_CLR}]{pretty}[/]")
else:
# Store unique labels as sorted list
label_map = sorted(labels.astype(str).tolist())
return label_map
|