Evaluator

automil.evaluation.Evaluator is responsible for evaluating trained MIL models, computing classification metrics, optionally generating ensemble predictions and producing comparison plots.

Evaluator

Evaluates trained MIL models.

The Evaluator supports

Single-model evaluation
Batch evaluation of multiple trained models
Ensemble prediction generation
Metrics calculation (AUC, AP, Accuracy, F1)
Comparative plotting across models

Source code in automil/evaluation.py

class Evaluator:
    """
    Evaluates trained MIL models.

    The Evaluator supports:
        - Single-model evaluation
        - Batch evaluation of multiple trained models
        - Ensemble prediction generation
        - Metrics calculation (AUC, AP, Accuracy, F1)
        - Comparative plotting across models
    """
    def __init__(self,
        dataset: sf.Dataset,
        model_dir: Path,
        out_dir: Path,
        bags_dir: Path,
        verbose: bool = True
    ) -> None:
        """Initializes a Evaluator Instance

        Args:
            dataset (sf.Dataset): Slideflow dataset
            model_dir (Path): Directory in which to store trained models
            out_dir (Path): Diectory in which to store results such as predictions
            bags_dir (Path): Directory with feature bags
            verbose (bool, optional): Whether to print verbose messages. Defaults to True.
        """
        self.dataset = dataset
        self.vlog = get_vlog(verbose)

        # Path Setup
        self.model_dir = model_dir
        self.out_dir = out_dir
        self.bags_dir = bags_dir


    def load_predictions(self, model_path: Path) -> pd.DataFrame:
        """
        Loads and validates prediction outputs from a trained model directory.

        The predictions file must contain:
        - One or more probability columns starting with ``y_pred``
        - Base columns ``slide`` and ``y_true``

        Args:
            model_path (Path): Path to a trained model directory.

        Raises:
            FileNotFoundError: If ``predictions.parquet`` is missing.
            ValueError: If required prediction or base columns are absent.

        Returns:
            pd.DataFrame: Loaded and validated predictions.
        """
        if not (predictions_path := model_path / "predictions.parquet").exists():
            raise FileNotFoundError(f"{model_path} does not contain a 'predictions.parquet' file")

        predictions = pd.read_parquet(predictions_path)

        all_columns = [column for column in predictions.columns]
        # We expect columns containing prediction probabilites to start with 'y_pred' (e.g 'y_pred0', 'y_pred1', ...)
        pred_columns = [column for column in all_columns if column.startswith("y_pred")]
        # Similarly, we expect predictions to contain 'slide' and 'y_true' columns
        base_columns = ["slide", "y_true"]

        if not pred_columns:
            raise ValueError("'predictions.parquet' does not contain the expected prediction columns")
        elif not all(base_column in all_columns for base_column in base_columns):
            raise ValueError("'predictions.parquet' does not contain the expected base columns")

        return predictions

    def calculate_metrics(
        self,
        predictions: pd.DataFrame | Path | str
    ) -> dict[str, float | np.ndarray]:
        """
        Computes classification metrics from prediction outputs.

        Supports both binary and multi-class classification and automatically
        detects ensemble predictions when present.
        """
        import re

        # Ensure DataFrame
        match predictions:
            case Path() | str():
                predictions = self.load_predictions(Path(predictions))
            case pd.DataFrame():
                pass

        if "y_true" not in predictions.columns:
            raise ValueError("Predictions must contain a 'y_true' column")

        y_true = predictions["y_true"].astype(int)

        # Detect probability columns
        pred_columns = [c for c in predictions.columns if c.startswith("y_pred")]

        ensemble_columns = sorted(
            [c for c in pred_columns if re.match(r"^y_pred\d+_ensemble$", c)],
            key=lambda x: int(match.group()) if (match := re.search(r"\d+", x)) else 0
        )

        if ensemble_columns:
            prob_columns = ensemble_columns
            prediction_type = "ensemble"
        else:
            prob_columns = sorted(
                [c for c in pred_columns if re.match(r"^y_pred\d+$", c)],
                key=lambda x: int(x.replace("y_pred", ""))
            )
            prediction_type = "single model"

        if not prob_columns:
            raise ValueError(
                f"No probability columns found for {prediction_type} predictions.\n"
                f"Available columns: {list(predictions.columns)}"
            )

        # Get probability matrix
        num_classes = len(prob_columns)
        prob_matrix = predictions[prob_columns].values

        # Normalize probabilities (safety)
        row_sums = prob_matrix.sum(axis=1, keepdims=True)
        row_sums[row_sums == 0] = 1
        prob_matrix = prob_matrix / row_sums

        # Predicted labels
        if "y_pred_label" in predictions.columns:
            y_pred = predictions["y_pred_label"].astype(int)
        else:
            y_pred = np.argmax(prob_matrix, axis=1)

        # --- BBasic Metrics --- #
        accuracy = float(accuracy_score(y_true, y_pred))
        cm = confusion_matrix(y_true, y_pred, labels=range(num_classes))

        # --- Binary Classification --- #
        if num_classes == 2:

            y_probs = prob_matrix[:, 1]

            if len(np.unique(y_true)) > 1:
                auc = float(roc_auc_score(y_true, y_probs))
                ap = float(average_precision_score(y_true, y_probs))
            else:
                auc = float("nan")
                ap = float("nan")

            f1 = float(f1_score(y_true, y_pred))

        # --- Multiclass Classification --- #
        else:

            # AUC
            try:
                auc = float(
                    roc_auc_score(
                        y_true,
                        prob_matrix,
                        multi_class="ovr",
                        average="macro",
                        labels=range(num_classes)
                    )
                )
            except ValueError:
                auc = float("nan")

            # AP
            ap_scores = []

            for class_idx in range(num_classes):

                y_true_binary = (y_true == class_idx).astype(int)
                y_probs_class = prob_matrix[:, class_idx]

                if len(np.unique(y_true_binary)) > 1:
                    ap_scores.append(
                        average_precision_score(y_true_binary, y_probs_class)
                    )

            ap = float(np.mean(ap_scores)) if ap_scores else float("nan")

            # F1
            f1 = float(
                f1_score(
                    y_true,
                    y_pred,
                    average="macro",
                    labels=range(num_classes)
                )
            )

        # --- Per-Class Accuracy --- #
        with np.errstate(divide="ignore", invalid="ignore"):
            per_class_accuracy = cm.diagonal() / cm.sum(axis=1)

        return {
            "Accuracy": accuracy,
            "AUC": auc,
            "AP": ap,
            "F1": f1,
            "ConfusionMatrix": cm,
            "PerClassAccuracy": per_class_accuracy
        }

    def evaluate_models(
        self,
        model_dir: Path | None = None,
        bags_dir: Path | None = None,
        out_dir: Path | None = None,
        generate_attention_heatmaps: bool = False
    ) -> None:
        """
        Evaluates one or more trained models.

        Detects each trained model directory inside `model_dir` and evaluates them independently.
        Predictions and metrics are written to the output directory.

        Args:
            model_dir (Path | None, optional): Model directory or parent directory of models to evaluate.
            bags_dir (Path | None, optional): Feature bag directory.
            out_dir (Path | None, optional): Output directory.
            generate_attention_heatmaps (bool, optional): Generate attention heatmaps.
        """
        # Default to instance variables if none provided
        model_dir = model_dir or self.model_dir
        bags_dir = bags_dir or self.bags_dir
        out_dir = out_dir or self.out_dir

        # Check if model_dir is a single model directory
        if is_model_directory(model_dir):
            model_paths = [model_dir]
            self.vlog(f"Single model directory detected: {model_dir}")
        # Else, collect all model subdirectories
        else:
            if not (model_paths := [subdir for subdir in model_dir.iterdir() if subdir.is_dir() and is_model_directory(subdir)]):
                self.vlog(f"No model directories found in {model_dir}", LogLevel.WARNING)
                return

        # Iterate over each model directory and evaluate
        for model_idx, model_path in enumerate(model_paths):
            self.vlog(f"Evaluating model [{INFO_CLR}]{model_idx+1}[/]/[{INFO_CLR}]{len(model_paths)}[/]: [{INFO_CLR}]{model_path}[/]")
            try:
                eval_mil(
                    weights=str(model_path),
                    bags=str(bags_dir),
                    dataset=self.dataset,
                    outcomes="label",
                    outdir=str(out_dir),
                    attention_heatmaps=generate_attention_heatmaps
                )
                self.vlog("Evaluation complete.\n")
            except Exception as e:
                self.vlog(f"Error evaluating model at {model_path}: {e}", LogLevel.ERROR)
                continue

    def generate_predictions(
        self,
        model_dir: Path | None = None,
        bags_dir: Path | None = None,
        out_dir: Path | None = None
    ) -> None:
        """
        Generates prediction outputs for one or more trained models.

        Predictions are saved per model in ``predictions.parquet`` format.

        Args:
            model_dir (Path | None, optional): Directory containing model subdirectories.
            bags_dir (Path | None, optional): Feature bag directory.
            out_dir (Path | None, optional): Output directory.
        """
        # Default to instance variables if none provided
        model_dir = model_dir or self.model_dir
        bags_dir = bags_dir or self.bags_dir
        out_dir = out_dir or self.out_dir

        # Check if model_dir is a single model directory
        if is_model_directory(model_dir):
            model_paths = [model_dir]
            self.vlog(f"Single model directory detected: [{INFO_CLR}]{model_dir}[/]")
        # Else, collect all model subdirectories
        else:
            if not (model_paths := [subdir for subdir in model_dir.iterdir() if subdir.is_dir() and is_model_directory(subdir)]):
                self.vlog(f"No model directories found in [{INFO_CLR}]{model_dir}[/]", LogLevel.WARNING)
                return

        # Iterate over each model directory and generate predictions
        for model_idx, model_path in enumerate(model_paths):
            self.vlog(f"Generating predictions with model [{INFO_CLR}]{model_idx+1}[/]/[{INFO_CLR}]{len(model_paths)}[/]: [{INFO_CLR}]{model_path}[/]")
            try:
                predictions = predict_mil(
                    model=str(model_path),
                    bags=str(bags_dir),
                    dataset=self.dataset,
                    outcomes="label",
                )
                # Cast to DataFrame
                # Can do this safely since predict_mil always returns a DataFrame if attention==False
                predictions = pd.DataFrame(predictions)

                # Save predictions to out_dir/model_name/predictions.parquet
                model_out_dir = out_dir / model_path.name
                model_out_dir.mkdir(parents=True, exist_ok=True)
                predictions_path = model_out_dir / "predictions.parquet"
                predictions.to_parquet(predictions_path, index=False)
                self.vlog(f"Predictions saved to [{INFO_CLR}]{predictions_path}[/]")

            except Exception as e:
                self.vlog(f"Error evaluating model at {model_path}: {e}", LogLevel.ERROR)
                continue

    def create_ensemble_predictions(
        self,
        predictions_path: Path | None = None,
        output_path: Path | None = None,
        print_summary: bool = True
    ) -> tuple[pd.DataFrame, dict[str, float | np.ndarray]]:

        predictions_path = predictions_path or self.out_dir
        output_path = output_path or (self.out_dir / "ensemble_predictions.parquet")

        # Collects all (sub)folders containing prediction tables
        prediction_dirs: list[Path] = []
        # Collects all actual prediction tables
        predictions_list: list[pd.DataFrame] = []

        # Case 1: Directory with predictions passed (single model)
        if contains_predictions(predictions_path):
            prediction_dirs = [predictions_path]
            predictions_list = [self.load_predictions(predictions_path)]
        # Case 2: Directory with multiple models passed
        else:
            prediction_dirs = [
                subdir
                for subdir in predictions_path.iterdir()
                if subdir.is_dir() and contains_predictions(subdir)
            ]
            predictions_list = [
                self.load_predictions(pred_dir)
                for pred_dir in prediction_dirs
            ]

        if not prediction_dirs:
            raise ValueError(
                f"No prediction directories found in {predictions_path}"
            )
        elif not predictions_list:
            raise ValueError(
                f"No predictions loaded from {predictions_path}"
            )

        self.vlog(
            f"Found [{INFO_CLR}]{len(prediction_dirs)}[/] prediction directories "
            f"in [{INFO_CLR}]{predictions_path}[/]"
        )

        # Additional check to ensure that we uses the same test set for predictions
        n_models = len(predictions_list)
        slide_sets = [set(df["slide"]) for df in predictions_list]
        common_slides = set.intersection(*slide_sets)

        if not common_slides:
            raise ValueError(
                "No overlapping slides across model predictions. "
                "Ensure all models were evaluated on the same test set."
            )

        # Filter all predictions to common slides
        # TODO | Aborting if slide sets differ probably preferable
        filtered_predictions = []
        for df in predictions_list:
            df = df[df["slide"].isin(common_slides)].sort_values("slide").reset_index(drop=True)
            filtered_predictions.append(df)
        predictions_list = filtered_predictions

        # Build combined probability matrix
        merged = predictions_list[0][["slide", "y_true"]].copy()
        prob_matrices: list[np.ndarray] = []

        for model_idx, df in enumerate(predictions_list):

            # y_pred0, y_pred1, y_pred2,...
            prob_cols = sorted(
                [c for c in df.columns if re.match(r"^y_pred\d+$", c)],
                key=lambda x: int(x.replace("y_pred", ""))
            )

            if not prob_cols:
                raise ValueError(f"No prediction columns found for model {model_idx}")

            prob_matrix = df[prob_cols].values
            prob_matrices.append(prob_matrix)

            # Attach model predictions for inspection/debugging
            renamed = df[prob_cols].add_suffix(f"_model{model_idx}")
            merged = pd.concat([merged, renamed], axis=1)

        # Calculate ensemble probabilities
        prob_matrix = np.mean(prob_matrices, axis=0)

        # Normalize probabilities (required for ROC-AUC)
        row_sums = prob_matrix.sum(axis=1, keepdims=True)
        row_sums[row_sums == 0] = 1
        prob_matrix = prob_matrix / row_sums

        n_classes = prob_matrix.shape[1]

        # ensemble is attached back to merged
        ensemble_columns = []

        for class_idx in range(n_classes):
            col = f"y_pred{class_idx}_ensemble"
            merged[col] = prob_matrix[:, class_idx]
            ensemble_columns.append(col)

        # Calculate metrics
        predicted_classes = np.argmax(prob_matrix, axis=1)
        merged["y_pred_label"] = predicted_classes

        metrics = self.calculate_metrics(merged)

        # Optional summary
        if print_summary:

            summary = format_ensemble_summary(
                n_models,
                metrics["ConfusionMatrix"],  # type: ignore
                float(metrics["AUC"]),
                float(metrics["AP"]),
                float(metrics["Accuracy"]),
                float(metrics["F1"])
            )

            self.vlog(summary)

        # Save results
        output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.suffix == ".csv":
            merged.to_csv(output_path, index=False)
        else:
            merged.to_parquet(output_path, index=False)

        self.vlog(f"Ensemble predictions saved to [{INFO_CLR}]{output_path}[/]")

        return merged, metrics

    def compare_predictions(
        self,
        predictions_path: Path | None = None,
        metrics: list[str] = ["Accuracy", "AUC", "F1", "AP"]
    ) -> pd.DataFrame:
        """
        Compares evaluation metrics across multiple trained models.

        Args:
            model_dir (Path | None, optional): Directory containing model subdirectories.
            metrics (list[str], optional): Metrics to include in the comparison.

        Returns:
            pd.DataFrame: Model-wise metric comparison table.
        """
        predictions_path = predictions_path or self.out_dir

        # Collects all (sub)folders containing prediction tables
        prediction_dirs: list[Path] = []
        # Collects all actual prediction tables
        predictions_dict: dict[str, pd.DataFrame] = {}

        # Case 1: Directory with predictions passed (single model)
        if contains_predictions(predictions_path):
            prediction_dirs = [predictions_path]
            predictions_dict[predictions_path.name] = self.load_predictions(predictions_path)
        else:
            prediction_dirs = [
                subdir
                for subdir in predictions_path.iterdir()
                if subdir.is_dir() and contains_predictions(subdir)
            ]
            predictions_dict = {
                pred_dir.name: self.load_predictions(pred_dir)
                for pred_dir in prediction_dirs
            }

        if not prediction_dirs:
            raise ValueError(
                f"No prediction directories found in {predictions_path}"
            )
        elif not predictions_dict:
            raise ValueError(
                f"No predictions loaded from {predictions_path}"
        )

        self.vlog(
            f"Found [{INFO_CLR}]{len(prediction_dirs)}[/] prediction directories "
            f"in [{INFO_CLR}]{predictions_path}[/]"
        )

        comparison_data = []
        for model_name, predictions in predictions_dict.items():
            try:
                model_metrics = self.calculate_metrics(predictions)

                row: dict[str, str | float] = {"model": model_name}
                for metric in metrics:
                    if metric in model_metrics:
                        value = model_metrics[metric]
                        # Convert numpy arrays and other types to string representation
                        if isinstance(value, np.ndarray):
                            row[metric] = round(float(value), 2)
                        else:
                            row[metric] = round(float(value), 2)
                    else:
                        row[metric] = "N/A"

                comparison_data.append(row)

            except Exception as e:
                self.vlog(f"Failed to evaluate [{INFO_CLR}]{model_name}[/]: {e}", LogLevel.WARNING)
                continue

        comparison_df = pd.DataFrame(comparison_data)

        if not comparison_df.empty:
            self.vlog("Model Comparison:")
            self.vlog(comparison_df.to_string(index=False))

        return comparison_df

    # === Plotting === #
    def generate_plots(
        self,
        model_paths: list[Path] | None = None,
        save_path: Path | None = None,
        figsize: tuple[int, int] = (10, 10)
    ) -> None:
        """Generate all comparison plots and save them to `self.project_dir/figures`"""
        # Collect models from expected folder if not provided
        if model_paths is None:
            model_paths = sorted(
                [path for path in self.out_dir.iterdir() if path.is_dir()]
            )

        # Calculate and collect metrics for all models
        combined_metrics = {}
        for model_path in model_paths:
            try:
                predictions = self.load_predictions(model_path)
                model_metrics = self.calculate_metrics(predictions)
                combined_metrics[model_path.name] = model_metrics
            except Exception as e:
                self.vlog(f"Failed to load metrics for {model_path.name}: {e}")
                continue

        if not combined_metrics:
            self.vlog("No valid model data found for generating plots")
            return

        # Collect and execute all plotting methods
        plots = cast(
            dict[str, Figure], # Make sure the type annotation is correct
            {
                method_name.removeprefix('_plot_'): plot_method(
                    combined_metrics,
                    figsize=figsize,
                )
                for method_name in dir(self)
                if (
                    method_name.startswith('_plot_')
                    and callable((plot_method := getattr(self, method_name)))
                    and signature(plot_method).return_annotation == Figure
                )
            }
        )

        if not save_path:
            save_path = self.out_dir / "figures"
            save_path.mkdir(parents=True, exist_ok=True)

        # Save all generated plots
        for plot_name, fig in plots.items():
            plot_file = save_path / f"{plot_name}.png"
            fig.savefig(plot_file, dpi=300, bbox_inches='tight')
            self.vlog(f"Saved plot '[{INFO_CLR}]{plot_name}[/]' to [{INFO_CLR}]{plot_file}[/]")
        return

    def _plot_roc_curves(
        self,
        combined_metrics: dict[str, dict[str, float | np.ndarray]],
        figsize: tuple[int, int] = (10, 8)
    ) -> Figure:
        """Plot ROC curves for all models"""
        from sklearn.metrics import auc, roc_curve

        plt.figure(figsize=figsize)

        colors = plt.cm.get_cmap('Set1')(np.linspace(0, 1, len(combined_metrics)))

        for i, (model_name, _) in enumerate(combined_metrics.items()):
            try:
                # Load predictions for this model
                model_path = self.out_dir / model_name
                predictions = self.load_predictions(model_path)

                y_true = predictions["y_true"].astype(int)
                num_classes = len(y_true.unique())

                # Get prediction probabilities
                pred_columns = [column for column in predictions.columns if column.startswith("y_pred")]
                ensemble_columns = [col for col in pred_columns if col.endswith("_ensemble")]

                if ensemble_columns:
                    prob_columns = [f"y_pred{i}_ensemble" for i in range(num_classes)]
                else:
                    prob_columns = [f"y_pred{i}" for i in range(num_classes)]

                prob_matrix = predictions[prob_columns].values

                if num_classes == 2:
                    # Binary classification - single ROC curve
                    y_probs = prob_matrix[:, 1]  # Probabilities for positive class

                    fpr, tpr, _ = roc_curve(y_true, y_probs)
                    roc_auc = auc(fpr, tpr)

                    plt.plot(
                        fpr, tpr, 
                        color=colors[i], 
                        linewidth=2,
                        label=f'{model_name} (AUC = {roc_auc:.3f})'
                    )

                else:
                    # Multiclass - plot ROC curve for each class
                    for class_idx in range(num_classes):
                        y_true_binary = (y_true == class_idx).astype(int)
                        y_probs_class = prob_matrix[:, class_idx]

                        # Only plot if we have both classes
                        if len(y_true_binary.unique()) > 1:
                            fpr, tpr, _ = roc_curve(y_true_binary, y_probs_class)
                            roc_auc = auc(fpr, tpr)

                            # Use different line styles for different classes
                            line_style = ['-', '--', '-.', ':'][class_idx % 4]

                            plt.plot(
                                fpr, tpr,
                                color=colors[i],
                                linestyle=line_style,
                                linewidth=2,
                                label=f'{model_name} Class {class_idx} (AUC = {roc_auc:.3f})'
                            )

            except Exception as e:
                self.vlog(f"Could not plot ROC curve for {model_name}: {e}", LogLevel.WARNING)
                continue

        # Plot diagonal line (random classifier)
        plt.plot([0, 1], [0, 1], 'k--', linewidth=1, alpha=0.5, label='Random')

        plt.xlabel('False Positive Rate', fontsize=12)
        plt.ylabel('True Positive Rate', fontsize=12)
        plt.title('ROC Curves', fontsize=14, fontweight='bold')
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.grid(True, alpha=0.3)
        plt.xlim([0, 1])
        plt.ylim([0, 1])

        plt.tight_layout()
        return plt.gcf()

    def _plot_model_comparison(
        self,
        combined_metrics: dict[str, dict[str, float | np.ndarray]],
        figsize: tuple[int, int] = (12, 8)
    ) -> Figure:
        data = pd.DataFrame(combined_metrics)
        metrics = [col for col in data.index if col != "ConfusionMatrix" and col != "PerClassAccuracy"]
        n_metrics = len(metrics)

        # Create subplots
        fig, axes = plt.subplots(1, n_metrics, figsize=figsize, sharey=False)
        # n_metrics == 1 means only 1 subplot, cast to list for consistency
        if n_metrics == 1:
            axes = cast(
                list[Axes],
                [axes]
            )
        # Otherwise axes is a list of subplots
        else:
            axes = cast(
                list[Axes],
                axes
            )

        colors = plt.cm.get_cmap('Set1')(np.linspace(0, 1, len(data)))
        x_positions = np.arange(len(data.columns))
        model_names = list(data.columns)

        for i, metric in enumerate(metrics):
            ax = axes[i]
            # Plot single metric
            bars = ax.bar(
                x_positions,
                data.loc[metric],
                color=colors,
                alpha=0.8,
                edgecolor='black',
                linewidth=0.5,
            )

            bar: Rectangle # Iterating over a BarContainer gives Rectangle objects
            for bar, value in zip(bars, data.loc[metric]):
                height = bar.get_height()
                # Place actual value above bar
                ax.text(
                    bar.get_x() + bar.get_width()/2., 
                    height + 0.005,
                    f'{value:.3f}',
                    ha='center',
                    va='bottom',
                    fontsize=9
                )

            ax.set_xticks(x_positions)
            # Set model names as x-tick labels
            # Since model names can be long, center them to the right
            # To avoid any offset issues
            ax.set_xticklabels(
                model_names,
                rotation=45,
                ha="right",
                rotation_mode="anchor"
            )

            ax.set_title(f'{metric}', fontsize=12, fontweight='bold')
            ax.set_ylabel(metric, fontsize=10)
            ax.set_ylim(0, 1.1)
            ax.grid(True, alpha=0.3, axis='y')

        plt.tight_layout()
        return fig

    def _plot_box_plots(
        self,
        combined_metrics: dict[str, dict[str, float | np.ndarray]],
        figsize: tuple[int, int] = (10, 8)
    ) -> Figure:
        # Collect data in long format for box plots
        plot_data = []
        for _, metrics in combined_metrics.items():
            for metric_name, metric_value in metrics.items():
                if metric_name in ["ConfusionMatrix", "PerClassAccuracy"]:
                    continue

                plot_data.append({
                    'Metric': metric_name,
                    'Value': float(metric_value)
                })

        df = pd.DataFrame(plot_data)

        # Create the plot
        plt.figure(figsize=figsize)

        sns.boxplot(
            data=df,
            x='Metric',
            y='Value',
            palette='Set2',
            width=0.5
        )

        sns.stripplot(
            data=df,
            x='Metric',
            y='Value',
            color='black',
            size=6,
            jitter=True,
            alpha=0.7
        )

        plt.title('Metric Distributions', fontsize=14, fontweight='bold')
        plt.ylabel('Value', fontsize=12)
        plt.xlabel('Metric', fontsize=12)
        plt.ylim(0, 1.1)
        plt.grid(True, alpha=0.3, axis='y')

        plt.tight_layout()
        return plt.gcf()

    def _plot_per_class_accuracy(
        self,
        combined_metrics: dict[str, dict[str, float | np.ndarray]],
        figsize: tuple[int, int] = (12, 8)
    ) -> Figure:
        # Prepare data for plotting
        data = []
        for model_name, metrics in combined_metrics.items():
            per_class_acc = metrics.get("PerClassAccuracy")
            if isinstance(per_class_acc, np.ndarray):
                for class_idx, acc in enumerate(per_class_acc):
                    data.append({
                        "Model": model_name,
                        "Class": f"Class {class_idx}",
                        "Accuracy": acc
                    })

        df = pd.DataFrame(data)

        # Create the plot
        plt.figure(figsize=figsize)

        # Create a grouped bar plot
        ax = sns.barplot(data=df, x='Class', y='Accuracy', hue='Model', alpha=0.8)

        plt.title('Per-Class Accuracy Comparison', fontsize=14, fontweight='bold')
        plt.ylabel('Accuracy', fontsize=12)
        plt.xlabel('Class', fontsize=12)
        plt.ylim(0, 1.1)
        plt.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.grid(True, alpha=0.3, axis='y')

        # Add value labels on bars
        for container in ax.containers:
            if isinstance(container, BarContainer):
                ax.bar_label(container, fmt='%.2f', fontsize=9)

        plt.tight_layout()
        return plt.gcf()

calculate_metrics

calculate_metrics(
    predictions: DataFrame | Path | str,
) -> dict[str, float | np.ndarray]

Computes classification metrics from prediction outputs.

Supports both binary and multi-class classification and automatically detects ensemble predictions when present.

Source code in automil/evaluation.py

def calculate_metrics(
    self,
    predictions: pd.DataFrame | Path | str
) -> dict[str, float | np.ndarray]:
    """
    Computes classification metrics from prediction outputs.

    Supports both binary and multi-class classification and automatically
    detects ensemble predictions when present.
    """
    import re

    # Ensure DataFrame
    match predictions:
        case Path() | str():
            predictions = self.load_predictions(Path(predictions))
        case pd.DataFrame():
            pass

    if "y_true" not in predictions.columns:
        raise ValueError("Predictions must contain a 'y_true' column")

    y_true = predictions["y_true"].astype(int)

    # Detect probability columns
    pred_columns = [c for c in predictions.columns if c.startswith("y_pred")]

    ensemble_columns = sorted(
        [c for c in pred_columns if re.match(r"^y_pred\d+_ensemble$", c)],
        key=lambda x: int(match.group()) if (match := re.search(r"\d+", x)) else 0
    )

    if ensemble_columns:
        prob_columns = ensemble_columns
        prediction_type = "ensemble"
    else:
        prob_columns = sorted(
            [c for c in pred_columns if re.match(r"^y_pred\d+$", c)],
            key=lambda x: int(x.replace("y_pred", ""))
        )
        prediction_type = "single model"

    if not prob_columns:
        raise ValueError(
            f"No probability columns found for {prediction_type} predictions.\n"
            f"Available columns: {list(predictions.columns)}"
        )

    # Get probability matrix
    num_classes = len(prob_columns)
    prob_matrix = predictions[prob_columns].values

    # Normalize probabilities (safety)
    row_sums = prob_matrix.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1
    prob_matrix = prob_matrix / row_sums

    # Predicted labels
    if "y_pred_label" in predictions.columns:
        y_pred = predictions["y_pred_label"].astype(int)
    else:
        y_pred = np.argmax(prob_matrix, axis=1)

    # --- BBasic Metrics --- #
    accuracy = float(accuracy_score(y_true, y_pred))
    cm = confusion_matrix(y_true, y_pred, labels=range(num_classes))

    # --- Binary Classification --- #
    if num_classes == 2:

        y_probs = prob_matrix[:, 1]

        if len(np.unique(y_true)) > 1:
            auc = float(roc_auc_score(y_true, y_probs))
            ap = float(average_precision_score(y_true, y_probs))
        else:
            auc = float("nan")
            ap = float("nan")

        f1 = float(f1_score(y_true, y_pred))

    # --- Multiclass Classification --- #
    else:

        # AUC
        try:
            auc = float(
                roc_auc_score(
                    y_true,
                    prob_matrix,
                    multi_class="ovr",
                    average="macro",
                    labels=range(num_classes)
                )
            )
        except ValueError:
            auc = float("nan")

        # AP
        ap_scores = []

        for class_idx in range(num_classes):

            y_true_binary = (y_true == class_idx).astype(int)
            y_probs_class = prob_matrix[:, class_idx]

            if len(np.unique(y_true_binary)) > 1:
                ap_scores.append(
                    average_precision_score(y_true_binary, y_probs_class)
                )

        ap = float(np.mean(ap_scores)) if ap_scores else float("nan")

        # F1
        f1 = float(
            f1_score(
                y_true,
                y_pred,
                average="macro",
                labels=range(num_classes)
            )
        )

    # --- Per-Class Accuracy --- #
    with np.errstate(divide="ignore", invalid="ignore"):
        per_class_accuracy = cm.diagonal() / cm.sum(axis=1)

    return {
        "Accuracy": accuracy,
        "AUC": auc,
        "AP": ap,
        "F1": f1,
        "ConfusionMatrix": cm,
        "PerClassAccuracy": per_class_accuracy
    }

compare_predictions

compare_predictions(
    predictions_path: Path | None = None,
    metrics: list[str] = ["Accuracy", "AUC", "F1", "AP"],
) -> pd.DataFrame

Compares evaluation metrics across multiple trained models.

Parameters:

Name	Type	Description	Default
`model_dir`	`Path \| None`	Directory containing model subdirectories.	required
`metrics`	`list[str]`	Metrics to include in the comparison.	`['Accuracy', 'AUC', 'F1', 'AP']`

Returns:

Type	Description
`DataFrame`	pd.DataFrame: Model-wise metric comparison table.

Source code in automil/evaluation.py

def compare_predictions(
    self,
    predictions_path: Path | None = None,
    metrics: list[str] = ["Accuracy", "AUC", "F1", "AP"]
) -> pd.DataFrame:
    """
    Compares evaluation metrics across multiple trained models.

    Args:
        model_dir (Path | None, optional): Directory containing model subdirectories.
        metrics (list[str], optional): Metrics to include in the comparison.

    Returns:
        pd.DataFrame: Model-wise metric comparison table.
    """
    predictions_path = predictions_path or self.out_dir

    # Collects all (sub)folders containing prediction tables
    prediction_dirs: list[Path] = []
    # Collects all actual prediction tables
    predictions_dict: dict[str, pd.DataFrame] = {}

    # Case 1: Directory with predictions passed (single model)
    if contains_predictions(predictions_path):
        prediction_dirs = [predictions_path]
        predictions_dict[predictions_path.name] = self.load_predictions(predictions_path)
    else:
        prediction_dirs = [
            subdir
            for subdir in predictions_path.iterdir()
            if subdir.is_dir() and contains_predictions(subdir)
        ]
        predictions_dict = {
            pred_dir.name: self.load_predictions(pred_dir)
            for pred_dir in prediction_dirs
        }

    if not prediction_dirs:
        raise ValueError(
            f"No prediction directories found in {predictions_path}"
        )
    elif not predictions_dict:
        raise ValueError(
            f"No predictions loaded from {predictions_path}"
    )

    self.vlog(
        f"Found [{INFO_CLR}]{len(prediction_dirs)}[/] prediction directories "
        f"in [{INFO_CLR}]{predictions_path}[/]"
    )

    comparison_data = []
    for model_name, predictions in predictions_dict.items():
        try:
            model_metrics = self.calculate_metrics(predictions)

            row: dict[str, str | float] = {"model": model_name}
            for metric in metrics:
                if metric in model_metrics:
                    value = model_metrics[metric]
                    # Convert numpy arrays and other types to string representation
                    if isinstance(value, np.ndarray):
                        row[metric] = round(float(value), 2)
                    else:
                        row[metric] = round(float(value), 2)
                else:
                    row[metric] = "N/A"

            comparison_data.append(row)

        except Exception as e:
            self.vlog(f"Failed to evaluate [{INFO_CLR}]{model_name}[/]: {e}", LogLevel.WARNING)
            continue

    comparison_df = pd.DataFrame(comparison_data)

    if not comparison_df.empty:
        self.vlog("Model Comparison:")
        self.vlog(comparison_df.to_string(index=False))

    return comparison_df

evaluate_models

evaluate_models(
    model_dir: Path | None = None,
    bags_dir: Path | None = None,
    out_dir: Path | None = None,
    generate_attention_heatmaps: bool = False,
) -> None

Evaluates one or more trained models.

Detects each trained model directory inside model_dir and evaluates them independently. Predictions and metrics are written to the output directory.

Parameters:

Name	Type	Description	Default
`model_dir`	`Path \| None`	Model directory or parent directory of models to evaluate.	`None`
`bags_dir`	`Path \| None`	Feature bag directory.	`None`
`out_dir`	`Path \| None`	Output directory.	`None`
`generate_attention_heatmaps`	`bool`	Generate attention heatmaps.	`False`

Source code in automil/evaluation.py

def evaluate_models(
    self,
    model_dir: Path | None = None,
    bags_dir: Path | None = None,
    out_dir: Path | None = None,
    generate_attention_heatmaps: bool = False
) -> None:
    """
    Evaluates one or more trained models.

    Detects each trained model directory inside `model_dir` and evaluates them independently.
    Predictions and metrics are written to the output directory.

    Args:
        model_dir (Path | None, optional): Model directory or parent directory of models to evaluate.
        bags_dir (Path | None, optional): Feature bag directory.
        out_dir (Path | None, optional): Output directory.
        generate_attention_heatmaps (bool, optional): Generate attention heatmaps.
    """
    # Default to instance variables if none provided
    model_dir = model_dir or self.model_dir
    bags_dir = bags_dir or self.bags_dir
    out_dir = out_dir or self.out_dir

    # Check if model_dir is a single model directory
    if is_model_directory(model_dir):
        model_paths = [model_dir]
        self.vlog(f"Single model directory detected: {model_dir}")
    # Else, collect all model subdirectories
    else:
        if not (model_paths := [subdir for subdir in model_dir.iterdir() if subdir.is_dir() and is_model_directory(subdir)]):
            self.vlog(f"No model directories found in {model_dir}", LogLevel.WARNING)
            return

    # Iterate over each model directory and evaluate
    for model_idx, model_path in enumerate(model_paths):
        self.vlog(f"Evaluating model [{INFO_CLR}]{model_idx+1}[/]/[{INFO_CLR}]{len(model_paths)}[/]: [{INFO_CLR}]{model_path}[/]")
        try:
            eval_mil(
                weights=str(model_path),
                bags=str(bags_dir),
                dataset=self.dataset,
                outcomes="label",
                outdir=str(out_dir),
                attention_heatmaps=generate_attention_heatmaps
            )
            self.vlog("Evaluation complete.\n")
        except Exception as e:
            self.vlog(f"Error evaluating model at {model_path}: {e}", LogLevel.ERROR)
            continue

generate_plots

generate_plots(
    model_paths: list[Path] | None = None,
    save_path: Path | None = None,
    figsize: tuple[int, int] = (10, 10),
) -> None

Generate all comparison plots and save them to self.project_dir/figures

Source code in automil/evaluation.py

def generate_plots(
    self,
    model_paths: list[Path] | None = None,
    save_path: Path | None = None,
    figsize: tuple[int, int] = (10, 10)
) -> None:
    """Generate all comparison plots and save them to `self.project_dir/figures`"""
    # Collect models from expected folder if not provided
    if model_paths is None:
        model_paths = sorted(
            [path for path in self.out_dir.iterdir() if path.is_dir()]
        )

    # Calculate and collect metrics for all models
    combined_metrics = {}
    for model_path in model_paths:
        try:
            predictions = self.load_predictions(model_path)
            model_metrics = self.calculate_metrics(predictions)
            combined_metrics[model_path.name] = model_metrics
        except Exception as e:
            self.vlog(f"Failed to load metrics for {model_path.name}: {e}")
            continue

    if not combined_metrics:
        self.vlog("No valid model data found for generating plots")
        return

    # Collect and execute all plotting methods
    plots = cast(
        dict[str, Figure], # Make sure the type annotation is correct
        {
            method_name.removeprefix('_plot_'): plot_method(
                combined_metrics,
                figsize=figsize,
            )
            for method_name in dir(self)
            if (
                method_name.startswith('_plot_')
                and callable((plot_method := getattr(self, method_name)))
                and signature(plot_method).return_annotation == Figure
            )
        }
    )

    if not save_path:
        save_path = self.out_dir / "figures"
        save_path.mkdir(parents=True, exist_ok=True)

    # Save all generated plots
    for plot_name, fig in plots.items():
        plot_file = save_path / f"{plot_name}.png"
        fig.savefig(plot_file, dpi=300, bbox_inches='tight')
        self.vlog(f"Saved plot '[{INFO_CLR}]{plot_name}[/]' to [{INFO_CLR}]{plot_file}[/]")
    return

generate_predictions

generate_predictions(
    model_dir: Path | None = None,
    bags_dir: Path | None = None,
    out_dir: Path | None = None,
) -> None

Generates prediction outputs for one or more trained models.

Predictions are saved per model in predictions.parquet format.

Parameters:

Name	Type	Description	Default
`model_dir`	`Path \| None`	Directory containing model subdirectories.	`None`
`bags_dir`	`Path \| None`	Feature bag directory.	`None`
`out_dir`	`Path \| None`	Output directory.	`None`

Source code in automil/evaluation.py

def generate_predictions(
    self,
    model_dir: Path | None = None,
    bags_dir: Path | None = None,
    out_dir: Path | None = None
) -> None:
    """
    Generates prediction outputs for one or more trained models.

    Predictions are saved per model in ``predictions.parquet`` format.

    Args:
        model_dir (Path | None, optional): Directory containing model subdirectories.
        bags_dir (Path | None, optional): Feature bag directory.
        out_dir (Path | None, optional): Output directory.
    """
    # Default to instance variables if none provided
    model_dir = model_dir or self.model_dir
    bags_dir = bags_dir or self.bags_dir
    out_dir = out_dir or self.out_dir

    # Check if model_dir is a single model directory
    if is_model_directory(model_dir):
        model_paths = [model_dir]
        self.vlog(f"Single model directory detected: [{INFO_CLR}]{model_dir}[/]")
    # Else, collect all model subdirectories
    else:
        if not (model_paths := [subdir for subdir in model_dir.iterdir() if subdir.is_dir() and is_model_directory(subdir)]):
            self.vlog(f"No model directories found in [{INFO_CLR}]{model_dir}[/]", LogLevel.WARNING)
            return

    # Iterate over each model directory and generate predictions
    for model_idx, model_path in enumerate(model_paths):
        self.vlog(f"Generating predictions with model [{INFO_CLR}]{model_idx+1}[/]/[{INFO_CLR}]{len(model_paths)}[/]: [{INFO_CLR}]{model_path}[/]")
        try:
            predictions = predict_mil(
                model=str(model_path),
                bags=str(bags_dir),
                dataset=self.dataset,
                outcomes="label",
            )
            # Cast to DataFrame
            # Can do this safely since predict_mil always returns a DataFrame if attention==False
            predictions = pd.DataFrame(predictions)

            # Save predictions to out_dir/model_name/predictions.parquet
            model_out_dir = out_dir / model_path.name
            model_out_dir.mkdir(parents=True, exist_ok=True)
            predictions_path = model_out_dir / "predictions.parquet"
            predictions.to_parquet(predictions_path, index=False)
            self.vlog(f"Predictions saved to [{INFO_CLR}]{predictions_path}[/]")

        except Exception as e:
            self.vlog(f"Error evaluating model at {model_path}: {e}", LogLevel.ERROR)
            continue

load_predictions

load_predictions(model_path: Path) -> pd.DataFrame

Loads and validates prediction outputs from a trained model directory.

The predictions file must contain: - One or more probability columns starting with y_pred - Base columns slide and y_true

Parameters:

Name	Type	Description	Default
`model_path`	`Path`	Path to a trained model directory.	required

Raises:

Type	Description
`FileNotFoundError`	If `predictions.parquet` is missing.
`ValueError`	If required prediction or base columns are absent.

Returns:

Type	Description
`DataFrame`	pd.DataFrame: Loaded and validated predictions.

Source code in automil/evaluation.py

def load_predictions(self, model_path: Path) -> pd.DataFrame:
    """
    Loads and validates prediction outputs from a trained model directory.

    The predictions file must contain:
    - One or more probability columns starting with ``y_pred``
    - Base columns ``slide`` and ``y_true``

    Args:
        model_path (Path): Path to a trained model directory.

    Raises:
        FileNotFoundError: If ``predictions.parquet`` is missing.
        ValueError: If required prediction or base columns are absent.

    Returns:
        pd.DataFrame: Loaded and validated predictions.
    """
    if not (predictions_path := model_path / "predictions.parquet").exists():
        raise FileNotFoundError(f"{model_path} does not contain a 'predictions.parquet' file")

    predictions = pd.read_parquet(predictions_path)

    all_columns = [column for column in predictions.columns]
    # We expect columns containing prediction probabilites to start with 'y_pred' (e.g 'y_pred0', 'y_pred1', ...)
    pred_columns = [column for column in all_columns if column.startswith("y_pred")]
    # Similarly, we expect predictions to contain 'slide' and 'y_true' columns
    base_columns = ["slide", "y_true"]

    if not pred_columns:
        raise ValueError("'predictions.parquet' does not contain the expected prediction columns")
    elif not all(base_column in all_columns for base_column in base_columns):
        raise ValueError("'predictions.parquet' does not contain the expected base columns")

    return predictions