Skip to content

Io json

json_loader(path_data, path_imagedir, allowed_image_formats, training=True, ohe=True) ¤

Data Input Interface for loading a dataset via a JSON and an image directory.

This internal function allows simple parsing of class annotations encoded in a JSON.

Input Formats
Format Sparse:
    - Name Index (key) : Class (value)

Format One-Hot Encoded:
    - Name Index (key) : List consisting of binary integers.

Expected structure:

dataset/
    images_dir/                 # path_imagedir = "dataset/images_dir"
        sample001.png
        sample002.png
        ...
        sample350.png
    annotations.json            # path_data = "dataset/annotations.json"

Parameters:

Name Type Description Default
path_data str

Path to the json file.

required
path_imagedir str

Path to the directory containing the images.

required
allowed_image_formats list of str

List of allowed imaging formats. (provided by IO_Interface)

required
training bool

Boolean option whether annotation data is available.

True
ohe bool

Boolean option whether annotation data is sparse categorical or one-hot encoded.

True

Returns:

Name Type Description
index_list list of str

List of sample/index encoded as Strings. Required in DataGenerator as samples.

class_ohe numpy.ndarray

Classification list as One-Hot encoding. Required in DataGenerator as labels.

class_n int

Number of classes. Required in NeuralNetwork for Architecture design as n_labels.

class_names list of str

List of names for corresponding classes. Used for later prediction storage or evaluation.

image_format str

Image format to add at the end of the sample index for image loading. Required in DataGenerator.

Source code in aucmedi/data_processing/io_interfaces/io_json.py
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
def json_loader(path_data, path_imagedir, allowed_image_formats, training=True,
                ohe=True):
    """ Data Input Interface for loading a dataset via a JSON and an image directory.

    This **internal** function allows simple parsing of class annotations encoded in a JSON.

    ???+ info "Input Formats"
        ```
        Format Sparse:
            - Name Index (key) : Class (value)

        Format One-Hot Encoded:
            - Name Index (key) : List consisting of binary integers.
        ```

    An optional top-level `"legend"` key is not treated as a sample. If present,
    its value is returned as `class_names`.

    **Expected structure:**
    ```
    dataset/
        images_dir/                 # path_imagedir = "dataset/images_dir"
            sample001.png
            sample002.png
            ...
            sample350.png
        annotations.json            # path_data = "dataset/annotations.json"
    ```

    Args:
        path_data (str):                        Path to the json file.
        path_imagedir (str):                    Path to the directory containing the images.
        allowed_image_formats (list of str):    List of allowed imaging formats. (provided by IO_Interface)
        training (bool):                        Boolean option whether annotation data is available.
        ohe (bool):                             Boolean option whether annotation data is sparse categorical or one-hot encoded.

    Returns:
        index_list (list of str):               List of sample/index encoded as Strings. Required in DataGenerator as `samples`.
        class_ohe (numpy.ndarray):              Classification list as One-Hot encoding. Required in DataGenerator as `labels`.
                                                `None` if `training` is False.
        class_n (int):                          Number of classes. Required in NeuralNetwork for Architecture design as `n_labels`.
                                                `None` if `training` is False.
        class_names (list of str):              List of names for corresponding classes. Used for later prediction storage or evaluation.
                                                `None` if `training` is False, or if the one-hot encoded JSON has no legend.
        image_format (str):                     Image format to add at the end of the sample index for image loading. Required in DataGenerator.
                                                `None` if the first sample name already ends with the image format.

    Raises:
        Exception: If no file in `path_imagedir` has an allowed image format,
                   if the image of a sample does not exist, or if the number of
                   samples and annotations do not match.
    """
    # Load JSON file
    with open(path_data, "r") as json_reader:
        dt_json = json.load(json_reader)

    # Identify image format by peeking at the first file with an allowed ending
    image_format = None
    for file in os.listdir(path_imagedir):
        extension = file.split(".")[-1]
        if extension.lower() in allowed_image_formats or \
           extension.upper() in allowed_image_formats:
            image_format = extension
            break
    # Raise Exception if image format is unknown
    if image_format is None:
        raise Exception("Unknown image format.", path_imagedir)

    # Verify that all images exist
    first_sample = True
    for sample in dt_json:
        if sample == "legend":
            continue
        # Check if the image ending is already part of the sample name by
        # peeking at the first sample. If so, no ending has to be appended
        # (neither here nor later on in the DataGenerator).
        if first_sample:
            first_sample = False
            if sample.endswith("." + image_format):
                image_format = None
        # Obtain image file path
        if image_format:
            img_file = sample + "." + image_format
        else:
            img_file = sample
        path_img = os.path.join(path_imagedir, img_file)
        # Check existence
        if not os.path.exists(path_img):
            raise Exception("Image does not exist / not accessible!",
                            'Sample: "' + sample + '"', path_img)

    # Parse (and remove) the optional class name information, so that only
    # samples remain in the JSON dictionary
    class_names = dt_json.pop("legend", None)

    # If JSON is for inference (no annotation data)
    if not training:
        # Ensure index list to contain strings
        index_list = [str(x) for x in dt_json]
        # -> return parsing
        return index_list, None, None, None, image_format

    # Obtain index list and class information (same iteration order)
    index_list = [str(sample) for sample in dt_json]
    class_data = list(dt_json.values())

    # Parse a sparse categorical class format
    if not ohe:
        if class_names is None:
            class_names = np.unique(class_data).tolist()
        class_n = len(class_names)
        # Parse sparse categorical annotations to One-Hot Encoding
        class_ohe = pd.get_dummies(class_data).to_numpy()
    # Parse a one-hot encoded format
    else:
        class_ohe = np.array(class_data)
        if class_names is not None:
            class_n = len(class_names)
        else:
            # Without a legend, derive the number of classes from the
            # annotation matrix (samples x classes)
            class_n = class_ohe.shape[1]

    # Validate if number of samples and number of annotations match
    if len(index_list) != len(class_ohe):
        raise Exception("Numbers of samples and annotations do not match!",
                        len(index_list), len(class_ohe))

    # Return parsed JSON data
    return index_list, class_ohe, class_n, class_names, image_format