csi_images.csi_events

Contains the Event class, which represents a single event in a scan. The Event class optionally holds metadata and features. Lists of events with similar metadata or features can be combined into DataFrames for analysis.

The Event class holds the position of the event in the frame, which can be converted to positions in the scanner or slide coordinate systems. See the csi_images.csi_scans documentation page for more information on the coordinate systems.
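
A typical workflow wraps a detected object in an Event, converts its coordinates, and collects many events into an EventArray for analysis. The sketch below uses hypothetical values (tile index, pixel coordinates, file path) and assumes a Scan object has already been loaded via csi_images.csi_scans:

    from csi_images.csi_tiles import Tile
    from csi_images.csi_events import Event, EventArray

    # `scan` is assumed to be a csi_images.csi_scans.Scan loaded elsewhere
    tile = Tile(scan, 100)             # hypothetical tile index within the scan
    event = Event(tile, x=512, y=384)  # pixel position within the tile's frame

    x_um, y_um = event.get_scan_position()     # scanner coordinate frame, in um
    sx_um, sy_um = event.get_slide_position()  # slide coordinate frame, in um

    # Many events are easier to analyze together as an EventArray of DataFrames
    events = EventArray.from_events([event])
    events.save_csv("events.csv")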

   1"""
   2Contains the Event class, which represents a single event in a scan.
   3The Event class optionally holds metadata and features. Lists of events with
   4similar metadata or features can be combined into DataFrames for analysis.
   5
   6The Event class holds the position of the event in the frame, which can be converted
   7to positions in the scanner or slide coordinate systems. See the
   8csi_images.csi_scans documentation page for more information on the coordinate systems.
   9"""
  10
  11import os
  12import glob
  13import math
  14import warnings
  15from typing import Self, Iterable, Hashable, Sequence
  16
  17import numpy as np
  18import pandas as pd
  19
  20from .csi_scans import Scan
  21from .csi_tiles import Tile
  22from .csi_frames import Frame
  23
  24# Optional dependencies; will raise errors in particular functions if not installed
  25try:
  26    from . import csi_images
  27except ImportError:
  28    csi_images = None
  29try:
  30    import imageio.v3 as imageio
  31except ImportError:
  32    imageio = None
  33try:
  34    import pyreadr
  35except ImportError:
  36    pyreadr = None
  37
  38
  39class Event:
  40    """
  41    A class that represents a single event in a scan, making it easy to evaluate
  42    singular events. Required metadata is exposed as attributes, and optional
  43    metadata and features are stored as DataFrames.
  44    """
  45
  46    SCAN_TO_SLIDE_TRANSFORM = {
  47        # Axioscan zero is in the top-right corner instead of top-left
  48        Scan.Type.AXIOSCAN7: np.array(
  49            [
  50                [1, 0, 75000],
  51                [0, 1, 0],
  52                [0, 0, 1],
  53            ]
  54        ),
  55        # BZScanner coordinates are a special kind of messed up:
  56        # - The slide is upside-down.
  57        # - The slide is oriented vertically, with the barcode at the bottom.
  58        # - Tiles are numbered from the top-right
  59        Scan.Type.BZSCANNER: np.array(
  60            [
  61                [0, -1, 75000],
  62                [-1, 0, 25000],
  63                [0, 0, 1],
  64            ]
  65        ),
  66    }
  67    """
  68    Homogeneous transformation matrices for converting between scanner and slide
  69    coordinates. The matrices are 3x3, with the final column representing the
  70    translation in micrometers (um). For more information, see 
  71    [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations).
  72    
  73    Transformations are nominal, and accuracy is not guaranteed; this is due to 
  74    imperfections in slides and alignment in the scanners. Units are in micrometers.
  75    """
  76
  77    def __init__(
  78        self,
  79        tile: Tile,
  80        x: int,
  81        y: int,
  82        metadata: pd.Series = None,
  83        features: pd.Series = None,
  84    ):
  85        self.tile = tile
  86        self.x = int(x)
  87        self.y = int(y)
  88        self.metadata = metadata
  89        self.features = features
  90
  91    def __repr__(self) -> str:
  92        return f"{self.tile}-{self.x}-{self.y}"
  93
  94    def __eq__(self, other) -> bool:
  95        return self.__repr__() == other.__repr__()
  96
  97    def __lt__(self, other):
  98        return self.__repr__() < other.__repr__()
  99
 100    def get_scan_position(self) -> tuple[float, float]:
 101        """
 102        Get the position of the event in the scanner's coordinate frame.
 103        :return: the scan position of the event in micrometers (um).
 104        """
 105        # Get overall pixel position
 106        real_tile_height, real_tile_width = self.tile.scan.get_image_size()
 107        pixel_x = self.x + (real_tile_width * self.tile.x)
 108        pixel_y = self.y + (real_tile_height * self.tile.y)
 109        # Convert to micrometers
 110        x_um = pixel_x * self.tile.scan.pixel_size_um
 111        y_um = pixel_y * self.tile.scan.pixel_size_um
 112        # Add the scan's origin in the scanner frame
 113        x_um += self.tile.scan.roi[self.tile.n_roi].origin_x_um
 114        y_um += self.tile.scan.roi[self.tile.n_roi].origin_y_um
 115        return x_um, y_um
 116
 117    def get_slide_position(self) -> tuple[float, float]:
 118        """
 119        Get the slide position of the event in micrometers (um).
 120        :return: the slide position of the event.
 121        """
 122        # Turn scan_position into a 3x1 vector
 123        scan_position = self.get_scan_position()
 124        scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]])
 125
 126        # Multiply by the appropriate homogeneous matrix
 127        if self.tile.scan.scanner_id.startswith(self.tile.scan.Type.AXIOSCAN7.value):
 128            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.tile.scan.Type.AXIOSCAN7]
 129        elif self.tile.scan.scanner_id.startswith(self.tile.scan.Type.BZSCANNER.value):
 130            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.tile.scan.Type.BZSCANNER]
 131        else:
 132            raise ValueError(f"Scanner type {self.tile.scan.scanner_id} not supported.")
 133        slide_position = np.matmul(transform, scan_position)
 134        return float(slide_position[0][0]), float(slide_position[1][0])
 135
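    # Illustrative note (not in the original source): get_slide_position() applies the
    # scan's ROI origin and pixel size, then one of the SCAN_TO_SLIDE_TRANSFORM
    # matrices above. For a hypothetical BZScanner event at scan position
    # (10000, 20000) um:
    #     [x']   [ 0 -1 75000] [10000]   [55000]
    #     [y'] = [-1  0 25000] [20000] = [15000]
    #     [1 ]   [ 0  0     1] [    1]   [    1]
    # i.e. slide x' = 75000 - y and slide y' = 25000 - x, in micrometers.
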
 136    def crop(
 137        self, images: Iterable[np.ndarray], crop_size: int = 100, in_pixels: bool = True
 138    ) -> list[np.ndarray]:
 139        """
 140        Crop the event from the provided frame images. Use if you have already gotten
 141        frame images; useful for cropping multiple events from the same frame image.
 142        :param images: the frame images.
 143        :param crop_size: the square size of the image crop to get for this event.
 144        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
 145        :return: crop_size x crop_size crops of the event in the provided frames. If
 146        the event is too close to the edge, the crop is zero-padded and the event is not centered.
 147        """
 148        # Convert a crop size in micrometers to pixels
 149        if not in_pixels:
 150            crop_size = round(crop_size / self.tile.scan.pixel_size_um)
 151        image_height, image_width = 0, 0
 152        for image in images:
 153            if image_height == 0 and image_width == 0:
 154                image_height, image_width = image.shape
 155            else:
 156                if image_height != image.shape[0] or image_width != image.shape[1]:
 157                    raise ValueError("All images must be the same size")
 158        if image_height == 0 or image_width == 0:
 159            raise ValueError("No images provided")
 160
 161        # Find the crop bounds
 162        bounds = [
 163            self.x - (crop_size // 2) + 1,
 164            self.y - (crop_size // 2) + 1,
 165            self.x + math.ceil(crop_size / 2) + 1,
 166            self.y + math.ceil(crop_size / 2) + 1,
 167        ]
 168        # Determine how much the bounds violate the image size
 169        displacements = [
 170            max(0, -bounds[0]),
 171            max(0, -bounds[1]),
 172            max(0, bounds[2] - image_width),
 173            max(0, bounds[3] - image_height),
 174        ]
 175        # Cap off the bounds
 176        bounds = [
 177            max(0, bounds[0]),
 178            max(0, bounds[1]),
 179            min(image_width, bounds[2]),
 180            min(image_height, bounds[3]),
 181        ]
 182
 183        # Crop the images
 184        crops = []
 185        for image in images:
 186            # Create a blank image of the right size
 187            crop = np.zeros((crop_size, crop_size), dtype=image.dtype)
 188
 189            # Insert the cropped image into the blank image, leaving a black buffer
 190            # around the edges if the crop would go beyond the original image bounds
 191            crop[
 192                displacements[1] : crop_size - displacements[3],
 193                displacements[0] : crop_size - displacements[2],
 194            ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]]
 195            crops.append(crop)
 196        return crops
 197
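    # Usage sketch (hypothetical): `images` are one tile's frame images (e.g. from
    # Frame.get_frames(tile) followed by get_image()), reused across events so the
    # frames are only read once.
    #     crops_per_event = [e.crop(images, crop_size=50) for e in events_in_tile]
    # Each entry is a list of 50x50 arrays, one per frame image, zero-padded when
    # the event lies near the frame edge.
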
 198    def get_crops(
 199        self,
 200        crop_size: int = 100,
 201        in_pixels: bool = True,
 202        input_path: str = None,
 203        channels: Iterable[int | str] = None,
 204        apply_gain: bool | Iterable[bool] = True,
 205    ) -> list[np.ndarray]:
 206        """
 207        Gets the frame images for this event and then crops the event from the images.
 208        Convenient for retrieving a single event's crops, but less efficient when
 209        retrieving multiple events from the same tile as it will reread the images.
 210        :param crop_size: the square size of the image crop to get for this event.
 211        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
 212        :param input_path: the path to the input images. Defaults to None (uses the scan's path).
 213        :param channels: the channels to extract images for. Defaults to all channels.
 214        :param apply_gain: whether to apply scanner-calculated gain to the images, if
 215        not already applied. If a list, matches the channels.
 216        :return: a list of cropped images from the scan in the order of the channels.
 217        """
 218        # This function validates channels
 219        frames = Frame.get_frames(self.tile, channels)
 220        # Convert individual inputs to lists of appropriate length
 221        if isinstance(apply_gain, bool):
 222            apply_gain = [apply_gain] * len(frames)
 223        images = [f.get_image(input_path, a) for f, a in zip(frames, apply_gain)]
 224        return self.crop(images, crop_size, in_pixels)
 225
 226    def save_crops(
 227        self,
 228        crops: Sequence[np.ndarray],
 229        output_path: str,
 230        labels: Sequence[str],
 231        ext: str = "auto",
 232    ):
 233        """
 234        Save the crops to image files.
 235        :param crops: the crops to save. Will save as RGB if 3 channel [h, w, 3] or
 236        grayscale if 1 channel [h, w] or [h, w, 1].
 237        :param output_path: the folder to save the crops to. Will make if needed.
 238        :param labels: the labels to append to the file name, usually the channel names
 239        associated with each crop.
 240        :param ext: the file extension to save the crops as. Defaults to "auto", which
 241        will save as .tif for grayscale images and .jpg for RGB images.
 242        :return: None
 243        """
 244        if len(crops) != len(labels):
 245            raise ValueError("Crops and labels must be the same length")
 246
 247        if csi_images is None or imageio is None:
 248            raise ModuleNotFoundError(
 249                "imageio libraries not installed! "
 250                "run `pip install csi_images[imageio]` to resolve."
 251            )
 252
 253        os.makedirs(output_path, exist_ok=True)
 254
 255        for crop, label in zip(crops, labels):
 256            if ext == "auto":
 257                if len(crop.shape) == 2 or crop.shape[2] == 1:
 258                    file_extension = ".tif"
 259                elif crop.shape[2] == 3:
 260                    file_extension = ".jpg"
 261                else:
 262                    warnings.warn(
 263                        f"Image shape {crop.shape} not recognized; saving as .tif"
 264                    )
 265                    file_extension = ".tif"
 266            else:
 267                file_extension = ext
 268            file = os.path.join(output_path, f"{self}-{label}{file_extension}")
 269            # TODO: add more file types here
 270            if file_extension == ".tif":
 271                imageio.imwrite(file, crop, compression="deflate")
 272            elif file_extension in [".jpg", ".jpeg"]:
 273                crop = csi_images.scale_bit_depth(crop, np.uint8)
 274                imageio.imwrite(file, crop, quality=80)
 275            else:
 276                imageio.imwrite(file, crop)
 277
 278    def load_crops(
 279        self, input_path: str, labels: list[str] = None
 280    ) -> dict[str, np.ndarray]:
 281        """
 282        Loads previously saved crop files from a folder.
 283        :param input_path: folder containing crop files.
 284        :param labels: optional label filter, will only return crops with these labels.
 285        :return: a dictionary mapping labels to the loaded crop images.
 286        """
 287        crops = {}
 288        for file in glob.glob(os.path.join(input_path, f"{self}-*")):
 289            label = os.path.splitext(os.path.basename(file))[0].split("-")[-1]
 290            # Skip if we have labels to target
 291            if labels is not None and label not in labels:
 292                continue
 293            crops[label] = imageio.imread(file)
 294        return crops
 295
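    # Round-trip sketch (hypothetical paths and channel labels):
    #     crops = event.get_crops(crop_size=100)
    #     labels = ["DAPI", "TRITC", "CY5", "FITC"]      # assumed channel names
    #     event.save_crops(crops, "/tmp/crops", labels)  # one file per label
    #     loaded = event.load_crops("/tmp/crops")        # {label: image} dictionary
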
 296    def get_montage_channels(
 297        self,
 298        channels: Sequence[int | str] | None = None,
 299        composites: dict[int | str, tuple[float, float, float]] | None = None,
 300    ) -> tuple[list[int], list[int], dict[int, tuple[float, float, float]]]:
 301        """
 302        Resolve the channel indices needed for a montage of this event.
 303        :param channels: channel indices or names for grayscale channels
 304        :param composites: dictionary of channel indices or names and RGB values
 305        :return: (1) channel indices to retrieve,
 306                 (2) relative grayscale channel indices, and
 307                 (3) composite channel indices and RGB values.
 308        """
 309        if channels is None:
 310            channels = list(range(len(self.tile.scan.channels)))
 311        if (len(channels) == 0) and (composites is None or len(composites) == 0):
 312            raise ValueError("Must provide at least one channel type to montage")
 313
 314        channels_to_get = []
 315
 316        # Build the list of channels to retrieve
 317        if channels is not None:
 318            if len(channels) > 0 and isinstance(channels[0], str):
 319                channels = self.tile.scan.get_channel_indices(channels)
 320            channels_to_get += channels
 321            order = list(range(len(channels)))  # Always the first n channels
 322        else:
 323            order = None
 324
 325        if composites is not None:
 326            relative_composites = {}  # Relative indices for retrieved channels
 327            # Convert to scan indices
 328            rgb_channels = list(composites.keys())
 329            if isinstance(rgb_channels[0], str):
 330                rgb_channels = self.tile.scan.get_channel_indices(rgb_channels)
 331            # Find the index or add to the end
 332            for channel, rgb in zip(rgb_channels, composites.values()):
 333                if channel not in channels_to_get:
 334                    channels_to_get.append(channel)
 335                    relative_composites[channel] = rgb
 336                else:
 337                    relative_composites[channels_to_get.index(channel)] = rgb
 338        else:
 339            relative_composites = None
 340
 341        return channels_to_get, order, relative_composites
 342
 343    def get_montage(
 344        self,
 345        channels: Sequence[int | str] = None,
 346        composites: dict[int | str, tuple[float, float, float]] = None,
 347        mask: np.ndarray[np.uint8] = None,
 348        labels: Sequence[str] = None,
 349        crop_size: int = 100,
 350        in_pixels: bool = True,
 351        input_path: str = None,
 352        apply_gain: bool = True,
 353        **kwargs,
 354    ) -> np.ndarray:
 355        """
 356        Convenience function for getting frame images and creating a montage. Mirrors
 357        csi_images.make_montage(). Convenient for a single event's montage, but less
 358        efficient when montaging multiple events from the same tile.
 359        :param channels: the channels to use for black-and-white montages.
 360        :param composites: dictionary of indices and RGB tuples for a composite.
 361        :param mask: a mask to apply to the montage. Must be the same size as the crop.
 362        :param labels: the labels to subtitle montage images, usually the channel names
 363        :param crop_size: the square size of the image crop to get for this event.
 364        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
 365        :param input_path: the path to the input images. Defaults to None (uses the scan's path).
 366        :param apply_gain: whether to apply scanner-calculated gain to the images, if
 367        not already applied. If a list, matches the channels.
 368        :param kwargs: montage options. See csi_images.make_montage() for more details.
 369        :return: numpy array representing the montage.
 370        """
 371        channels, order, composites = self.get_montage_channels(channels, composites)
 372        images = self.get_crops(crop_size, in_pixels, input_path, channels, apply_gain)
 373        return csi_images.make_montage(
 374            images, order, composites, mask, labels, **kwargs
 375        )
 376
 377    def save_montage(
 378        self,
 379        montage: np.ndarray,
 380        output_path: str,
 381        ocular_names: bool = False,
 382        tag: str = "",
 383        file_extension: str = ".jpeg",
 384        **kwargs,
 385    ):
 386        """
 387        Save the montage as an image file with a standardized name.
 388        :param montage: the montage to save.
 389        :param output_path: the folder to save the montage in. Will make if needed.
 390        :param ocular_names: whether to use the OCULAR naming convention.
 391        :param tag: a tag to append to the file name. Ignored if ocular_names is True.
 392        :param file_extension: the file extension to save the montage as. Defaults to .jpeg.
 393        :param kwargs: additional arguments to pass to imageio.imwrite().
 394        :return: None
 395        """
 396        if csi_images is None or imageio is None:
 397            raise ModuleNotFoundError(
 398                "imageio libraries not installed! "
 399                "run `pip install csi_images[imageio]` to resolve."
 400            )
 401
 402        montage = csi_images.scale_bit_depth(montage, np.uint8)
 403
 404        if not file_extension.startswith("."):
 405            file_extension = f".{file_extension}"
 406
 407        if ocular_names:
 408            if "cell_id" not in self.metadata.index:
 409                raise ValueError(
 410                    "Event metadata must include 'cell_id' for OCULAR naming."
 411                )
 412            file = f"{self.tile.n}-{self.metadata['cell_id']}-{self.x}-{self.y}{file_extension}"
 413        else:
 414            file = f"{self}{tag}{file_extension}"
 415
 416        os.makedirs(output_path, exist_ok=True)
 417        imageio.imwrite(os.path.join(output_path, file), montage, **kwargs)
 418
 419    def load_montage(self, input_path: str, tag: str = "") -> np.ndarray:
 420        """
 421        Loads the montage from a file saved by Event.save_montage.
 422        :param input_path: the path to the folder where the montage was saved.
 423        :param tag: a string to add to the file name, before the extension.
 424        :return: the montage image as a numpy array.
 425        """
 426        file = f"{self}{tag}.jpeg"
 427        return imageio.imread(os.path.join(input_path, file))
 428
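    # Montage sketch (hypothetical channel names): grayscale panels for the listed
    # channels plus an RGB composite built from channel -> (R, G, B) weightings.
    #     montage = event.get_montage(
    #         channels=["DAPI", "CY5"],                         # assumed channel names
    #         composites={"DAPI": (0, 0, 1), "CY5": (1, 0, 0)},
    #         crop_size=75,
    #     )
    #     event.save_montage(montage, "/tmp/montages")
    #     reloaded = event.load_montage("/tmp/montages")
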
 429    @classmethod
 430    def get_many_crops(
 431        cls,
 432        events: Sequence[Self],
 433        crop_size: int | Sequence[int] = 100,
 434        in_pixels: bool = True,
 435        input_path: str | Sequence[str] = None,
 436        channels: Sequence[int | str] = None,
 437        apply_gain: bool | Sequence[bool] = True,
 438    ) -> list[list[np.ndarray]]:
 439        """
 440        Get the crops for a list of events, ensuring that there is no wasteful reading
 441        of the same tile multiple times. This function is more efficient than calling
 442        get_crops() for each event.
 443        :param events: the events to get crops for.
 444        :param crop_size: the square size of the image crop to get for each event.
 445                          Defaults to 100. Can be a list matching the events.
 446        :param in_pixels: whether the crop size is in pixels or micrometers.
 447                          Defaults to pixels.
 448        :param input_path: the path to the input images. Will only work for lists of events
 449                           from the same scan. Defaults to None (uses the scan's path).
 450        :param channels: the channels to extract images for. Defaults to all channels.
 451        :param apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. Defaults to True.
 452                           Can be supplied as a list to apply gain to individual channels.
 453        :return: a list of lists of cropped images for each event.
 454        """
 455        if len(events) == 0:
 456            return []
 457        # Adapt singular inputs to lists of appropriate length
 458        if isinstance(crop_size, int):
 459            crop_size = [crop_size] * len(events)
 460        if input_path is None or isinstance(input_path, str):
 461            input_path = [input_path] * len(events)
 462
 463        # Get the order of the events when sorted by slide/tile
 464        order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
 465
 466        # Allocate the list to size
 467        crops = [[]] * len(events)
 468        last_tile = None
 469        images = None  # Holds large numpy arrays, so expensive to compare
 470        # Iterate through in slide/tile sorted order
 471        for i in order:
 472            if last_tile != events[i].tile:
 473                # Gather the frame images, preserving them for the next event
 474                frames = Frame.get_frames(events[i].tile, channels)
 475                if isinstance(apply_gain, bool):
 476                    apply = [apply_gain] * len(frames)
 477                else:
 478                    apply = apply_gain
 479                images = [f.get_image(input_path[i], a) for f, a in zip(frames, apply)]
 480                last_tile = events[i].tile
 481            # Use the frame images to crop the event images
 482            crops[i] = events[i].crop(images, crop_size[i], in_pixels)
 483        return crops
 484
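    # Batch sketch: when many events share tiles, the classmethod reads each tile's
    # frames once instead of once per event.
    #     all_crops = Event.get_many_crops(events, crop_size=100, channels=[0, 1])
    #     # all_crops[i] holds the crops for events[i], in the original input order
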
 485    @classmethod
 486    def get_many_montages(
 487        cls,
 488        events: Sequence[Self],
 489        channels: Sequence[int | str] = None,
 490        composites: dict[int | str, tuple[float, float, float]] = None,
 491        masks: Sequence[np.ndarray[np.uint8]] = None,
 492        labels: Sequence[str] = None,
 493        crop_size: int = 100,
 494        in_pixels: bool = True,
 495        input_path: str = None,
 496        apply_gain: bool | Iterable[bool] = True,
 497        **kwargs,
 498    ) -> list[np.ndarray]:
 499        """
 500        Convenience function for get_montage(), but for a list of events. More efficient
 501        than get_montage() when working with multiple events from the same tile.
 502        :param events: a list of Event objects.
 503        :param channels: the channels to extract images for. Defaults to all channels.
 504        :param composites: dictionary of indices and RGB tuples for a composite.
 505        :param masks: a list of masks to apply to the montages. Must be the same size as the crops.
 506        :param labels: the labels to subtitle montage images, usually the channel names
 507        :param crop_size: the square size of the image crop to get for this event.
 508        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
 509        :param input_path: the path to the input images. Defaults to None (uses the scan's path).
 510        :param apply_gain: whether to apply scanner-calculated gain to the images, if
 511        not already applied. If a list, matches the channels.
 512        :param kwargs: montage options. See csi_images.make_montage() for more details.
 513        :return: a list of numpy arrays representing the montages.
 514        """
 515        if len(events) == 0:
 516            return []
 517        # Adapt singular inputs to lists of appropriate length
 518        if isinstance(crop_size, int):
 519            crop_size = [crop_size] * len(events)
 520        if input_path is None or isinstance(input_path, str):
 521            input_path = [input_path] * len(events)
 522        if masks is None or isinstance(masks, np.ndarray):
 523            masks = [masks] * len(events)
 524
 525        # Get the order of the events when sorted by slide/tile
 526        event_order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
 527
 528        # Allocate the list to size
 529        montages = [np.empty(0)] * len(events)
 530        # Placeholder variables to avoid rereading the same tile
 531        images = None  # Holds large numpy arrays, so expensive to compare
 532        order = None
 533        rel_composites = None
 534        last_tile = None
 535        # Iterate through in slide/tile sorted order
 536        for i in event_order:
 537            if last_tile != events[i].tile:
 538                channels_to_get, order, rel_composites = events[i].get_montage_channels(
 539                    channels, composites
 540                )
 541                # Gather the frame images, preserving them for the next event
 542                frames = Frame.get_frames(events[i].tile, channels_to_get)
 543                if isinstance(apply_gain, bool):
 544                    apply = [apply_gain] * len(frames)
 545                else:
 546                    apply = apply_gain
 547                images = [f.get_image(input_path[i], a) for f, a in zip(frames, apply)]
 548                last_tile = events[i].tile
 549            # Use the frame images to crop the event images and make montages
 550            crops = events[i].crop(images, crop_size[i], in_pixels)
 551            montages[i] = csi_images.make_montage(
 552                crops, order, rel_composites, masks[i], labels, **kwargs
 553            )
 554
 555        return montages
 556
 557    @classmethod
 558    def get_and_save_many_crops(
 559        cls,
 560        events: list[Self],
 561        output_path: str,
 562        labels: Sequence[str],
 563        ext: str = "auto",
 564        additional_gain: Sequence[float] = None,
 565        **kwargs,
 566    ) -> None:
 567        """
 568        Get and save the crops for a list of events, ensuring that there is no wasteful
 569        reading and limiting the image data in memory to 1 tile at a time. This function
 570        is more efficient than chaining get_crops() and save_crops() for each event or
 571        get_many_crops() and then save_crops().
 572        :param events: list of events to get, crop, and save.
 573        :param output_path: the folder to save the crops in. Will make if needed.
 574        :param labels: the labels to save the crops with. See save_crops().
 575        :param ext: the file extension to save the crops as. See save_crops().
 576        :param additional_gain: additional gain to apply to the crops. If not None, must
 577        match the length of the number of crop channels.
 578        :param kwargs: see get_many_crops() for more parameters.
 579        :return:
 580        """
 581        unique_tiles = set([event.tile for event in events])
 582
 583        for tile in unique_tiles:
 584            # Get one tile's worth of event crops
 585            tile_events = [e for e in events if e.tile == tile]
 586            crops_list = cls.get_many_crops(tile_events, **kwargs)
 587            for event, crops in zip(tile_events, crops_list):
 588                # Apply any additional gains
 589                if additional_gain is not None:
 590                    crops = [gain * crop for gain, crop in zip(additional_gain, crops)]
 591                event.save_crops(crops, output_path, labels, ext)
 592
 593    @classmethod
 594    def get_and_save_many_montages(
 595        cls,
 596        events: list[Self],
 597        output_path: str,
 598        ocular_names: bool = False,
 599        tag: str = "",
 600        **kwargs,
 601    ) -> None:
 602        """
 603        Save montages of the events to image files.
 604        :param events: the events to get, montage, and save.
 605        :param output_path: the folder to save the montages to. Will make if needed.
 606        :param ocular_names: whether to use the OCULAR naming convention.
 607        :param tag: a tag to append to the file name. Ignored if ocular_names is True.
 608        :param kwargs: see get_many_montages() for more parameters.
 609        """
 610        unique_tiles = set([event.tile for event in events])
 611
 612        for tile in unique_tiles:
 613            # Get one tile's worth of event crops
 614            tile_events = [e for e in events if e.tile == tile]
 615            montages = cls.get_many_montages(tile_events, **kwargs)
 616            for event, montage in zip(tile_events, montages):
 617                event.save_montage(montage, output_path, ocular_names, tag)
 618
 619
 620class EventArray:
 621    """
 622    A class that holds a large number of events' data, making it easy to analyze and
  623    manipulate many events at once; data is stored in DataFrames rather than in individual Event objects.
 624    """
 625
 626    INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y"]
 627
 628    def __init__(
 629        self,
 630        info: pd.DataFrame = None,
 631        metadata: pd.DataFrame = None,
 632        features: pd.DataFrame = None,
 633    ):
 634
 635        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y"
 636        self.info = info
 637        if self.info is not None:
  638            # Special case: "roi" is often not required, so we'll fill it in if it's missing
 639            if "roi" not in info.columns:
 640                self.info = self.info.assign(roi=0)
 641            if set(self.info.columns) != set(self.INFO_COLUMNS):
 642                raise ValueError(
  643                    f"EventArray.info must have columns: "
 644                    f"{self.INFO_COLUMNS}; had {list(self.info.columns)}"
 645                )
 646            # Ensure order and data types
 647            self.info = pd.DataFrame(
 648                {
 649                    "slide_id": self.info["slide_id"].astype(str),
 650                    "tile": self.info["tile"].astype(np.uint16),
 651                    "roi": self.info["roi"].astype(np.uint8),
 652                    "x": self.info["x"].round().astype(np.uint16),
 653                    "y": self.info["y"].round().astype(np.uint16),
 654                }
 655            )
 656
 657        # All DataFrames must all have the same number of rows
 658        if metadata is not None and (info is None or len(info) != len(metadata)):
 659            raise ValueError(
 660                "If EventArray.metadata is not None, it should match rows with .info"
 661            )
 662        if features is not None and (info is None or len(info) != len(features)):
 663            raise ValueError(
 664                "If EventArray.features is not None, it should match rows with .info"
 665            )
 666        # No columns named "metadata_", "features_", or "None"
 667        column_names = []
 668        if metadata is not None:
 669            column_names += metadata.columns.tolist()
 670        if features is not None:
 671            column_names += features.columns.tolist()
 672        if any([col.lower().startswith("metadata_") for col in column_names]):
 673            raise ValueError("EventArray column names cannot start with 'metadata_'")
 674        if any([col.lower().startswith("features_") for col in column_names]):
 675            raise ValueError("EventArray column names cannot start with 'features_'")
 676        if any([col.lower() == "none" for col in column_names]):
 677            raise ValueError("EventArray column names cannot be 'none'")
 678
 679        # Add metadata and features
 680        self.metadata = None
 681        self.features = None
 682        if metadata is not None:
 683            self.add_metadata(metadata)
 684        if features is not None:
 685            self.add_features(features)
 686
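    # Construction sketch (hypothetical values): info needs slide_id, tile, x, and y
    # ("roi" is filled with 0 if omitted); metadata/features must match its length.
    #     info = pd.DataFrame({"slide_id": ["S1", "S1"], "tile": [3, 7],
    #                          "x": [120, 450], "y": [88, 302]})
    #     array = EventArray(info, metadata=pd.DataFrame({"cell_id": [1, 2]}))
    #     len(array)  # 2
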
 687    def __len__(self) -> int:
 688        # Convenience method to get the number of events
 689        if self.info is None:
 690            return 0
 691        else:
 692            return len(self.info)
 693
 694    def __eq__(self, other):
 695        # Parse all possibilities for info
 696        if isinstance(self.info, pd.DataFrame):
 697            if isinstance(other.info, pd.DataFrame):
 698                if not self.info.equals(other.info):
 699                    return False
 700            else:
 701                return False
 702        elif self.info is None:
 703            if other.info is not None:
 704                return False
 705
 706        # Parse all possibilities for metadata
 707        if isinstance(self.metadata, pd.DataFrame):
 708            if isinstance(other.metadata, pd.DataFrame):
 709                is_equal = self.metadata.equals(other.metadata)
 710                if not is_equal:
 711                    return False
 712            else:
 713                return False
 714        elif self.metadata is None:
 715            if other.metadata is not None:
 716                return False
 717
 718        # Parse all possibilities for features
 719        if isinstance(self.features, pd.DataFrame):
 720            if isinstance(other.features, pd.DataFrame):
 721                is_equal = self.features.equals(other.features)
 722                if not is_equal:
 723                    return False
 724            else:
 725                return False
 726        elif self.features is None:
 727            if other.features is not None:
 728                return False
 729
  730        return True
 731
 732    def get_sort_order(
 733        self, by: Hashable | Sequence[Hashable], ascending: bool | Sequence[bool] = True
 734    ):
 735        """
 736        Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.
 737        :param by: name of the column(s) to sort by.
 738        :param ascending: whether to sort in ascending order; can be a list to match by
 739        :return: the order of the indices to sort by.
 740        """
 741        columns = self.get(by)
 742        return columns.sort_values(by=by, ascending=ascending).index
 743
 744    def sort(
 745        self,
 746        by: Hashable | Sequence[Hashable],
 747        ascending: bool | Sequence[bool] = True,
 748    ) -> Self:
 749        """
 750        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
 751        :param by: name of the column(s) to sort by.
 752        :param ascending: whether to sort in ascending order; can be a list to match by
 753        :return: a new, sorted EventArray.
 754        """
 755        order = self.get_sort_order(by, ascending)
 756        info = self.info.loc[order].reset_index(drop=True)
 757        if self.metadata is not None:
 758            metadata = self.metadata.loc[order].reset_index(drop=True)
 759        else:
 760            metadata = None
 761        if self.features is not None:
 762            features = self.features.loc[order].reset_index(drop=True)
 763        else:
 764            features = None
 765        return EventArray(info, metadata, features)
 766
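    # Query sketch (assumes a "mean_intensity" features column): get(), rows(), and
    # sort() look a column up in info, then metadata, then features.
    #     coords = array.get(["x", "y"])
    #     bright = array.rows(array.get("mean_intensity")["mean_intensity"] > 1000)
    #     ranked = array.sort("mean_intensity", ascending=False)
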
 767    def get(self, column_names: Hashable | Sequence[Hashable]) -> pd.DataFrame:
 768        """
 769        Get a DataFrame with the specified columns from the EventArray, by value.
 770        :param column_names: the names of the columns to get.
 771        :return: a DataFrame with the specified columns.
 772        """
 773        if isinstance(column_names, Hashable):
 774            column_names = [column_names]  # Drop into a list for the loop
 775        columns = []
 776        for column_name in column_names:
 777            if column_name in self.info.columns:
 778                columns.append(self.info[column_name])
 779            elif self.metadata is not None and column_name in self.metadata.columns:
 780                columns.append(self.metadata[column_name])
 781            elif self.features is not None and column_name in self.features.columns:
 782                columns.append(self.features[column_name])
 783            else:
 784                raise ValueError(f"Column {column_name} not found in EventArray")
 785        return pd.concat(columns, axis=1)
 786
 787    def rows(self, rows: Sequence[Hashable]) -> Self:
 788        """
 789        Get a subset of the EventArray rows based on a boolean or integer index, by value.
 790        :param rows: row labels, indices, or boolean mask; anything for .loc[]
 791        :return: a new EventArray with the subset of events.
 792        """
 793        info = self.info.loc[rows].reset_index(drop=True)
 794        if self.metadata is not None:
 795            metadata = self.metadata.loc[rows].reset_index(drop=True)
 796        else:
 797            metadata = None
 798        if self.features is not None:
 799            features = self.features.loc[rows].reset_index(drop=True)
 800        else:
 801            features = None
 802        return EventArray(info, metadata, features)
 803
 804    def copy(self) -> Self:
 805        """
 806        Create a deep copy of the EventArray.
 807        :return: a deep copy of the EventArray.
 808        """
 809        return EventArray(
 810            info=self.info.copy(),
 811            metadata=None if self.metadata is None else self.metadata.copy(),
 812            features=None if self.features is None else self.features.copy(),
 813        )
 814
 815    # TODO: add a "filter" convenience function that takes a column name and values to filter by
 816
 817    def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None:
 818        """
 819        Add metadata to the EventArray. Removes the need to check if metadata is None.
 820        Overwrites any existing metadata with the same column names as the new metadata.
 821        :param new_metadata: the metadata to add.
 822        """
 823        if self.info is None or len(self.info) != len(new_metadata):
 824            raise ValueError("New metadata must match length of existing info")
 825
 826        if isinstance(new_metadata, pd.Series):
 827            # Convert to a DataFrame
 828            new_metadata = pd.DataFrame(new_metadata)
 829
 830        for col in new_metadata.columns:
 831            if col in self.INFO_COLUMNS:
 832                warnings.warn(
 833                    f"Column name {col} is reserved for info; you can only "
 834                    "access this column through the .metadata attribute"
 835                )
 836            elif self.features is not None and col in self.features.columns:
 837                warnings.warn(
 838                    f"Column name {col} also exists in the .features attribute; "
 839                    f"calling this.get({col}) will return the .metadata column"
 840                )
 841
 842        if self.metadata is None:
 843            self.metadata = new_metadata
 844        else:
 845            self.metadata.loc[:, new_metadata.columns] = new_metadata
 846
 847    def add_features(self, new_features: pd.Series | pd.DataFrame) -> None:
 848        """
 849        Add features to the EventArray. Removes the need to check if features is None.
 850        Overwrites any existing features with the same column names as the new features.
 851        :param new_features: the features to add.
 852        """
 853        if self.info is None or len(self.info) != len(new_features):
 854            raise ValueError("New features must match length of existing info")
 855
 856        if isinstance(new_features, pd.Series):
 857            # Convert to a DataFrame
 858            new_features = pd.DataFrame(new_features)
 859
 860        for col in new_features.columns:
 861            if col in self.INFO_COLUMNS:
 862                warnings.warn(
 863                    f"Column name {col} is reserved for info; you can only "
 864                    "access this column through the .features attribute"
 865                )
 866            elif self.metadata is not None and col in self.metadata.columns:
 867                warnings.warn(
  868                    f"Column name {col} already exists in the .metadata attribute; "
 869                    f"calling this.get({col}) will return the .metadata column"
 870                )
 871
 872        if self.features is None:
 873            self.features = new_features
 874        else:
 875            self.features.loc[:, new_features.columns] = new_features
 876
 877    @classmethod
 878    def merge(cls, events: Iterable[Self]) -> Self:
 879        """
 880        Combine EventArrays in a list into a single EventArray.
  881        :param events: the EventArrays to merge into one.
 882        """
 883        all_info = []
 884        all_metadata = []
 885        all_features = []
 886        for event_array in events:
 887            # Skip empty EventArrays
 888            if event_array.info is not None:
 889                all_info.append(event_array.info)
 890            if event_array.metadata is not None:
 891                all_metadata.append(event_array.metadata)
 892            if event_array.features is not None:
 893                all_features.append(event_array.features)
 894        if len(all_info) == 0:
 895            return EventArray()
 896        else:
 897            all_info = pd.concat(all_info, ignore_index=True)
 898        if len(all_metadata) == 0:
 899            all_metadata = None
 900        else:
 901            all_metadata = pd.concat(all_metadata, ignore_index=True)
 902        if len(all_features) == 0:
 903            all_features = None
 904        else:
 905            all_features = pd.concat(all_features, ignore_index=True)
 906
 907        return EventArray(all_info, all_metadata, all_features)
 908
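    # Merge sketch: per-tile EventArrays produced in parallel can be combined into
    # one array for slide-level analysis.
    #     combined = EventArray.merge([array_a, array_b, array_c])
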
 909    def to_events(
 910        self,
 911        scans: Scan | Iterable[Scan],
 912        ignore_missing_scans=True,
 913        ignore_metadata=False,
 914        ignore_features=False,
 915    ) -> list[Event]:
 916        """
 917        Get the events in the EventArray as a list of events. Returns [] if empty.
 918        :param scans: the scans that the events belong to, auto-matched by slide_id.
  919        Pass None if you don't care about scan metadata (and set ignore_missing_scans=True).
 920        :param ignore_missing_scans: whether to create blank scans for events without scans.
 921        :param ignore_metadata: whether to ignore metadata or not
 922        :param ignore_features: whether to ignore features or not
  923        :return: a list of Event objects.
 924        """
 925        if len(self) == 0:
 926            return []
 927        if isinstance(scans, Scan):
 928            scans = [scans]
  929        scans = {scan.slide_id: scan for scan in scans or []}
 930        events = []
 931        for i in range(len(self.info)):
 932            # Determine the associated scan
 933            slide_id = self.info["slide_id"][i]
 934            if slide_id not in scans:
 935                if ignore_missing_scans:
 936                    # Create a placeholder scan if the scan is missing
 937                    scan = Scan.make_placeholder(
 938                        slide_id,
 939                        self.info["tile"][i],
 940                        self.info["roi"][i],
 941                    )
 942                else:
 943                    raise ValueError(
 944                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
 945                    )
 946            else:
 947                scan = scans[slide_id]
 948
 949            # Prepare the metadata and features
 950            if ignore_metadata or self.metadata is None:
 951                metadata = None
 952            else:
 953                # This Series creation method is less efficient,
 954                # but required for preserving dtypes
 955                metadata = pd.Series(
 956                    {col: self.metadata.loc[i, col] for col in self.metadata.columns},
 957                    dtype=object,
 958                )
 959            if ignore_features or self.features is None:
 960                features = None
 961            else:
 962                features = pd.Series(
 963                    {col: self.features.loc[i, col] for col in self.features.columns},
 964                    dtype=object,
 965                )
 966            # Create the event and append it to the list
 967            events.append(
 968                Event(
 969                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
 970                    self.info["x"][i],
 971                    self.info["y"][i],
 972                    metadata=metadata,
 973                    features=features,
 974                )
 975            )
 976        return events
 977
 978    @classmethod
 979    def from_events(cls, events: Iterable[Event]) -> Self:
 980        """
  981        Create an EventArray from a list of events.
  982        :param events: the list of events to convert.
 983        """
 984        info = pd.DataFrame(
 985            {
 986                "slide_id": [event.tile.scan.slide_id for event in events],
 987                "tile": [event.tile.n for event in events],
 988                "roi": [event.tile.n_roi for event in events],
 989                "x": [event.x for event in events],
 990                "y": [event.y for event in events],
 991            }
 992        )
 993        metadata_list = [event.metadata for event in events]
 994        # Iterate through and ensure that all metadata is the same shape
 995        for metadata in metadata_list:
 996            if type(metadata) != type(metadata_list[0]):
 997                raise ValueError("All metadata must be the same type.")
 998            if metadata is not None and metadata.shape != metadata_list[0].shape:
 999                raise ValueError("All metadata must be the same shape.")
1000        if metadata_list[0] is None:
1001            metadata = None
1002        else:
1003            metadata = pd.DataFrame(metadata_list)
1004        features_list = [event.features for event in events]
1005        # Iterate through and ensure that all features are the same shape
1006        for features in features_list:
1007            if type(features) != type(features_list[0]):
1008                raise ValueError("All features must be the same type.")
1009            if features is not None and features.shape != features_list[0].shape:
1010                raise ValueError("All features must be the same shape.")
1011        if features_list[0] is None:
1012            features = None
1013        else:
1014            features = pd.DataFrame(features_list)
1015        return EventArray(info=info, metadata=metadata, features=features)
1016
1017    def to_dataframe(self) -> pd.DataFrame:
1018        """
1019        Convert all the data in the EventArray to a single DataFrame.
1020        :return: a DataFrame with all the data in the EventArray.
1021        """
 1022        # Make a copy of the info DataFrame; info columns keep their original names
1023        output = self.info.copy()
1024        # Combine with the metadata and prepend "metadata_" to the column names
1025        if self.metadata is not None:
1026            metadata = self.metadata.copy()
1027            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
1028            output = pd.concat([output, metadata], axis=1)
1029        # Combine with the features and prepend "features_" to the column names
1030        if self.features is not None:
1031            features = self.features.copy()
1032            features.columns = [f"features_{col}" for col in features.columns]
1033            output = pd.concat([output, features], axis=1)
1034        return output
1035
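    # Round-trip sketch: metadata/features columns gain "metadata_"/"features_"
    # prefixes in the flat DataFrame and are split back out by from_dataframe().
    #     df = array.to_dataframe()
    #     same_array = EventArray.from_dataframe(df)  # should compare equal to array
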
1036    @classmethod
1037    def from_dataframe(
1038        cls, df, metadata_prefix: str = "metadata_", features_prefix: str = "features_"
1039    ) -> Self:
1040        """
1041        From a single, special DataFrame, create an EventArray.
1042        :param df: the DataFrame to convert to an EventArray.
1043        :param metadata_prefix: the prefix for metadata columns.
1044        :param features_prefix: the prefix for features columns.
1045        :return: an EventArray containing the data from the DataFrame.
1046        """
1047        # Split the columns into info, metadata, and features and strip prefix
1048        info = df[[col for col in df.columns if col in cls.INFO_COLUMNS]].copy()
1049        if info.size == 0:
1050            info = None
1051        metadata = df[[col for col in df.columns if col.startswith(metadata_prefix)]].copy()
1052        metadata.columns = [
1053            col.replace(metadata_prefix, "") for col in metadata.columns
1054        ]
1055        if metadata.size == 0:
1056            metadata = None
1057        features = df[[col for col in df.columns if col.startswith(features_prefix)]].copy()
1058        features.columns = [
1059            col.replace(features_prefix, "") for col in features.columns
1060        ]
1061        if features.size == 0:
1062            features = None
1063        return cls(info=info, metadata=metadata, features=features)
1064
1065    @classmethod
1066    def from_mask(
1067        cls,
1068        mask: np.ndarray,
1069        tile: Tile,
1070        include_cell_id: bool = True,
1071        images: list[np.ndarray] = None,
1072        image_labels: list[str] = None,
1073        properties: list[str] = None,
1074    ) -> Self:
1075        """
1076        Extract events from a labeled mask image, including metadata and features.
1077        :param mask: the mask to extract events from.
1078        :param tile: the Tile object associated with this mask.
1079        :param include_cell_id: whether to include the cell_id, or numerical
1080        mask label, as metadata in the EventArray.
1081        :param images: the intensity images to extract features from.
1082        :param image_labels: the labels for the intensity images.
1083        :param properties: list of properties to extract in addition to the defaults.
1084        :return: EventArray corresponding to the mask labels.
1085        """
1086        if csi_images is None:
1087            raise ModuleNotFoundError(
1088                "imageio libraries not installed! "
1089                "run `pip install csi_images[imageio]` to resolve."
1090            )
1091        # Gather mask_info
1092        if images is not None and image_labels is not None:
1093            if len(images) != len(image_labels):
1094                raise ValueError("Intensity images and labels must match lengths.")
1095
1096        mask_info = csi_images.extract_mask_info(mask, images, image_labels, properties)
1097
1098        if len(mask_info) == 0:
1099            return EventArray()
1100
1101        # Combine provided info and mask info
1102        info = pd.DataFrame(
1103            {
1104                "slide_id": tile.scan.slide_id,
1105                "tile": tile.n,
1106                "roi": tile.n_roi,
1107                "x": mask_info["x"],
1108                "y": mask_info["y"],
1109            },
1110        )
1111        # Extract a metadata column if desired
1112        if include_cell_id:
1113            metadata = pd.DataFrame({"cell_id": mask_info["id"]})
1114        else:
1115            metadata = None
1116        # If any additional properties were extracted, add them as features
1117        mask_info = mask_info.drop(columns=["id", "x", "y"], errors="ignore")
1118        if len(mask_info.columns) > 0:
1119            features = mask_info
1120            features.columns = [col.lower() for col in features.columns]
1121        else:
1122            features = None
1123        return EventArray(info, metadata, features)
1124
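    # Segmentation sketch (hypothetical inputs): build an EventArray from one tile's
    # labeled mask, keeping the mask label as "cell_id" metadata and any intensity
    # measurements as features.
    #     array = EventArray.from_mask(mask, tile, images=[dapi_image], image_labels=["DAPI"])
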
1125    def save_csv(self, output_path: str) -> bool:
1126        """
1127        Save the events to a CSV file, including metadata and features.
1128        :param output_path:
1129        :return:
1130        """
1131        if not output_path.endswith(".csv"):
1132            output_path += ".csv"
1133        self.to_dataframe().to_csv(output_path, index=False)
1134        return os.path.exists(output_path)
1135
1136    @classmethod
1137    def load_csv(
1138        cls,
1139        input_path: str,
1140        metadata_prefix: str = "metadata_",
1141        features_prefix: str = "features_",
1142    ) -> Self:
1143        """
1144        Load the events from a CSV file, including metadata and features.
1145        :param input_path:
1146        :param metadata_prefix:
1147        :param features_prefix:
1148        :return:
1149        """
1150        # Load the CSV file
1151        df = pd.read_csv(input_path)
1152        return cls.from_dataframe(df, metadata_prefix, features_prefix)
1153
1154    def save_json(self, output_path: str, orient: str = "records") -> bool:
1155        """
1156        Save the events to a JSON file, including metadata and features.
1157        :param output_path:
1158        :param orient: the orientation of the JSON file, see pandas.DataFrame.to_json()
1159        :return:
1160        """
1161        if not output_path.endswith(".json"):
1162            output_path += ".json"
1163        self.to_dataframe().to_json(output_path, orient=orient, indent=2)
1164        return os.path.exists(output_path)
1165
1166    @classmethod
1167    def load_json(
1168        cls,
1169        input_path: str,
1170        metadata_prefix: str = "metadata_",
1171        features_prefix: str = "features_",
1172    ) -> Self:
1173        """
1174        Load the events from a JSON file, including metadata and features.
1175        :param input_path:
1176        :param metadata_prefix:
1177        :param features_prefix:
1178        :return:
1179        """
1180        # Load the JSON file
1181        df = pd.read_json(input_path, orient="records")
1182        return cls.from_dataframe(df, metadata_prefix, features_prefix)
1183
1184    def save_hdf5(
1185        self, output_path: str, complevel: int = 1, complib="blosc:zstd"
1186    ) -> bool:
1187        """
1188        Save the events to an HDF5 file, including metadata and features.
1189        Uses the pandas-provided HDF5 functions for ease, and external compatibility,
1190        though these files are slightly harder to view in HDFView or similar.
1191        Compression defaults remain very quick while cutting file size by 50%+.
1192        :param output_path:
1193        :param complevel: see pandas.HDFStore for more details.
1194        :param complib: see pandas.HDFStore for more details.
1195        :return:
1196        """
1197        if not output_path.endswith(".hdf5") and not output_path.endswith(".h5"):
1198            output_path += ".hdf5"
1199        # Open the output_path as an HDF5 file
1200        with pd.HDFStore(
1201            output_path, mode="w", complevel=complevel, complib=complib
1202        ) as store:
1203            # Store the dataframes in the HDF5 file
1204            if self.info is not None:
1205                store.put("info", self.info, index=False)
1206            if self.metadata is not None:
1207                store.put("metadata", self.metadata, index=False)
1208            if self.features is not None:
1209                store.put("features", self.features, index=False)
1210        return os.path.exists(output_path)
1211
1212    @classmethod
1213    def load_hdf5(cls, input_path: str) -> Self:
1214        """
1215        Load the events from an HDF5 file, including metadata and features.
1216        :param input_path: the path of the HDF5 file to load.
1217        :return: the loaded EventArray.
1218        """
1219        # Open the input_path as an HDF5 file
1220        with pd.HDFStore(input_path, "r") as store:
1221            # Load the dataframes from the HDF5 file
1222            info = store.get("info") if "info" in store else None
1223            metadata = store.get("metadata") if "metadata" in store else None
1224            features = store.get("features") if "features" in store else None
1225        return cls(info=info, metadata=metadata, features=features)
1226
1227    def save_ocular(self, output_path: str, event_type: str = "cells"):
1228        """
1229        Save the events to an OCULAR file. Relies on the dataframe originating
1230        from an OCULAR file (same columns; duplicate metadata/info).
1231        :param output_path: the folder to save the OCULAR files in.
1232        :param event_type: "cells" or "others", determining which file set to write.
1233        :return: None
1234        """
1235        if pyreadr is None:
1236            raise ModuleNotFoundError(
1237                "pyreadr not installed! Install pyreadr directly "
1238                "or install with `pip install csi-images[rds]` to resolve."
1239            )
1240        if event_type == "cells":
1241            file_stub = "rc-final"
1242        elif event_type == "others":
1243            file_stub = "others-final"
1244        else:
1245            raise ValueError("Invalid event type. Must be cells or others.")
1246
1247        # Ensure good metadata
1248        metadata = pd.DataFrame(
1249            {
1250                "slide_id": self.info["slide_id"],
1251                "frame_id": self.info["tile"] + 1,  # Convert to 1-indexed for R
1252                "cell_id": (
1253                    self.metadata["cell_id"]
1254                    if self.metadata is not None and "cell_id" in self.metadata.columns
1255                    else range(len(self.info))
1256                ),
1257                "cellx": self.info["x"],
1258                "celly": self.info["y"],
1259            }
1260        )
1261        if self.metadata is not None:
1262            metadata[self.metadata.columns] = self.metadata.copy()
1263
1264        # Check for the "ocular_interesting" column
1265        if event_type == "cells":
1266            if "ocular_interesting" in metadata.columns:
1267                interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool)
1268            elif "hcpc" in metadata.columns:
1269                # Interesting cells don't get an hcpc designation, leaving them as -1
1270                interesting_rows = (
1271                    metadata["hcpc"].to_numpy() == -1
1272                )  # interesting cells
1273            else:
1274                interesting_rows = []
1275            if sum(interesting_rows) > 0:
1276                # Split the metadata into interesting and regular
1277                interesting_events = self.rows(interesting_rows)
1278                interesting_df = pd.concat(
1279                    [interesting_events.features, interesting_events.metadata], axis=1
1280                )
1281                data_events = self.rows(~interesting_rows)
1282                data_df = pd.concat(
1283                    [data_events.features, data_events.metadata], axis=1
1284                )
1285                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")
1286
1287                # Drop particular columns for "interesting"
1288                interesting_df = interesting_df.drop(
1289                    [
1290                        "clust",
1291                        "hcpc",
1292                        "frame_id",
1293                        "cell_id",
1294                        "unique_id",
1295                        "ocular_interesting",
1296                    ],
1297                    axis=1,
1298                    errors="ignore",
1299                )
1300                # Save both .csv and .rds
1301                interesting_stub = os.path.join(output_path, "ocular_interesting")
1302                interesting_df.to_csv(f"{interesting_stub}.csv")
1303                # Suppress pandas FutureWarning
1304                with warnings.catch_warnings():
1305                    warnings.simplefilter(action="ignore", category=FutureWarning)
1306                    pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df)
1307            else:
1308                data_df = pd.concat([self.features, metadata], axis=1)
1309        else:
1310            # Get all data and reset_index (will copy it)
1311            data_df = pd.concat([self.features, metadata], axis=1)
1312
1313        # Split based on cluster number to conform to *-final[1-4].rds
1314        n_clusters = max(data_df["clust"]) + 1
1315        split_idx = [round(i * n_clusters / 4) for i in range(5)]
1316        for i in range(4):
1317            subset = (split_idx[i] <= data_df["clust"]) & (
1318                data_df["clust"] < split_idx[i + 1]
1319            )
1320            data_df.loc[subset, "hcpc"] = i + 1
1321            subset = data_df[subset].reset_index(drop=True)
1322            # Suppress pandas FutureWarning
1323            with warnings.catch_warnings():
1324                warnings.simplefilter(action="ignore", category=FutureWarning)
1325                pyreadr.write_rds(
1326                    os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
1327                )
1328
1329        # Create new example cell strings
1330        data_df["example_cell_id"] = (
1331            data_df["slide_id"]
1332            + " "
1333            + data_df["frame_id"].astype(str)
1334            + " "
1335            + data_df["cell_id"].astype(str)
1336            + " "
1337            + data_df["cellx"].astype(int).astype(str)
1338            + " "
1339            + data_df["celly"].astype(int).astype(str)
1340        )
1341        # Find averagable data columns
1342        if "cellcluster_id" in data_df.columns:
1343            end_idx = data_df.columns.get_loc("cellcluster_id")
1344        else:
1345            end_idx = data_df.columns.get_loc("slide_id")
1346        avg_cols = data_df.columns[:end_idx].tolist()
1347        # Group by cluster and average
1348        data_df = data_df.groupby("clust").agg(
1349            **{col: (col, "mean") for col in avg_cols},
1350            count=("clust", "size"),  # count rows in each cluster
1351            example_cells=("example_cell_id", lambda x: ",".join(x)),
1352            hcpc=("hcpc", lambda x: x.iloc[0]),
1353        )
1354        data_df = data_df.reset_index()  # Do NOT drop, index is "clust"
1355        # Create new columns
1356        metadata = pd.DataFrame(
1357            {
1358                "count": data_df["count"],
1359                "example_cells": data_df["example_cells"],
1360                "clust": data_df["clust"].astype(int),
1361                "hcpc": data_df["hcpc"].astype(int),
1362                "id": data_df["clust"].astype(int).astype(str),
1363                "cccluster": "0",  # Dummy value
1364                "ccdistance": 0.0,  # Dummy value
1365                "rownum": list(range(len(data_df))),
1366                "framegroup": 0,  # Dummy value
1367            }
1368        )
1369        # Need to pad the features to 761 columns, as required by the OCULAR report format
1370        additional_columns = range(len(avg_cols), 761)
1371        if len(additional_columns) > 0:
1372            padding = pd.DataFrame(
1373                np.zeros((len(data_df), len(additional_columns))),
1374                columns=[f"pad{i}" for i in additional_columns],
1375            )
1376            data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1)
1377        else:
1378            data_df = pd.concat([data_df[avg_cols], metadata], axis=1)
1379
1380        # Save the cluster data
1381        data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"))
1382        # Suppress pandas FutureWarning
1383        with warnings.catch_warnings():
1384            warnings.simplefilter(action="ignore", category=FutureWarning)
1385            pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)
1386
1387    @classmethod
1388    def load_ocular(
1389        cls,
1390        input_path: str,
1391        event_type="cells",
1392        cell_data_files=(
1393            "rc-final1.rds",
1394            "rc-final2.rds",
1395            "rc-final3.rds",
1396            "rc-final4.rds",
1397            "ocular_interesting.rds",
1398        ),
1399        others_data_files=(
1400            "others-final1.rds",
1401            "others-final2.rds",
1402            "others-final3.rds",
1403            "others-final4.rds",
1404        ),
1405        atlas_data_files=(
1406            "ocular_interesting.rds",
1407            "ocular_not_interesting.rds",
1408        ),
1409        drop_common_events=True,
1410    ) -> Self:
1411        """
1412        Load the events from OCULAR output files (.rds), including metadata and features.
1413        :param input_path: the OCULAR output folder, or a specific .rds file to load.
1414        :param event_type: "cells" or "others", determining which file set to read.
1415        :param cell_data_files: the file names to read when event_type is "cells".
1416        :param others_data_files: the file names to read when event_type is "others".
1417        :param atlas_data_files: file names whose common (atlas) events may be dropped.
1418        :param drop_common_events: whether to drop events classified as common cells.
1419        :return: the loaded EventArray.
1420        """
1421        if pyreadr is None:
1422            raise ModuleNotFoundError(
1423                "pyreadr not installed! Install pyreadr directly "
1424                "or install with `pip install csi-images[rds]` to resolve."
1425            )
1426        # Check if the input path is a directory or a file
1427        if os.path.isfile(input_path):
1428            data_files = [os.path.basename(input_path)]
1429            input_path = os.path.dirname(input_path)
1430        elif event_type == "cells":
1431            data_files = cell_data_files
1432        elif event_type == "others":
1433            data_files = others_data_files
1434        else:
1435            raise ValueError("Invalid event type.")
1436
1437        # Load the data from the OCULAR files
1438        file_data = {}
1439        for file in data_files:
1440            file_path = os.path.join(input_path, file)
1441            if not os.path.isfile(file_path):
1442                warnings.warn(f"{file} not found in {input_path}")
1443                continue
1444            file_data[file] = pyreadr.read_r(file_path)
1445            # Get the DataFrame associated with None (pyreadr dict quirk)
1446            file_data[file] = file_data[file][None]
1447            if len(file_data[file]) == 0:
1448                # File gets dropped from the dict
1449                file_data.pop(file)
1450                warnings.warn(f"{file} has no cells")
1451                continue
1452
1453            # Drop common cells if requested and in this file
1454            if (
1455                file in atlas_data_files
1456                and drop_common_events
1457                and "catalogue_classification" in file_data[file]
1458            ):
1459                common_cell_indices = (
1460                    file_data[file]["catalogue_classification"] == "common_cell"
1461                )
1462                file_data[file] = file_data[file][~common_cell_indices]
1463
1464            if len(file_data[file]) == 0:
1465                # File gets dropped from the dict
1466                file_data.pop(file)
1467                warnings.warn(f"{file} has no cells after dropping common cells")
1468                continue
1469
1470            # Extract frame_id and cell_id
1471            # DAPI- events already have frame_id cell_id outside rowname
1472            if event_type == "cells" and "frame_id" not in file_data[file].columns:
1473                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
1474                # get frame_id cell_id from rownames column and split into two columns
1475                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
1476                if len(split_res.columns) != 2:
1477                    warnings.warn(
1478                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
1479                    )
1480                # then assign it back to the dataframe
1481                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
1482            # Ensure frame_id and cell_id are integers
1483            file_data[file]["frame_id"] = file_data[file]["frame_id"].astype("int")
1484            file_data[file]["cell_id"] = file_data[file]["cell_id"].astype("int")
1485            # reset indexes since they can cause NaN values in concat
1486            file_data[file] = file_data[file].reset_index(drop=True)
1487
1488        # Merge the data from all files
1489        if len(file_data) == 0:
1490            return EventArray()
1491        elif len(file_data) == 1:
1492            data = [file_data[file] for file in file_data.keys()][0]
1493        else:
1494            data = pd.concat(file_data.values())
1495
1496        # Others is missing the "slide_id". Insert it right before "frame_id" column
1497        if event_type == "others" and "slide_id" not in data.columns:
1498            if os.path.basename(input_path) == "ocular":
1499                slide_id = os.path.basename(os.path.dirname(input_path))
1500            else:
1501                slide_id = "UNKNOWN"
1502            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)
1503
1504        # Sort according to ascending cell_id to keep the original, which is in manual_df
1505        data = data.sort_values(by=["cell_id"], ascending=True)
1506        # Filter out duplicates by x & y
1507        data = data.assign(
1508            unique_id=data["slide_id"]
1509            + "_"
1510            + data["frame_id"].astype(str)
1511            + "_"
1512            + data["cellx"].astype(int).astype(str)
1513            + "_"
1514            + data["celly"].astype(int).astype(str)
1515        )
1516        data = data.drop_duplicates(subset=["unique_id"], keep="first")
1517        # Normal unique_id is with cell_id
1518        data = data.assign(
1519            unique_id=data["slide_id"]
1520            + "_"
1521            + data["frame_id"].astype(str)
1522            + "_"
1523            + data["cell_id"].astype(str)
1524        )
1525        data = data.reset_index(drop=True)
1526        # All columns up to "slide_id" are features; drop the "slide_id"
1527        features = data.loc[:, :"slide_id"].iloc[:, :-1]
1528        data = data.loc[:, "slide_id":]
1529        # Grab the info columns
1530        info = data[["slide_id", "frame_id", "cellx", "celly"]]
1531        info.columns = ["slide_id", "tile", "x", "y"]
1532        info = info.assign(roi=0)  # OCULAR only works on 1 ROI, as far as we know
1533        info = info[["slide_id", "tile", "roi", "x", "y"]]
1534        # Metadata has duplicate columns for later convenience
1535        metadata = data
1536        # Certain columns tend to be problematic with mixed data formats...
1537        for col in ["TRITC", "CY5", "FITC"]:
1538            if col in metadata:
1539                labels = {
1540                    "False": False,
1541                    "True": True,
1542                    "FALSE": False,
1543                    "TRUE": True,
1544                    False: False,
1545                    True: True,
1546                }
1547                metadata[col] = metadata[col].map(labels).astype(bool)
1548        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
1549            if col in metadata:
1550                metadata[col] = metadata[col].fillna(-1).astype(int)
1551        info["tile"] = info["tile"] - 1  # Convert to 0-based indexing
1552        return EventArray(info, metadata, features)
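
For reference, a minimal usage sketch of the save/load round-trips above (a hedged example, not part of the library source): it assumes an existing EventArray named events and the import path csi_images.csi_events.

from csi_images.csi_events import EventArray  # assumed import path

# `events` is an existing EventArray with info/metadata/features populated
events.save_csv("events.csv")                   # single flattened table
from_csv = EventArray.load_csv("events.csv")    # metadata_/features_ column prefixes by default

events.save_json("events.json")
from_json = EventArray.load_json("events.json")

events.save_hdf5("events.hdf5")                 # info, metadata, features stored separately
from_hdf5 = EventArray.load_hdf5("events.hdf5")

# OCULAR output can be loaded from a results folder (path is illustrative)
ocular_events = EventArray.load_ocular("/path/to/scan/ocular")
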
class Event:
 40class Event:
 41    """
 42    A class that represents a single event in a scan, making it easy to evaluate
 43    singular events. Required metadata is exposed as attributes, and optional
 44    metadata and features are stored as DataFrames.
 45    """
 46
 47    SCAN_TO_SLIDE_TRANSFORM = {
 48        # Axioscan zero is in the top-right corner instead of top-left
 49        Scan.Type.AXIOSCAN7: np.array(
 50            [
 51                [1, 0, 75000],
 52                [0, 1, 0],
 53                [0, 0, 1],
 54            ]
 55        ),
 56        # BZScanner coordinates are a special kind of messed up:
 57        # - The slide is upside-down.
 58        # - The slide is oriented vertically, with the barcode at the bottom.
 59        # - Tiles are numbered from the top-right
 60        Scan.Type.BZSCANNER: np.array(
 61            [
 62                [0, -1, 75000],
 63                [-1, 0, 25000],
 64                [0, 0, 1],
 65            ]
 66        ),
 67    }
 68    """
 69    Homogeneous transformation matrices for converting between scanner and slide
 70    coordinates. The matrices are 3x3, with the final column representing the
 71    translation in micrometers (um). For more information, see 
 72    [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations).
 73    
 74    Transformations are nominal, and accuracy is not guaranteed; this is due to 
 75    imperfections in slides and alignment in the scanners. Units are in micrometers.
 76    """
 77
 78    def __init__(
 79        self,
 80        tile: Tile,
 81        x: int,
 82        y: int,
 83        metadata: pd.Series = None,
 84        features: pd.Series = None,
 85    ):
 86        self.tile = tile
 87        self.x = int(x)
 88        self.y = int(y)
 89        self.metadata = metadata
 90        self.features = features
 91
 92    def __repr__(self) -> str:
 93        return f"{self.tile}-{self.x}-{self.y}"
 94
 95    def __eq__(self, other) -> bool:
 96        return self.__repr__() == other.__repr__()
 97
 98    def __lt__(self, other):
 99        return self.__repr__() < other.__repr__()
100
101    def get_scan_position(self) -> tuple[float, float]:
102        """
103        Get the position of the event in the scanner's coordinate frame.
104        :return: the scan position of the event in micrometers (um).
105        """
106        # Get overall pixel position
107        real_tile_height, real_tile_width = self.tile.scan.get_image_size()
108        pixel_x = self.x + (real_tile_width * self.tile.x)
109        pixel_y = self.y + (real_tile_height * self.tile.y)
110        # Convert to micrometers
111        x_um = pixel_x * self.tile.scan.pixel_size_um
112        y_um = pixel_y * self.tile.scan.pixel_size_um
113        # Add the scan's origin in the scanner frame
114        x_um += self.tile.scan.roi[self.tile.n_roi].origin_x_um
115        y_um += self.tile.scan.roi[self.tile.n_roi].origin_y_um
116        return x_um, y_um
117
118    def get_slide_position(self) -> tuple[float, float]:
119        """
120        Get the slide position of the event in micrometers (um).
121        :return: the slide position of the event.
122        """
123        # Turn scan_position into a 3x1 vector
124        scan_position = self.get_scan_position()
125        scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]])
126
127        # Multiply by the appropriate homogeneous matrix
128        if self.tile.scan.scanner_id.startswith(self.tile.scan.Type.AXIOSCAN7.value):
129            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.tile.scan.Type.AXIOSCAN7]
130        elif self.tile.scan.scanner_id.startswith(self.tile.scan.Type.BZSCANNER.value):
131            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.tile.scan.Type.BZSCANNER]
132        else:
133            raise ValueError(f"Scanner type {self.tile.scan.scanner_id} not supported.")
134        slide_position = np.matmul(transform, scan_position)
135        return float(slide_position[0][0]), float(slide_position[1][0])
136
137    def crop(
138        self, images: Iterable[np.ndarray], crop_size: int = 100, in_pixels: bool = True
139    ) -> list[np.ndarray]:
140        """
141        Crop the event from the provided frame images. Use if you have already gotten
142        frame images; useful for cropping multiple events from the same frame image.
143        :param images: the frame images.
144        :param crop_size: the square size of the image crop to get for this event.
145        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
146        :return: crop_size x crop_size crops of the event in the provided frames. If
147        the event is too close to the edge, the crop is zero-padded and the event will not be centered.
148        """
149        # Convert a crop size in micrometers to pixels
150        if not in_pixels:
151            crop_size = round(crop_size / self.tile.scan.pixel_size_um)
152        image_height, image_width = 0, 0
153        for image in images:
154            if image_height == 0 and image_width == 0:
155                image_height, image_width = image.shape
156            else:
157                if image_height != image.shape[0] or image_width != image.shape[1]:
158                    raise ValueError("All images must be the same size")
159        if image_height == 0 or image_width == 0:
160            raise ValueError("No images provided")
161
162        # Find the crop bounds
163        bounds = [
164            self.x - (crop_size // 2) + 1,
165            self.y - (crop_size // 2) + 1,
166            self.x + math.ceil(crop_size / 2) + 1,
167            self.y + math.ceil(crop_size / 2) + 1,
168        ]
169        # Determine how much the bounds violate the image size
170        displacements = [
171            max(0, -bounds[0]),
172            max(0, -bounds[1]),
173            max(0, bounds[2] - image_width),
174            max(0, bounds[3] - image_height),
175        ]
176        # Cap off the bounds
177        bounds = [
178            max(0, bounds[0]),
179            max(0, bounds[1]),
180            min(image_width, bounds[2]),
181            min(image_height, bounds[3]),
182        ]
183
184        # Crop the images
185        crops = []
186        for image in images:
187            # Create a blank image of the right size
188            crop = np.zeros((crop_size, crop_size), dtype=image.dtype)
189
190            # Insert the cropped image into the blank image, leaving a black buffer
191            # around the edges if the crop would go beyond the original image bounds
192            crop[
193                displacements[1] : crop_size - displacements[3],
194                displacements[0] : crop_size - displacements[2],
195            ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]]
196            crops.append(crop)
197        return crops
198
199    def get_crops(
200        self,
201        crop_size: int = 100,
202        in_pixels: bool = True,
203        input_path: str = None,
204        channels: Iterable[int | str] = None,
205        apply_gain: bool | Iterable[bool] = True,
206    ) -> list[np.ndarray]:
207        """
208        Gets the frame images for this event and then crops the event from the images.
209        Convenient for retrieving a single event's crops, but less efficient when
210        retrieving multiple events from the same tile as it will reread the images.
211        :param crop_size: the square size of the image crop to get for this event.
212        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
213        :param input_path: the path to the input images. Defaults to None (uses the scan's path).
214        :param channels: the channels to extract images for. Defaults to all channels.
215        :param apply_gain: whether to apply scanner-calculated gain to the images, if
216        not already applied. If a list, matches the channels.
217        :return: a list of cropped images from the scan in the order of the channels.
218        """
219        # This function validates channels
220        frames = Frame.get_frames(self.tile, channels)
221        # Convert individual inputs to lists of appropriate length
222        if isinstance(apply_gain, bool):
223            apply_gain = [apply_gain] * len(frames)
224        images = [f.get_image(input_path, a) for f, a in zip(frames, apply_gain)]
225        return self.crop(images, crop_size, in_pixels)
226
227    def save_crops(
228        self,
229        crops: Sequence[np.ndarray],
230        output_path: str,
231        labels: Sequence[str],
232        ext: str = "auto",
233    ):
234        """
235        Save the crops to image files.
236        :param crops: the crops to save. Will save as RGB if 3 channel [h, w, 3] or
237        grayscale if 1 channel [h, w] or [h, w, 1].
238        :param labels: the labels to append to the file name, usually the channel names
239        associated with each crop.
240        :param output_path: the folder to save the crops to. Will make if needed.
241        :param ext: the file extension to save the crops as. Defaults to "auto", which
242        will save as .tif for grayscale images and .jpg for RGB images.
243        :return: None
244        """
245        if len(crops) != len(labels):
246            raise ValueError("Crops and labels must be the same length")
247
248        if csi_images is None or imageio is None:
249            raise ModuleNotFoundError(
250                "imageio libraries not installed! "
251                "run `pip install csi_images[imageio]` to resolve."
252            )
253
254        os.makedirs(output_path, exist_ok=True)
255
256        for crop, label in zip(crops, labels):
257            if ext == "auto":
258                if len(crop.shape) == 2 or crop.shape[2] == 1:
259                    file_extension = ".tif"
260                elif crop.shape[2] == 3:
261                    file_extension = ".jpg"
262                else:
263                    warnings.warn(
264                        f"Image shape {crop.shape} not recognized; saving as .tif"
265                    )
266                    file_extension = ".tif"
267            else:
268                file_extension = ext
269            file = os.path.join(output_path, f"{self}-{label}{file_extension}")
270            # TODO: add more file types here
271            if file_extension == ".tif":
272                imageio.imwrite(file, crop, compression="deflate")
273            elif file_extension in [".jpg", ".jpeg"]:
274                crop = csi_images.scale_bit_depth(crop, np.uint8)
275                imageio.imwrite(file, crop, quality=80)
276            else:
277                imageio.imwrite(file, crop)
278
279    def load_crops(
280        self, input_path: str, labels: list[str] = None
281    ) -> dict[str, np.ndarray]:
282        """
283        Loads previously saved crop files from a folder.
284        :param input_path: folder containing crop files.
285        :param labels: optional label filter, will only return crops with these labels.
286        :return: a dictionary mapping labels to the loaded crops.
287        """
288        crops = {}
289        for file in glob.glob(os.path.join(input_path, f"{self}-*")):
290            label = os.path.splitext(os.path.basename(file))[0].split("-")[-1]
291            # Skip if we have labels to target
292            if labels is not None and label not in labels:
293                continue
294            crops[label] = imageio.imread(file)
295        return crops
296
297    def get_montage_channels(
298        self,
299        channels: Sequence[int | str] | None = None,
300        composites: dict[int | str, tuple[float, float, float]] | None = None,
301    ) -> tuple[list[int], list[int], dict[int, tuple[float, float, float]]]:
302        """
303        Get the channel indices for the montage from the event's tile.
304        :param channels: channel indices or names for grayscale channels
305        :param composites: dictionary of channel indices or names and RGB values
306        :return: (1) channel indices to retrieve,
307                 (2) relative grayscale channel indices, and
308                 (3) composite channel indices and RGB values.
309        """
310        if channels is None:
311            channels = list(range(len(self.tile.scan.channels)))
312        if (len(channels) == 0) and (composites is None or len(composites) == 0):
313            raise ValueError("Must provide at least one channel type to montage")
314
315        channels_to_get = []
316
317        # Build the list of channels to retrieve
318        if channels is not None:
319            if isinstance(channels[0], str):
320                channels = self.tile.scan.get_channel_indices(channels)
321            channels_to_get += channels
322            order = list(range(len(channels)))  # Always the first n channels
323        else:
324            order = None
325
326        if composites is not None:
327            relative_composites = {}  # Relative indices for retrieved channels
328            # Convert to scan indices
329            rgb_channels = list(composites.keys())
330            if isinstance(rgb_channels[0], str):
331                rgb_channels = self.tile.scan.get_channel_indices(rgb_channels)
332            # Find the index or add to the end
333            for channel, rgb in zip(rgb_channels, composites.values()):
334                if channel not in channels_to_get:
335                    channels_to_get.append(channel)
336                    relative_composites[len(channels_to_get) - 1] = rgb
337                else:
338                    relative_composites[channels_to_get.index(channel)] = rgb
339        else:
340            relative_composites = None
341
342        return channels_to_get, order, relative_composites
343
344    def get_montage(
345        self,
346        channels: Sequence[int | str] = None,
347        composites: dict[int | str, tuple[float, float, float]] = None,
348        mask: np.ndarray[np.uint8] = None,
349        labels: Sequence[str] = None,
350        crop_size: int = 100,
351        in_pixels: bool = True,
352        input_path: str = None,
353        apply_gain: bool = True,
354        **kwargs,
355    ) -> np.ndarray:
356        """
357        Convenience function for getting frame images and creating a montage. Mirrors
358        csi_images.make_montage(). Convenient for a single event's montage, but less
359        efficient when retrieving multiple events from the same tile.
360        :param channels: the channels to use for black-and-white montages.
361        :param composites: dictionary of indices and RGB tuples for a composite.
362        :param mask: a mask to apply to the montage. Must be the same size as the crop.
363        :param labels: the labels to subtitle montage images, usually the channel names
364        :param crop_size: the square size of the image crop to get for this event.
365        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
366        :param input_path: the path to the input images. Defaults to None (uses the scan's path).
367        :param apply_gain: whether to apply scanner-calculated gain to the images, if
368        not already applied. If a list, matches the channels.
369        :param kwargs: montage options. See csi_images.make_montage() for more details.
370        :return: numpy array representing the montage.
371        """
372        channels, order, composites = self.get_montage_channels(channels, composites)
373        images = self.get_crops(crop_size, in_pixels, input_path, channels, apply_gain)
374        return csi_images.make_montage(
375            images, order, composites, mask, labels, **kwargs
376        )
377
378    def save_montage(
379        self,
380        montage: np.ndarray,
381        output_path: str,
382        ocular_names: bool = False,
383        tag: str = "",
384        file_extension: str = ".jpeg",
385        **kwargs,
386    ):
387        """
388        Save the montage as a JPEG image with a set name.
389        :param montage: the montage to save.
390        :param output_path: the folder to save the montage in. Will make if needed.
391        :param ocular_names: whether to use the OCULAR naming convention.
392        :param tag: a tag to append to the file name. Ignored if ocular_names is True.
393        :param file_extension: the file extension to save the montage as. Defaults to .jpeg.
394        :param kwargs: additional arguments to pass to imageio.imwrite().
395        :return: None
396        """
397        if csi_images is None or imageio is None:
398            raise ModuleNotFoundError(
399                "imageio libraries not installed! "
400                "run `pip install csi_images[imageio]` to resolve."
401            )
402
403        montage = csi_images.scale_bit_depth(montage, np.uint8)
404
405        if not file_extension.startswith("."):
406            file_extension = f".{file_extension}"
407
408        if ocular_names:
409            if "cell_id" not in self.metadata.index:
410                raise ValueError(
411                    "Event metadata must include 'cell_id' for OCULAR naming."
412                )
413            file = f"{self.tile.n}-{self.metadata['cell_id']}-{self.x}-{self.y}{file_extension}"
414        else:
415            file = f"{self}{tag}{file_extension}"
416
417        os.makedirs(output_path, exist_ok=True)
418        imageio.imwrite(os.path.join(output_path, file), montage, **kwargs)
419
420    def load_montage(self, input_path: str, tag: str = "") -> np.ndarray:
421        """
422        Loads the montage from a file saved by Event.save_montage.
423        :param input_path: the path to the folder where the montage was saved.
424        :param tag: a string to add to the file name, before the extension.
425        :return: the montage image as a numpy array.
426        """
427        file = f"{self}{tag}.jpeg"
428        return imageio.imread(os.path.join(input_path, file))
429
430    @classmethod
431    def get_many_crops(
432        cls,
433        events: Sequence[Self],
434        crop_size: int | Sequence[int] = 100,
435        in_pixels: bool = True,
436        input_path: str | Sequence[str] = None,
437        channels: Sequence[int | str] = None,
438        apply_gain: bool | Sequence[bool] = True,
439    ) -> list[list[np.ndarray]]:
440        """
441        Get the crops for a list of events, ensuring that there is no wasteful reading
442        of the same tile multiple times. This function is more efficient than calling
443        get_crops() for each event.
444        :param events: the events to get crops for.
445        :param crop_size: the square size of the image crop to get for each event.
446                          Defaults to 100.
447        :param in_pixels: whether the crop size is in pixels or micrometers.
448                          Defaults to pixels.
449        :param input_path: the path to the input images. Will only work for lists of events
450                           from the same scan. Defaults to None (uses the scan's path).
451        :param channels: the channels to extract images for. Defaults to all channels.
452        :param apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. Defaults to True.
453                           Can be supplied as a list to apply gain to individual channels.
454        :return: a list of lists of cropped images for each event.
455        """
456        if len(events) == 0:
457            return []
458        # Adapt singular inputs to lists of appropriate length
459        if isinstance(crop_size, int):
460            crop_size = [crop_size] * len(events)
461        if input_path is None or isinstance(input_path, str):
462            input_path = [input_path] * len(events)
463
464        # Get the order of the events when sorted by slide/tile
465        order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
466
467        # Allocate the list to size
468        crops = [[]] * len(events)
469        last_tile = None
470        images = None  # Holds large numpy arrays, so expensive to compare
471        # Iterate through in slide/tile sorted order
472        for i in order:
473            if last_tile != events[i].tile:
474                # Gather the frame images, preserving them for the next event
475                frames = Frame.get_frames(events[i].tile, channels)
476                if isinstance(apply_gain, bool):
477                    apply = [apply_gain] * len(frames)
478                else:
479                    apply = apply_gain
480                images = [f.get_image(input_path[i], a) for f, a in zip(frames, apply)]
481                last_tile = events[i].tile
482            # Use the frame images to crop the event images
483            crops[i] = events[i].crop(images, crop_size[i], in_pixels)
484        return crops
485
486    @classmethod
487    def get_many_montages(
488        cls,
489        events: Sequence[Self],
490        channels: Sequence[int | str] = None,
491        composites: dict[int | str, tuple[float, float, float]] = None,
492        masks: Sequence[np.ndarray[np.uint8]] = None,
493        labels: Sequence[str] = None,
494        crop_size: int = 100,
495        in_pixels: bool = True,
496        input_path: str = None,
497        apply_gain: bool | Iterable[bool] = True,
498        **kwargs,
499    ) -> list[np.ndarray]:
500        """
501        Convenience function for get_montage(), but for a list of events. More efficient
502        than get_montage() when working with multiple events from the same tile.
503        :param events: a list of Event objects.
504        :param channels: the channels to extract images for. Defaults to all channels.
505        :param composites: dictionary of indices and RGB tuples for a composite.
506        :param masks: a list of masks to apply to the montages. Must be the same size as the crops.
507        :param labels: the labels to subtitle montage images, usually the channel names
508        :param crop_size: the square size of the image crop to get for this event.
509        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
510        :param input_path: the path to the input images. Defaults to None (uses the scan's path).
511        :param apply_gain: whether to apply scanner-calculated gain to the images, if
512        not already applied. If a list, matches the channels.
513        :param kwargs: montage options. See csi_images.make_montage() for more details.
514        :return: a list of numpy arrays representing the montages.
515        """
516        if len(events) == 0:
517            return []
518        # Adapt singular inputs to lists of appropriate length
519        if isinstance(crop_size, int):
520            crop_size = [crop_size] * len(events)
521        if input_path is None or isinstance(input_path, str):
522            input_path = [input_path] * len(events)
523        if masks is None or isinstance(masks, np.ndarray):
524            masks = [masks] * len(events)
525
526        # Get the order of the events when sorted by slide/tile
527        event_order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
528
529        # Allocate the list to size
530        montages = [np.empty(0)] * len(events)
531        # Placeholder variables to avoid rereading the same tile
532        images = None  # Holds large numpy arrays, so expensive to compare
533        order = None
534        rel_composites = None
535        last_tile = None
536        # Iterate through in slide/tile sorted order
537        for i in event_order:
538            if last_tile != events[i].tile:
539                channels_to_get, order, rel_composites = events[i].get_montage_channels(
540                    channels, composites
541                )
542                # Gather the frame images, preserving them for the next event
543                frames = Frame.get_frames(events[i].tile, channels_to_get)
544                if isinstance(apply_gain, bool):
545                    apply = [apply_gain] * len(frames)
546                else:
547                    apply = apply_gain
548                images = [f.get_image(input_path[i], a) for f, a in zip(frames, apply)]
549                last_tile = events[i].tile
550            # Use the frame images to crop the event images and make montages
551            crops = events[i].crop(images, crop_size[i], in_pixels)
552            montages[i] = csi_images.make_montage(
553                crops, order, rel_composites, masks[i], labels, **kwargs
554            )
555
556        return montages
557
558    @classmethod
559    def get_and_save_many_crops(
560        cls,
561        events: list[Self],
562        output_path: str,
563        labels: Sequence[str],
564        ext: str = "auto",
565        additional_gain: Sequence[float] = None,
566        **kwargs,
567    ) -> None:
568        """
569        Get and save the crops for a list of events, ensuring that there is no wasteful
570        reading and limiting the image data in memory to 1 tile at a time. This function
571        is more efficient than chaining get_crops() and save_crops() for each event or
572        get_many_crops() and then save_crops().
573        :param events: list of events to get, crop, and save.
574        :param output_path: the folder to save the crops in. Will make if needed.
575        :param labels: the labels to save the crops with. See save_crops().
576        :param ext: the file extension to save the crops as. See save_crops().
577        :param additional_gain: additional gain to apply to the crops. If not None, must
578        match the number of crop channels.
579        :param kwargs: see get_many_crops() for more parameters.
580        :return:
581        """
582        unique_tiles = set([event.tile for event in events])
583
584        for tile in unique_tiles:
585            # Get one tile's worth of event crops
586            tile_events = [e for e in events if e.tile == tile]
587            crops_list = cls.get_many_crops(tile_events, **kwargs)
588            for event, crops in zip(tile_events, crops_list):
589                # Apply any additional gains
590                if additional_gain is not None:
591                    crops = [gain * crop for gain, crop in zip(additional_gain, crops)]
592                event.save_crops(crops, output_path, labels, ext)
593
594    @classmethod
595    def get_and_save_many_montages(
596        cls,
597        events: list[Self],
598        output_path: str,
599        ocular_names: bool = False,
600        tag: str = "",
601        **kwargs,
602    ) -> None:
603        """
604        Save montages of the events to image files.
605        :param events: the events to get, montage, and save.
606        :param output_path: the folder to save the montages to. Will make if needed.
607        :param ocular_names: whether to use the OCULAR naming convention.
608        :param tag: a tag to append to the file name. Ignored if ocular_names is True.
609        :param kwargs: see get_many_montages() for more parameters.
610        """
611        unique_tiles = set([event.tile for event in events])
612
613        for tile in unique_tiles:
614            # Get one tile's worth of event crops
615            tile_events = [e for e in events if e.tile == tile]
616            montages = cls.get_many_montages(tile_events, **kwargs)
617            for event, montage in zip(tile_events, montages):
618                event.save_montage(montage, output_path, ocular_names, tag)

A class that represents a single event in a scan, making it easy to evaluate singular events. Required metadata is exposed as attributes, and optional metadata and features are stored as DataFrames.

Event( tile: csi_images.csi_tiles.Tile, x: int, y: int, metadata: pandas.core.series.Series = None, features: pandas.core.series.Series = None)
78    def __init__(
79        self,
80        tile: Tile,
81        x: int,
82        y: int,
83        metadata: pd.Series = None,
84        features: pd.Series = None,
85    ):
86        self.tile = tile
87        self.x = int(x)
88        self.y = int(y)
89        self.metadata = metadata
90        self.features = features
SCAN_TO_SLIDE_TRANSFORM = {<Type.AXIOSCAN7: 'axioscan7'>: array([[ 1, 0, 75000], [ 0, 1, 0], [ 0, 0, 1]]), <Type.BZSCANNER: 'bzscanner'>: array([[ 0, -1, 75000], [ -1, 0, 25000], [ 0, 0, 1]])}

Homogeneous transformation matrices for converting between scanner and slide coordinates. The matrices are 3x3, with the final column representing the translation in micrometers (um). For more information, see affine transformations.

Transformations are nominal, and accuracy is not guaranteed; this is due to imperfections in slides and alignment in the scanners. Units are in micrometers.

tile
x
y
metadata
features
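
As a brief, hedged illustration (not from the library's docs), an Event is constructed from a Tile plus an in-frame pixel position; the metadata and features names below are made up.

import pandas as pd
from csi_images.csi_events import Event  # assumed import path

# `tile` is assumed to be a csi_images.csi_tiles.Tile already built for a scan
event = Event(
    tile,
    x=120,
    y=85,
    metadata=pd.Series({"cell_id": 3}),           # optional, illustrative
    features=pd.Series({"dapi_mean": 1820.5}),    # optional, illustrative
)
print(event)  # "<tile>-120-85"; this repr also names crop and montage files
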
def get_scan_position(self) -> tuple[float, float]:
101    def get_scan_position(self) -> tuple[float, float]:
102        """
103        Get the position of the event in the scanner's coordinate frame.
104        :return: the scan position of the event in micrometers (um).
105        """
106        # Get overall pixel position
107        real_tile_height, real_tile_width = self.tile.scan.get_image_size()
108        pixel_x = self.x + (real_tile_width * self.tile.x)
109        pixel_y = self.y + (real_tile_height * self.tile.y)
110        # Convert to micrometers
111        x_um = pixel_x * self.tile.scan.pixel_size_um
112        y_um = pixel_y * self.tile.scan.pixel_size_um
113        # Add the scan's origin in the scanner frame
114        x_um += self.tile.scan.roi[self.tile.n_roi].origin_x_um
115        y_um += self.tile.scan.roi[self.tile.n_roi].origin_y_um
116        return x_um, y_um

Get the position of the event in the scanner's coordinate frame.

Returns

the scan position of the event in micrometers (um).
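
As a worked example of the conversion above (all values illustrative): with a 0.325 um pixel size, a 2048-pixel-wide tile at tile.x = 3, an event at x = 120, and an ROI x origin of 1000 um:

pixel_x = 120 + 2048 * 3    # 6264 px from the scan origin
x_um = pixel_x * 0.325      # 2035.8 um
x_um += 1000.0              # 3035.8 um in the scanner frame
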

def get_slide_position(self) -> tuple[float, float]:
118    def get_slide_position(self) -> tuple[float, float]:
119        """
120        Get the slide position of the event in micrometers (um).
121        :return: the slide position of the event.
122        """
123        # Turn scan_position into a 3x1 vector
124        scan_position = self.get_scan_position()
125        scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]])
126
127        # Multiply by the appropriate homogeneous matrix
128        if self.tile.scan.scanner_id.startswith(self.tile.scan.Type.AXIOSCAN7.value):
129            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.tile.scan.Type.AXIOSCAN7]
130        elif self.tile.scan.scanner_id.startswith(self.tile.scan.Type.BZSCANNER.value):
131            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.tile.scan.Type.BZSCANNER]
132        else:
133            raise ValueError(f"Scanner type {self.tile.scan.scanner_id} not supported.")
134        slide_position = np.matmul(transform, scan_position)
135        return float(slide_position[0][0]), float(slide_position[1][0])

Get the slide position of the event in micrometers (um).

Returns

the slide position of the event.
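
The slide position is simply the scan position pushed through the scanner's SCAN_TO_SLIDE_TRANSFORM. A hand-applied sketch with the BZScanner matrix shown earlier (scan position values are illustrative):

import numpy as np

scan_position = np.array([[10000.0], [20000.0], [1.0]])  # homogeneous scan position (um)
transform = np.array([
    [0, -1, 75000],
    [-1, 0, 25000],
    [0, 0, 1],
])
slide_position = transform @ scan_position  # x = 55000 um, y = 15000 um
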

def crop( self, images: Iterable[numpy.ndarray], crop_size: int = 100, in_pixels: bool = True) -> list[numpy.ndarray]:
137    def crop(
138        self, images: Iterable[np.ndarray], crop_size: int = 100, in_pixels: bool = True
139    ) -> list[np.ndarray]:
140        """
141        Crop the event from the provided frame images. Use if you have already gotten
142        frame images; useful for cropping multiple events from the same frame image.
143        :param images: the frame images.
144        :param crop_size: the square size of the image crop to get for this event.
145        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
146        :return: crop_size x crop_size crops of the event in the provided frames. If
147        the event is too close to the edge, the crop is zero-padded and the event will not be centered.
148        """
149        # Convert a crop size in micrometers to pixels
150        if not in_pixels:
151            crop_size = round(crop_size / self.tile.scan.pixel_size_um)
152        image_height, image_width = 0, 0
153        for image in images:
154            if image_height == 0 and image_width == 0:
155                image_height, image_width = image.shape
156            else:
157                if image_height != image.shape[0] or image_width != image.shape[1]:
158                    raise ValueError("All images must be the same size")
159        if image_height == 0 or image_width == 0:
160            raise ValueError("No images provided")
161
162        # Find the crop bounds
163        bounds = [
164            self.x - (crop_size // 2) + 1,
165            self.y - (crop_size // 2) + 1,
166            self.x + math.ceil(crop_size / 2) + 1,
167            self.y + math.ceil(crop_size / 2) + 1,
168        ]
169        # Determine how much the bounds violate the image size
170        displacements = [
171            max(0, -bounds[0]),
172            max(0, -bounds[1]),
173            max(0, bounds[2] - image_width),
174            max(0, bounds[3] - image_height),
175        ]
176        # Cap off the bounds
177        bounds = [
178            max(0, bounds[0]),
179            max(0, bounds[1]),
180            min(image_width, bounds[2]),
181            min(image_height, bounds[3]),
182        ]
183
184        # Crop the images
185        crops = []
186        for image in images:
187            # Create a blank image of the right size
188            crop = np.zeros((crop_size, crop_size), dtype=image.dtype)
189
190            # Insert the cropped image into the blank image, leaving a black buffer
191            # around the edges if the crop would go beyond the original image bounds
192            crop[
193                displacements[1] : crop_size - displacements[3],
194                displacements[0] : crop_size - displacements[2],
195            ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]]
196            crops.append(crop)
197        return crops

Crop the event from the provided frame images. Use if you have already gotten frame images; useful for cropping multiple events from the same frame image.

Parameters
  • images: the frame images.
  • crop_size: the square size of the image crop to get for this event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns

crop_size x crop_size crops of the event in the provided frames. If the event is too close to the edge, the crop is zero-padded and the event will not be centered.
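
A minimal sketch of using crop() to cut several events out of one tile's images without rereading them. It assumes the events all share a tile and mirrors the Frame calls used by get_crops() below (get_image(None, True) means default path, gain applied):

from csi_images.csi_frames import Frame

frames = Frame.get_frames(events[0].tile, None)               # None -> all channels
images = [frame.get_image(None, True) for frame in frames]    # read each channel once
crops_per_event = [event.crop(images, crop_size=50) for event in events]
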

def get_crops( self, crop_size: int = 100, in_pixels: bool = True, input_path: str = None, channels: Iterable[int | str] = None, apply_gain: Union[bool, Iterable[bool]] = True) -> list[numpy.ndarray]:
199    def get_crops(
200        self,
201        crop_size: int = 100,
202        in_pixels: bool = True,
203        input_path: str = None,
204        channels: Iterable[int | str] = None,
205        apply_gain: bool | Iterable[bool] = True,
206    ) -> list[np.ndarray]:
207        """
208        Gets the frame images for this event and then crops the event from the images.
209        Convenient for retrieving a single event's crops, but less efficient when
210        retrieving multiple events from the same tile as it will reread the images.
211        :param crop_size: the square size of the image crop to get for this event.
212        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
213        :param input_path: the path to the input images. Defaults to None (uses the scan's path).
214        :param channels: the channels to extract images for. Defaults to all channels.
215        :param apply_gain: whether to apply scanner-calculated gain to the images, if
216        not already applied. If a list, matches the channels.
217        :return: a list of cropped images from the scan in the order of the channels.
218        """
219        # This function validates channels
220        frames = Frame.get_frames(self.tile, channels)
221        # Convert individual inputs to lists of appropriate length
222        if isinstance(apply_gain, bool):
223            apply_gain = [apply_gain] * len(frames)
224        images = [f.get_image(input_path, a) for f, a in zip(frames, apply_gain)]
225        return self.crop(images, crop_size, in_pixels)

Gets the frame images for this event and then crops the event from the images. Convenient for retrieving a single event's crops, but less efficient when retrieving multiple events from the same tile as it will reread the images.

Parameters
  • crop_size: the square size of the image crop to get for this event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
  • input_path: the path to the input images. Defaults to None (uses the scan's path).
  • channels: the channels to extract images for. Defaults to all channels.
  • apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. If a list, matches the channels.
Returns

a list of cropped images from the scan in the order of the channels.
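
A short, hedged sketch for a single event (the channel names are illustrative and must exist in the scan):

crops = event.get_crops(crop_size=100, channels=["DAPI", "CY5"])
dapi_crop, cy5_crop = crops  # returned in the order of the requested channels
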

def save_crops( self, crops: Sequence[numpy.ndarray], output_path: str, labels: Sequence[str], ext: str = 'auto'):
227    def save_crops(
228        self,
229        crops: Sequence[np.ndarray],
230        output_path: str,
231        labels: Sequence[str],
232        ext: str = "auto",
233    ):
234        """
235        Save the crops to image files.
236        :param crops: the crops to save. Will save as RGB if 3 channel [h, w, 3] or
237        grayscale if 1 channel [h, w] or [h, w, 1].
238        :param labels: the labels to append to the file name, usually the channel names
239        associated with each crop.
240        :param output_path: the folder to save the crops to. Will make if needed.
241        :param ext: the file extension to save the crops as. Defaults to "auto", which
242        will save as .tif for grayscale images and .jpg for RGB images.
243        :return: None
244        """
245        if len(crops) != len(labels):
246            raise ValueError("Crops and labels must be the same length")
247
248        if csi_images is None or imageio is None:
249            raise ModuleNotFoundError(
250                "imageio libraries not installed! "
251                "run `pip install csi_images[imageio]` to resolve."
252            )
253
254        os.makedirs(output_path, exist_ok=True)
255
256        for crop, label in zip(crops, labels):
257            if ext == "auto":
258                if len(crop.shape) == 2 or crop.shape[2] == 1:
259                    file_extension = ".tif"
260                elif crop.shape[2] == 3:
261                    file_extension = ".jpg"
262                else:
263                    warnings.warn(
264                        f"Image shape {crop.shape} not recognized; saving as .tif"
265                    )
266                    file_extension = ".tif"
267            else:
268                file_extension = ext
269            file = os.path.join(output_path, f"{self}-{label}{file_extension}")
270            # TODO: add more file types here
271            if file_extension == ".tif":
272                imageio.imwrite(file, crop, compression="deflate")
273            elif file_extension in [".jpg", ".jpeg"]:
274                crop = csi_images.scale_bit_depth(crop, np.uint8)
275                imageio.imwrite(file, crop, quality=80)
276            else:
277                imageio.imwrite(file, crop)

Save the crops to image files.

Parameters
  • crops: the crops to save. Will save as RGB if 3 channel [h, w, 3] or grayscale if 1 channel [h, w] or [h, w, 1].
  • labels: the labels to append to the file name, usually the channel names associated with each crop.
  • output_path: the folder to save the crops to. Will make if needed.
  • ext: the file extension to save the crops as. Defaults to "auto", which will save as .tif for grayscale images and .jpg for RGB images.
Returns

None
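
Continuing the sketch above, the crops can be written out with one label per crop; file names are derived from the event's repr (paths and labels are illustrative):

event.save_crops(crops, output_path="crops", labels=["DAPI", "CY5"])
# with ext="auto", grayscale crops become e.g. "crops/<tile>-<x>-<y>-DAPI.tif"
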

def load_crops( self, input_path: str, labels: list[str] = None) -> dict[str, numpy.ndarray]:
279    def load_crops(
280        self, input_path: str, labels: list[str] = None
281    ) -> dict[str, np.ndarray]:
282        """
283        Loads previously saved crop files from a folder.
284        :param input_path: folder containing crop files.
285        :param labels: optional label filter, will only return crops with these labels.
286        :return: a dictionary mapping labels to the loaded crops.
287        """
288        crops = {}
289        for file in glob.glob(os.path.join(input_path, f"{self}-*")):
290            label = os.path.splitext(os.path.basename(file))[0].split("-")[-1]
291            # Skip if we have labels to target
292            if labels is not None and label not in labels:
293                continue
294            crops[label] = imageio.imread(file)
295        return crops

Loads previously saved crop files from a folder.

Parameters
  • input_path: folder containing crop files.
  • labels: optional label filter, will only return crops with these labels.
Returns

a dict mapping labels to crop images.
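
Continuing the sketch above, previously saved crops can be reloaded and filtered by label:

    # Reload only the DAPI crop saved by the save_crops() sketch above
    loaded = event.load_crops("crops_out", labels=["DAPI"])
    # loaded is a dict mapping label -> numpy array, e.g. {"DAPI": array(...)}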

def get_montage_channels( self, channels: Optional[Sequence[int | str]] = None, composites: dict[int | str, tuple[float, float, float]] | None = None) -> tuple[list[int], list[int], dict[int, tuple[float, float, float]]]:
297    def get_montage_channels(
298        self,
299        channels: Sequence[int | str] | None = None,
300        composites: dict[int | str, tuple[float, float, float]] | None = None,
301    ) -> tuple[list[int], list[int], dict[int, tuple[float, float, float]]]:
302        """
303        Resolve the channel indices for the montage from the event's tile.
304        :param channels: channel indices or names for grayscale channels
305        :param composites: dictionary of channel indices or names and RGB values
306        :return: (1) channel indices to retrieve,
307                 (2) relative grayscale channel indices, and
308                 (3) composite channel indices and RGB values.
309        """
310        if channels is None:
311            channels = list(range(len(self.tile.scan.channels)))
312        if (len(channels) == 0) and (composites is None or len(composites) == 0):
313            raise ValueError("Must provide at least one channel type to montage")
314
315        channels_to_get = []
316
317        # Build the list of channels to retrieve
318        if channels is not None:
319            if isinstance(channels[0], str):
320                channels = self.tile.scan.get_channel_indices(channels)
321            channels_to_get += channels
322            order = list(range(len(channels)))  # Always the first n channels
323        else:
324            order = None
325
326        if composites is not None:
327            relative_composites = {}  # Relative indices for retrieved channels
328            # Convert to scan indices
329            rgb_channels = list(composites.keys())
330            if isinstance(rgb_channels[0], str):
331                rgb_channels = self.tile.scan.get_channel_indices(rgb_channels)
332            # Find the index or add to the end
333            for channel, rgb in zip(rgb_channels, composites.values()):
334                if channel not in channels_to_get:
335                    channels_to_get.append(channel)
336                    relative_composites[channel] = rgb
337                else:
338                    relative_composites[channels_to_get.index(channel)] = rgb
339        else:
340            relative_composites = None
341
342        return channels_to_get, order, relative_composites

Resolve the channel indices for the montage from the event's tile.

Parameters
  • channels: channel indices or names for grayscale channels
  • composites: dictionary of channel indices or names and RGB values
Returns

(1) channel indices to retrieve, (2) relative grayscale channel indices, and (3) composite channel indices and RGB values.
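
A small sketch using integer channel indices, so no channel-name lookup against the scan is needed; the indices and color are arbitrary:

    # Two grayscale panels plus channel 2 rendered in red in the composite
    channels_to_get, order, rel_composites = event.get_montage_channels(
        channels=[0, 1],
        composites={2: (1.0, 0.0, 0.0)},
    )
    # channels_to_get == [0, 1, 2]; order == [0, 1]
    # rel_composites == {2: (1.0, 0.0, 0.0)}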

def get_montage( self, channels: Sequence[int | str] = None, composites: dict[int | str, tuple[float, float, float]] = None, mask: numpy.ndarray[numpy.uint8] = None, labels: Sequence[str] = None, crop_size: int = 100, in_pixels: bool = True, input_path: str = None, apply_gain: bool = True, **kwargs) -> numpy.ndarray:
344    def get_montage(
345        self,
346        channels: Sequence[int | str] = None,
347        composites: dict[int | str, tuple[float, float, float]] = None,
348        mask: np.ndarray[np.uint8] = None,
349        labels: Sequence[str] = None,
350        crop_size: int = 100,
351        in_pixels: bool = True,
352        input_path: str = None,
353        apply_gain: bool = True,
354        **kwargs,
355    ) -> np.ndarray:
356        """
357        Convenience function for getting frame images and creating a montage. Mirrors
358        csi_images.make_montage(). Convenient for a single event's montage, but less
359        efficient when used for multiple events from the same tile.
360        :param channels: the channels to use for black-and-white montages.
361        :param composites: dictionary of indices and RGB tuples for a composite.
362        :param mask: a mask to apply to the montage. Must be the same size as the crop.
363        :param labels: the labels to subtitle montage images, usually the channel names
364        :param crop_size: the square size of the image crop to get for this event.
365        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
366        :param input_path: the path to the input images. Defaults to None (uses the scan's path).
367        :param apply_gain: whether to apply scanner-calculated gain to the images, if
368        not already applied. If a list, matches the channels.
369        :param kwargs: montage options. See csi_images.make_montage() for more details.
370        :return: numpy array representing the montage.
371        """
372        channels, order, composites = self.get_montage_channels(channels, composites)
373        images = self.get_crops(crop_size, in_pixels, input_path, channels, apply_gain)
374        return csi_images.make_montage(
375            images, order, composites, mask, labels, **kwargs
376        )

Convenience function for getting frame images and creating a montage. Mirrors csi_images.make_montage(). Convenient for a single event's montage, but less efficient when used for multiple events from the same tile.

Parameters
  • channels: the channels to use for black-and-white montages.
  • composites: dictionary of indices and RGB tuples for a composite.
  • mask: a mask to apply to the montage. Must be the same size as the crop.
  • labels: the labels to subtitle montage images, usually the channel names
  • crop_size: the square size of the image crop to get for this event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
  • input_path: the path to the input images. Defaults to None (uses the scan's path).
  • apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. If a list, matches the channels.
  • kwargs: montage options. See csi_images.make_montage() for more details.
Returns

numpy array representing the montage.
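
A sketch of a single-event montage, assuming the event belongs to a scan whose frame images are on disk and that the channel names used here exist in that scan:

    montage = event.get_montage(
        channels=["DAPI", "TRITC"],           # grayscale panels
        composites={"CY5": (1.0, 0.0, 0.0)},  # CY5 shown in red
        crop_size=100,                        # 100 px square crop
        in_pixels=True,
    )
    # montage is a single numpy array, ready for display or save_montage()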

def save_montage( self, montage: numpy.ndarray, output_path: str, ocular_names: bool = False, tag: str = '', file_extension: str = '.jpeg', **kwargs):
378    def save_montage(
379        self,
380        montage: np.ndarray,
381        output_path: str,
382        ocular_names: bool = False,
383        tag: str = "",
384        file_extension: str = ".jpeg",
385        **kwargs,
386    ):
387        """
388        Save the montage as a JPEG image with a set name.
389        :param montage: the montage to save.
390        :param output_path: the folder to save the montage in. Will make if needed.
391        :param ocular_names: whether to use the OCULAR naming convention.
392        :param tag: a tag to append to the file name. Ignored if ocular_names is True.
393        :param file_extension: the file extension to save the montage as. Defaults to .jpeg.
394        :param kwargs: additional arguments to pass to imageio.imwrite().
395        :return: None
396        """
397        if csi_images is None or imageio is None:
398            raise ModuleNotFoundError(
399                "imageio libraries not installed! "
400                "run `pip install csi_images[imageio]` to resolve."
401            )
402
403        montage = csi_images.scale_bit_depth(montage, np.uint8)
404
405        if not file_extension.startswith("."):
406            file_extension = f".{file_extension}"
407
408        if ocular_names:
409            if "cell_id" not in self.metadata.index:
410                raise ValueError(
411                    "Event metadata must include 'cell_id' for OCULAR naming."
412                )
413            file = f"{self.tile.n}-{self.metadata['cell_id']}-{self.x}-{self.y}{file_extension}"
414        else:
415            file = f"{self}{tag}{file_extension}"
416
417        os.makedirs(output_path, exist_ok=True)
418        imageio.imwrite(os.path.join(output_path, file), montage, **kwargs)

Save the montage as a JPEG image with a set name.

Parameters
  • montage: the montage to save.
  • output_path: the folder to save the montage in. Will make if needed.
  • ocular_names: whether to use the OCULAR naming convention.
  • tag: a tag to append to the file name. Ignored if ocular_names is True.
  • file_extension: the file extension to save the montage as. Defaults to .jpeg.
  • kwargs: additional arguments to pass to imageio.imwrite().
Returns

None
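
Continuing the sketch above; the output folder and tag are arbitrary, and OCULAR naming would instead require a "cell_id" entry in event.metadata:

    # Writes montages_out/<event>-mont.jpeg
    event.save_montage(montage, "montages_out", tag="-mont")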

def load_montage(self, input_path: str, tag: str = '') -> numpy.ndarray:
420    def load_montage(self, input_path: str, tag: str = "") -> np.ndarray:
421        """
422        Loads the montage from a file saved by Event.save_montage.
423        :param input_path: the path to the folder where the montage was saved.
424        :param tag: a string to add to the file name, before the extension.
425        :return: numpy array representing the loaded montage.
426        """
427        file = f"{self}{tag}.jpeg"
428        return imageio.imread(os.path.join(input_path, file))

Loads the montage from a file saved by Event.save_montage.

Parameters
  • input_path: the path to the folder where the montage was saved.
  • tag: a string to add to the file name, before the extension.
Returns

numpy array representing the loaded montage.
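
A round-trip sketch: reload a montage saved by save_montage() with the same tag and the default .jpeg extension:

    montage = event.load_montage("montages_out", tag="-mont")
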
@classmethod
def get_many_crops( cls, events: Sequence[Self], crop_size: Union[int, Sequence[int]] = 100, in_pixels: bool = True, input_path: Union[str, Sequence[str]] = None, channels: Sequence[int | str] = None, apply_gain: Union[bool, Sequence[bool]] = True) -> list[list[numpy.ndarray]]:
430    @classmethod
431    def get_many_crops(
432        cls,
433        events: Sequence[Self],
434        crop_size: int | Sequence[int] = 100,
435        in_pixels: bool = True,
436        input_path: str | Sequence[str] = None,
437        channels: Sequence[int | str] = None,
438        apply_gain: bool | Sequence[bool] = True,
439    ) -> list[list[np.ndarray]]:
440        """
441        Get the crops for a list of events, ensuring that there is no wasteful reading
442        of the same tile multiple times. This function is more efficient than calling
443        get_crops() for each event.
444        :param events: the events to get crops for.
445        :param crop_size: the square size of the image crop to get for each event.
446                          Defaults to 100.
447        :param in_pixels: whether the crop size is in pixels or micrometers.
448                          Defaults to pixels, and is ignored if crop_size is None.
449        :param input_path: the path to the input images. Will only work for lists of events
450                           from the same scan. Defaults to None (uses the scan's path).
451        :param channels: the channels to extract images for. Defaults to all channels.
452        :param apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. Defaults to True.
453                           Can be supplied as a list to apply gain to individual channels.
454        :return: a list of lists of cropped images for each event.
455        """
456        if len(events) == 0:
457            return []
458        # Adapt singular inputs to lists of appropriate length
459        if isinstance(crop_size, int):
460            crop_size = [crop_size] * len(events)
461        if input_path is None or isinstance(input_path, str):
462            input_path = [input_path] * len(events)
463
464        # Get the order of the events when sorted by slide/tile
465        order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
466
467        # Allocate the list to size
468        crops = [[]] * len(events)
469        last_tile = None
470        images = None  # Holds large numpy arrays, so expensive to compare
471        # Iterate through in slide/tile sorted order
472        for i in order:
473            if last_tile != events[i].tile:
474                # Gather the frame images, preserving them for the next event
475                frames = Frame.get_frames(events[i].tile, channels)
476                if isinstance(apply_gain, bool):
477                    apply = [apply_gain] * len(frames)
478                else:
479                    apply = apply_gain
480                images = [f.get_image(input_path[i], a) for f, a in zip(frames, apply)]
481                last_tile = events[i].tile
482            # Use the frame images to crop the event images
483            crops[i] = events[i].crop(images, crop_size[i], in_pixels)
484        return crops

Get the crops for a list of events, ensuring that there is no wasteful reading of the same tile multiple times. This function is more efficient than calling get_crops() for each event.

Parameters
  • events: the events to get crops for.
  • crop_size: the square size of the image crop to get for each event. Defaults to 100.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels, and is ignored if crop_size is None.
  • input_path: the path to the input images. Will only work for lists of events from the same scan. Defaults to None (uses the scan's path).
  • channels: the channels to extract images for. Defaults to all channels.
  • apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. Defaults to True. Can be supplied as a list to apply gain to individual channels.
Returns

a list of lists of cropped images for each event.
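
A sketch for a batch of events, assuming events is a list of Event objects from scans whose images are on disk:

    crops_per_event = Event.get_many_crops(events, crop_size=100, in_pixels=True)
    # crops_per_event[i] is the list of per-channel crops for events[i];
    # events sharing a tile also share a single read of that tile's frames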

@classmethod
def get_many_montages( cls, events: Sequence[Self], channels: Sequence[int | str] = None, composites: dict[int | str, tuple[float, float, float]] = None, masks: Sequence[numpy.ndarray[numpy.uint8]] = None, labels: Sequence[str] = None, crop_size: int = 100, in_pixels: bool = True, input_path: str = None, apply_gain: Union[bool, Iterable[bool]] = True, **kwargs) -> list[numpy.ndarray]:
486    @classmethod
487    def get_many_montages(
488        cls,
489        events: Sequence[Self],
490        channels: Sequence[int | str] = None,
491        composites: dict[int | str, tuple[float, float, float]] = None,
492        masks: Sequence[np.ndarray[np.uint8]] = None,
493        labels: Sequence[str] = None,
494        crop_size: int = 100,
495        in_pixels: bool = True,
496        input_path: str = None,
497        apply_gain: bool | Iterable[bool] = True,
498        **kwargs,
499    ) -> list[np.ndarray]:
500        """
501        Convenience function for get_montage(), but for a list of events. More efficient
502        than get_montage() when working with multiple events from the same tile.
503        :param events: a list of Event objects.
504        :param channels: the channels to extract images for. Defaults to all channels.
505        :param composites: dictionary of indices and RGB tuples for a composite.
506        :param masks: a list of masks to apply to the montages. Must be the same size as the crops.
507        :param labels: the labels to subtitle montage images, usually the channel names
508        :param crop_size: the square size of the image crop to get for this event.
509        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
510        :param input_path: the path to the input images. Defaults to None (uses the scan's path).
511        :param apply_gain: whether to apply scanner-calculated gain to the images, if
512        not already applied. If a list, matches the channels.
513        :param kwargs: montage options. See csi_images.make_montage() for more details.
514        :return: a list of numpy arrays representing the montages.
515        """
516        if len(events) == 0:
517            return []
518        # Adapt singular inputs to lists of appropriate length
519        if isinstance(crop_size, int):
520            crop_size = [crop_size] * len(events)
521        if input_path is None or isinstance(input_path, str):
522            input_path = [input_path] * len(events)
523        if masks is None or isinstance(masks, np.ndarray):
524            masks = [masks] * len(events)
525
526        # Get the order of the events when sorted by slide/tile
527        event_order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
528
529        # Allocate the list to size
530        montages = [np.empty(0)] * len(events)
531        # Placeholder variables to avoid rereading the same tile
532        images = None  # Holds large numpy arrays, so expensive to compare
533        order = None
534        rel_composites = None
535        last_tile = None
536        # Iterate through in slide/tile sorted order
537        for i in event_order:
538            if last_tile != events[i].tile:
539                channels_to_get, order, rel_composites = events[i].get_montage_channels(
540                    channels, composites
541                )
542                # Gather the frame images, preserving them for the next event
543                frames = Frame.get_frames(events[i].tile, channels_to_get)
544                if isinstance(apply_gain, bool):
545                    apply = [apply_gain] * len(frames)
546                else:
547                    apply = apply_gain
548                images = [f.get_image(input_path[i], a) for f, a in zip(frames, apply)]
549                last_tile = events[i].tile
550            # Use the frame images to crop the event images and make montages
551            crops = events[i].crop(images, crop_size[i], in_pixels)
552            montages[i] = csi_images.make_montage(
553                crops, order, rel_composites, masks[i], labels, **kwargs
554            )
555
556        return montages

Convenience function for get_montage(), but for a list of events. More efficient than get_montage() when working with multiple events from the same tile.

Parameters
  • events: a list of Event objects.
  • channels: the channels to extract images for. Defaults to all channels.
  • composites: dictionary of indices and RGB tuples for a composite.
  • masks: a list of masks to apply to the montages. Must be the same size as the crops.
  • labels: the labels to subtitle montage images, usually the channel names
  • crop_size: the square size of the image crop to get for this event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
  • input_path: the path to the input images. Defaults to None (uses the scan's path).
  • apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. If a list, matches the channels.
  • kwargs: montage options. See csi_images.make_montage() for more details.
Returns

a list of numpy arrays representing the montages.
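
A sketch under the same assumptions as above; the channel names are placeholders:

    montages = Event.get_many_montages(
        events,
        channels=["DAPI", "TRITC"],
        composites={"CY5": (1.0, 0.0, 0.0)},
        crop_size=100,
    )
    # montages[i] is the montage for events[i]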

@classmethod
def get_and_save_many_crops( cls, events: list[typing.Self], output_path: str, labels: Sequence[str], ext: str = 'auto', additional_gain: Sequence[float] = None, **kwargs) -> None:
558    @classmethod
559    def get_and_save_many_crops(
560        cls,
561        events: list[Self],
562        output_path: str,
563        labels: Sequence[str],
564        ext: str = "auto",
565        additional_gain: Sequence[float] = None,
566        **kwargs,
567    ) -> None:
568        """
569        Get and save the crops for a list of events, ensuring that there is no wasteful
570        reading and limiting the image data in memory to 1 tile at a time. This function
571        is more efficient than chaining get_crops() and save_crops() for each event or
572        get_many_crops() and then save_crops().
573        :param events: list of events to get, crop, and save.
574        :param output_path: the folder to save the crops in. Will make if needed.
575        :param labels: the labels to save the crops with. See save_crops().
576        :param ext: the file extension to save the crops as. See save_crops().
577        :param additional_gain: additional gain to apply to the crops. If not None, must
578        match the length of the number of crop channels.
579        :param kwargs: see get_many_crops() for more parameters.
580        :return: None
581        """
582        unique_tiles = set([event.tile for event in events])
583
584        for tile in unique_tiles:
585            # Get one tile's worth of event crops
586            tile_events = [e for e in events if e.tile == tile]
587            crops_list = cls.get_many_crops(tile_events, **kwargs)
588            for event, crops in zip(tile_events, crops_list):
589                # Apply any additional gains
590                if additional_gain is not None:
591                    crops = [gain * crop for gain, crop in zip(additional_gain, crops)]
592                event.save_crops(crops, output_path, labels, ext)

Get and save the crops for a list of events, ensuring that there is no wasteful reading and limiting the image data in memory to 1 tile at a time. This function is more efficient than chaining get_crops() and save_crops() for each event or get_many_crops() and then save_crops().

Parameters
  • events: list of events to get, crop, and save.
  • output_path: the folder to save the crops in. Will make if needed.
  • labels: the labels to save the crops with. See save_crops().
  • ext: the file extension to save the crops as. See save_crops().
  • additional_gain: additional gain to apply to the crops. If not None, must match the length of the number of crop channels.
  • kwargs: see get_many_crops() for more parameters.
Returns

None
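
A sketch under the same assumptions; extra keyword arguments such as channels and crop_size are forwarded to get_many_crops(), and the labels below are placeholders matching the requested channels:

    Event.get_and_save_many_crops(
        events,
        output_path="crops_out",
        labels=["DAPI", "TRITC", "CY5"],    # one label per requested channel
        ext="auto",
        channels=["DAPI", "TRITC", "CY5"],  # forwarded to get_many_crops()
        crop_size=100,
    )
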
@classmethod
def get_and_save_many_montages( cls, events: list[typing.Self], output_path: str, ocular_names: bool = False, tag: str = '', **kwargs) -> None:
594    @classmethod
595    def get_and_save_many_montages(
596        cls,
597        events: list[Self],
598        output_path: str,
599        ocular_names: bool = False,
600        tag: str = "",
601        **kwargs,
602    ) -> None:
603        """
604        Save montages of the events to image files.
605        :param events: the events to get, montage, and save.
606        :param output_path: the folder to save the montages to. Will make if needed.
607        :param ocular_names: whether to use the OCULAR naming convention.
608        :param tag: a tag to append to the file name. Ignored if ocular_names is True.
609        :param kwargs: see get_many_montages() for more parameters.
610        """
611        unique_tiles = set([event.tile for event in events])
612
613        for tile in unique_tiles:
614            # Get one tile's worth of event crops
615            tile_events = [e for e in events if e.tile == tile]
616            montages = cls.get_many_montages(tile_events, **kwargs)
617            for event, montage in zip(tile_events, montages):
618                event.save_montage(montage, output_path, ocular_names, tag)

Save montages of the events to image files.

Parameters
  • events: the events to get, montage, and save.
  • output_path: the folder to save the montages to. Will make if needed.
  • ocular_names: whether to use the OCULAR naming convention.
  • tag: a tag to append to the file name. Ignored if ocular_names is True.
  • kwargs: see get_many_montages() for more parameters.
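
A sketch under the same assumptions; extra keyword arguments are forwarded to get_many_montages():

    Event.get_and_save_many_montages(
        events,
        output_path="montages_out",
        tag="-mont",
        channels=["DAPI", "TRITC"],           # placeholder channel names
        composites={"CY5": (1.0, 0.0, 0.0)},
    )
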
class EventArray:
 621class EventArray:
 622    """
 623    A class that holds a large number of events' data, making it easy to analyze and
 624    manipulate many events at once. A more separated version of the Event class.
 625    """
 626
 627    INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y"]
 628
 629    def __init__(
 630        self,
 631        info: pd.DataFrame = None,
 632        metadata: pd.DataFrame = None,
 633        features: pd.DataFrame = None,
 634    ):
 635
 636        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y"
 637        self.info = info
 638        if self.info is not None:
 639            # Special case: "roi" is often not required, so we'll fill in if it's missing
 640            if "roi" not in info.columns:
 641                self.info = self.info.assign(roi=0)
 642            if set(self.info.columns) != set(self.INFO_COLUMNS):
 643                raise ValueError(
 644                    f"EventArray.info must have columns:"
 645                    f"{self.INFO_COLUMNS}; had {list(self.info.columns)}"
 646                )
 647            # Ensure order and data types
 648            self.info = pd.DataFrame(
 649                {
 650                    "slide_id": self.info["slide_id"].astype(str),
 651                    "tile": self.info["tile"].astype(np.uint16),
 652                    "roi": self.info["roi"].astype(np.uint8),
 653                    "x": self.info["x"].round().astype(np.uint16),
 654                    "y": self.info["y"].round().astype(np.uint16),
 655                }
 656            )
 657
 658        # All DataFrames must all have the same number of rows
 659        if metadata is not None and (info is None or len(info) != len(metadata)):
 660            raise ValueError(
 661                "If EventArray.metadata is not None, it should match rows with .info"
 662            )
 663        if features is not None and (info is None or len(info) != len(features)):
 664            raise ValueError(
 665                "If EventArray.features is not None, it should match rows with .info"
 666            )
 667        # No columns named "metadata_", "features_", or "None"
 668        column_names = []
 669        if metadata is not None:
 670            column_names += metadata.columns.tolist()
 671        if features is not None:
 672            column_names += features.columns.tolist()
 673        if any([col.lower().startswith("metadata_") for col in column_names]):
 674            raise ValueError("EventArray column names cannot start with 'metadata_'")
 675        if any([col.lower().startswith("features_") for col in column_names]):
 676            raise ValueError("EventArray column names cannot start with 'features_'")
 677        if any([col.lower() == "none" for col in column_names]):
 678            raise ValueError("EventArray column names cannot be 'none'")
 679
 680        # Add metadata and features
 681        self.metadata = None
 682        self.features = None
 683        if metadata is not None:
 684            self.add_metadata(metadata)
 685        if features is not None:
 686            self.add_features(features)
 687
 688    def __len__(self) -> int:
 689        # Convenience method to get the number of events
 690        if self.info is None:
 691            return 0
 692        else:
 693            return len(self.info)
 694
 695    def __eq__(self, other):
 696        # Parse all possibilities for info
 697        if isinstance(self.info, pd.DataFrame):
 698            if isinstance(other.info, pd.DataFrame):
 699                if not self.info.equals(other.info):
 700                    return False
 701            else:
 702                return False
 703        elif self.info is None:
 704            if other.info is not None:
 705                return False
 706
 707        # Parse all possibilities for metadata
 708        if isinstance(self.metadata, pd.DataFrame):
 709            if isinstance(other.metadata, pd.DataFrame):
 710                is_equal = self.metadata.equals(other.metadata)
 711                if not is_equal:
 712                    return False
 713            else:
 714                return False
 715        elif self.metadata is None:
 716            if other.metadata is not None:
 717                return False
 718
 719        # Parse all possibilities for features
 720        if isinstance(self.features, pd.DataFrame):
 721            if isinstance(other.features, pd.DataFrame):
 722                is_equal = self.features.equals(other.features)
 723                if not is_equal:
 724                    return False
 725            else:
 726                return False
 727        elif self.features is None:
 728            if other.features is not None:
 729                return False
 730
 731        return True  # all compared fields matched
 732
 733    def get_sort_order(
 734        self, by: Hashable | Sequence[Hashable], ascending: bool | Sequence[bool] = True
 735    ):
 736        """
 737        Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.
 738        :param by: name of the column(s) to sort by.
 739        :param ascending: whether to sort in ascending order; can be a list to match by
 740        :return: the order of the indices to sort by.
 741        """
 742        columns = self.get(by)
 743        return columns.sort_values(by=by, ascending=ascending).index
 744
 745    def sort(
 746        self,
 747        by: Hashable | Sequence[Hashable],
 748        ascending: bool | Sequence[bool] = True,
 749    ) -> Self:
 750        """
 751        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
 752        :param by: name of the column(s) to sort by.
 753        :param ascending: whether to sort in ascending order; can be a list to match by
 754        :return: a new, sorted EventArray.
 755        """
 756        order = self.get_sort_order(by, ascending)
 757        info = self.info.loc[order].reset_index(drop=True)
 758        if self.metadata is not None:
 759            metadata = self.metadata.loc[order].reset_index(drop=True)
 760        else:
 761            metadata = None
 762        if self.features is not None:
 763            features = self.features.loc[order].reset_index(drop=True)
 764        else:
 765            features = None
 766        return EventArray(info, metadata, features)
 767
 768    def get(self, column_names: Hashable | Sequence[Hashable]) -> pd.DataFrame:
 769        """
 770        Get a DataFrame with the specified columns from the EventArray, by value.
 771        :param column_names: the names of the columns to get.
 772        :return: a DataFrame with the specified columns.
 773        """
 774        if isinstance(column_names, Hashable):
 775            column_names = [column_names]  # Drop into a list for the loop
 776        columns = []
 777        for column_name in column_names:
 778            if column_name in self.info.columns:
 779                columns.append(self.info[column_name])
 780            elif self.metadata is not None and column_name in self.metadata.columns:
 781                columns.append(self.metadata[column_name])
 782            elif self.features is not None and column_name in self.features.columns:
 783                columns.append(self.features[column_name])
 784            else:
 785                raise ValueError(f"Column {column_name} not found in EventArray")
 786        return pd.concat(columns, axis=1)
 787
 788    def rows(self, rows: Sequence[Hashable]) -> Self:
 789        """
 790        Get a subset of the EventArray rows based on a boolean or integer index, by value.
 791        :param rows: row labels, indices, or boolean mask; anything for .loc[]
 792        :return: a new EventArray with the subset of events.
 793        """
 794        info = self.info.loc[rows].reset_index(drop=True)
 795        if self.metadata is not None:
 796            metadata = self.metadata.loc[rows].reset_index(drop=True)
 797        else:
 798            metadata = None
 799        if self.features is not None:
 800            features = self.features.loc[rows].reset_index(drop=True)
 801        else:
 802            features = None
 803        return EventArray(info, metadata, features)
 804
 805    def copy(self) -> Self:
 806        """
 807        Create a deep copy of the EventArray.
 808        :return: a deep copy of the EventArray.
 809        """
 810        return EventArray(
 811            info=self.info.copy(),
 812            metadata=None if self.metadata is None else self.metadata.copy(),
 813            features=None if self.features is None else self.features.copy(),
 814        )
 815
 816    # TODO: add a "filter" convenience function that takes a column name and values to filter by
 817
 818    def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None:
 819        """
 820        Add metadata to the EventArray. Removes the need to check if metadata is None.
 821        Overwrites any existing metadata with the same column names as the new metadata.
 822        :param new_metadata: the metadata to add.
 823        """
 824        if self.info is None or len(self.info) != len(new_metadata):
 825            raise ValueError("New metadata must match length of existing info")
 826
 827        if isinstance(new_metadata, pd.Series):
 828            # Convert to a DataFrame
 829            new_metadata = pd.DataFrame(new_metadata)
 830
 831        for col in new_metadata.columns:
 832            if col in self.INFO_COLUMNS:
 833                warnings.warn(
 834                    f"Column name {col} is reserved for info; you can only "
 835                    "access this column through the .metadata attribute"
 836                )
 837            elif self.features is not None and col in self.features.columns:
 838                warnings.warn(
 839                    f"Column name {col} also exists in the .features attribute; "
 840                    f"calling this.get({col}) will return the .metadata column"
 841                )
 842
 843        if self.metadata is None:
 844            self.metadata = new_metadata
 845        else:
 846            self.metadata.loc[:, new_metadata.columns] = new_metadata
 847
 848    def add_features(self, new_features: pd.Series | pd.DataFrame) -> None:
 849        """
 850        Add features to the EventArray. Removes the need to check if features is None.
 851        Overwrites any existing features with the same column names as the new features.
 852        :param new_features: the features to add.
 853        """
 854        if self.info is None or len(self.info) != len(new_features):
 855            raise ValueError("New features must match length of existing info")
 856
 857        if isinstance(new_features, pd.Series):
 858            # Convert to a DataFrame
 859            new_features = pd.DataFrame(new_features)
 860
 861        for col in new_features.columns:
 862            if col in self.INFO_COLUMNS:
 863                warnings.warn(
 864                    f"Column name {col} is reserved for info; you can only "
 865                    "access this column through the .features attribute"
 866                )
 867            elif self.metadata is not None and col in self.metadata.columns:
 868                warnings.warn(
 869                    f"Column name {col} already exists in the .metadata attribute;"
 870                    f"calling this.get({col}) will return the .metadata column"
 871                )
 872
 873        if self.features is None:
 874            self.features = new_features
 875        else:
 876            self.features.loc[:, new_features.columns] = new_features
 877
 878    @classmethod
 879    def merge(cls, events: Iterable[Self]) -> Self:
 880        """
 881        Combine EventArrays in a list into a single EventArray.
 882        :param events: the EventArrays to combine.
 883        """
 884        all_info = []
 885        all_metadata = []
 886        all_features = []
 887        for event_array in events:
 888            # Skip empty EventArrays
 889            if event_array.info is not None:
 890                all_info.append(event_array.info)
 891            if event_array.metadata is not None:
 892                all_metadata.append(event_array.metadata)
 893            if event_array.features is not None:
 894                all_features.append(event_array.features)
 895        if len(all_info) == 0:
 896            return EventArray()
 897        else:
 898            all_info = pd.concat(all_info, ignore_index=True)
 899        if len(all_metadata) == 0:
 900            all_metadata = None
 901        else:
 902            all_metadata = pd.concat(all_metadata, ignore_index=True)
 903        if len(all_features) == 0:
 904            all_features = None
 905        else:
 906            all_features = pd.concat(all_features, ignore_index=True)
 907
 908        return EventArray(all_info, all_metadata, all_features)
 909
 910    def to_events(
 911        self,
 912        scans: Scan | Iterable[Scan],
 913        ignore_missing_scans=True,
 914        ignore_metadata=False,
 915        ignore_features=False,
 916    ) -> list[Event]:
 917        """
 918        Get the events in the EventArray as a list of events. Returns [] if empty.
 919        :param scans: the scans that the events belong to, auto-matched by slide_id.
 920        Pass an empty iterable if you don't care about scan metadata (with ignore_missing_scans=True).
 921        :param ignore_missing_scans: whether to create blank scans for events without scans.
 922        :param ignore_metadata: whether to ignore metadata or not
 923        :param ignore_features: whether to ignore features or not
 924        :return: a list of Event objects.
 925        """
 926        if len(self) == 0:
 927            return []
 928        if isinstance(scans, Scan):
 929            scans = [scans]
 930        scans = {scan.slide_id: scan for scan in scans}
 931        events = []
 932        for i in range(len(self.info)):
 933            # Determine the associated scan
 934            slide_id = self.info["slide_id"][i]
 935            if slide_id not in scans:
 936                if ignore_missing_scans:
 937                    # Create a placeholder scan if the scan is missing
 938                    scan = Scan.make_placeholder(
 939                        slide_id,
 940                        self.info["tile"][i],
 941                        self.info["roi"][i],
 942                    )
 943                else:
 944                    raise ValueError(
 945                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
 946                    )
 947            else:
 948                scan = scans[slide_id]
 949
 950            # Prepare the metadata and features
 951            if ignore_metadata or self.metadata is None:
 952                metadata = None
 953            else:
 954                # This Series creation method is less efficient,
 955                # but required for preserving dtypes
 956                metadata = pd.Series(
 957                    {col: self.metadata.loc[i, col] for col in self.metadata.columns},
 958                    dtype=object,
 959                )
 960            if ignore_features or self.features is None:
 961                features = None
 962            else:
 963                features = pd.Series(
 964                    {col: self.features.loc[i, col] for col in self.features.columns},
 965                    dtype=object,
 966                )
 967            # Create the event and append it to the list
 968            events.append(
 969                Event(
 970                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
 971                    self.info["x"][i],
 972                    self.info["y"][i],
 973                    metadata=metadata,
 974                    features=features,
 975                )
 976            )
 977        return events
 978
 979    @classmethod
 980    def from_events(cls, events: Iterable[Event]) -> Self:
 981        """
 982        Create an EventArray from a list of events.
 983        :param events: the events to convert into an EventArray.
 984        """
 985        info = pd.DataFrame(
 986            {
 987                "slide_id": [event.tile.scan.slide_id for event in events],
 988                "tile": [event.tile.n for event in events],
 989                "roi": [event.tile.n_roi for event in events],
 990                "x": [event.x for event in events],
 991                "y": [event.y for event in events],
 992            }
 993        )
 994        metadata_list = [event.metadata for event in events]
 995        # Iterate through and ensure that all metadata is the same shape
 996        for metadata in metadata_list:
 997            if type(metadata) != type(metadata_list[0]):
 998                raise ValueError("All metadata must be the same type.")
 999            if metadata is not None and metadata.shape != metadata_list[0].shape:
1000                raise ValueError("All metadata must be the same shape.")
1001        if metadata_list[0] is None:
1002            metadata = None
1003        else:
1004            metadata = pd.DataFrame(metadata_list)
1005        features_list = [event.features for event in events]
1006        # Iterate through and ensure that all features are the same shape
1007        for features in features_list:
1008            if type(features) != type(features_list[0]):
1009                raise ValueError("All features must be the same type.")
1010            if features is not None and features.shape != features_list[0].shape:
1011                raise ValueError("All features must be the same shape.")
1012        if features_list[0] is None:
1013            features = None
1014        else:
1015            features = pd.DataFrame(features_list)
1016        return EventArray(info=info, metadata=metadata, features=features)
1017
1018    def to_dataframe(self) -> pd.DataFrame:
1019        """
1020        Convert all the data in the EventArray to a single DataFrame.
1021        :return: a DataFrame with all the data in the EventArray.
1022        """
1023        # Make a copy of the info DataFrame and prepend "info_" to the column names
1024        output = self.info.copy()
1025        # Combine with the metadata and prepend "metadata_" to the column names
1026        if self.metadata is not None:
1027            metadata = self.metadata.copy()
1028            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
1029            output = pd.concat([output, metadata], axis=1)
1030        # Combine with the features and prepend "features_" to the column names
1031        if self.features is not None:
1032            features = self.features.copy()
1033            features.columns = [f"features_{col}" for col in features.columns]
1034            output = pd.concat([output, features], axis=1)
1035        return output
1036
1037    @classmethod
1038    def from_dataframe(
1039        cls, df, metadata_prefix: str = "metadata_", features_prefix: str = "features_"
1040    ) -> Self:
1041        """
1042        From a single, special DataFrame, create an EventArray.
1043        :param df: the DataFrame to convert to an EventArray.
1044        :param metadata_prefix: the prefix for metadata columns.
1045        :param features_prefix: the prefix for features columns.
1046        :return: an EventArray reconstructed from the DataFrame.
1047        """
1048        # Split the columns into info, metadata, and features and strip prefix
1049        info = df[[col for col in df.columns if col in cls.INFO_COLUMNS]].copy()
1050        if info.size == 0:
1051            info = None
1052        metadata = df[[col for col in df.columns if col.startswith(metadata_prefix)]].copy()
1053        metadata.columns = [
1054            col.replace(metadata_prefix, "") for col in metadata.columns
1055        ]
1056        if metadata.size == 0:
1057            metadata = None
1058        features = df[[col for col in df.columns if col.startswith(features_prefix)]].copy()
1059        features.columns = [
1060            col.replace(features_prefix, "") for col in features.columns
1061        ]
1062        if features.size == 0:
1063            features = None
1064        return cls(info=info, metadata=metadata, features=features)
1065
1066    @classmethod
1067    def from_mask(
1068        cls,
1069        mask: np.ndarray,
1070        tile: Tile,
1071        include_cell_id: bool = True,
1072        images: list[np.ndarray] = None,
1073        image_labels: list[str] = None,
1074        properties: list[str] = None,
1075    ) -> Self:
1076        """
1077        Extract events from a mask DataFrame, including metadata and features.
1078        :param mask: the mask to extract events from.
1079        :param tile: the Tile object associated with this mask.
1080        :param include_cell_id: whether to include the cell_id, or numerical
1081        mask label, as metadata in the EventArray.
1082        :param images: the intensity images to extract features from.
1083        :param image_labels: the labels for the intensity images.
1084        :param properties: list of properties to extract in addition to the defaults.
1085        :return: EventArray corresponding to the mask labels.
1086        """
1087        if csi_images is None:
1088            raise ModuleNotFoundError(
1089                "imageio libraries not installed! "
1090                "run `pip install csi_images[imageio]` to resolve."
1091            )
1092        # Gather mask_info
1093        if images is not None and image_labels is not None:
1094            if len(images) != len(image_labels):
1095                raise ValueError("Intensity images and labels must match lengths.")
1096
1097        mask_info = csi_images.extract_mask_info(mask, images, image_labels, properties)
1098
1099        if len(mask_info) == 0:
1100            return EventArray()
1101
1102        # Combine provided info and mask info
1103        info = pd.DataFrame(
1104            {
1105                "slide_id": tile.scan.slide_id,
1106                "tile": tile.n,
1107                "roi": tile.n_roi,
1108                "x": mask_info["x"],
1109                "y": mask_info["y"],
1110            },
1111        )
1112        # Extract a metadata column if desired
1113        if include_cell_id:
1114            metadata = pd.DataFrame({"cell_id": mask_info["id"]})
1115        else:
1116            metadata = None
1117        # If any additional properties were extracted, add them as features
1118        mask_info = mask_info.drop(columns=["id", "x", "y"], errors="ignore")
1119        if len(mask_info.columns) > 0:
1120            features = mask_info
1121            features.columns = [col.lower() for col in features.columns]
1122        else:
1123            features = None
1124        return EventArray(info, metadata, features)
1125
1126    def save_csv(self, output_path: str) -> bool:
1127        """
1128        Save the events to a CSV file, including metadata and features.
1129        :param output_path: path to save the CSV file to; ".csv" is appended if missing.
1130        :return: True if the file was written successfully.
1131        """
1132        if not output_path.endswith(".csv"):
1133            output_path += ".csv"
1134        self.to_dataframe().to_csv(output_path, index=False)
1135        return os.path.exists(output_path)
1136
1137    @classmethod
1138    def load_csv(
1139        cls,
1140        input_path: str,
1141        metadata_prefix: str = "metadata_",
1142        features_prefix: str = "features_",
1143    ) -> Self:
1144        """
1145        Load the events from a CSV file, including metadata and features.
1146        :param input_path: path to the CSV file.
1147        :param metadata_prefix: the prefix for metadata columns.
1148        :param features_prefix: the prefix for features columns.
1149        :return: an EventArray loaded from the file.
1150        """
1151        # Load the CSV file
1152        df = pd.read_csv(input_path)
1153        return cls.from_dataframe(df, metadata_prefix, features_prefix)
1154
1155    def save_json(self, output_path: str, orient: str = "records") -> bool:
1156        """
1157        Save the events to a JSON file, including metadata and features.
1158        :param output_path: path to save the JSON file to; ".json" is appended if missing.
1159        :param orient: the orientation of the JSON file, see pandas.DataFrame.to_json()
1160        :return: True if the file was written successfully.
1161        """
1162        if not output_path.endswith(".json"):
1163            output_path += ".json"
1164        self.to_dataframe().to_json(output_path, orient=orient, indent=2)
1165        return os.path.exists(output_path)
1166
1167    @classmethod
1168    def load_json(
1169        cls,
1170        input_path: str,
1171        metadata_prefix: str = "metadata_",
1172        features_prefix: str = "features_",
1173    ) -> Self:
1174        """
1175        Load the events from a JSON file, including metadata and features.
1176        :param input_path: path to the JSON file.
1177        :param metadata_prefix: the prefix for metadata columns.
1178        :param features_prefix: the prefix for features columns.
1179        :return: an EventArray loaded from the file.
1180        """
1181        # Load the JSON file
1182        df = pd.read_json(input_path, orient="records")
1183        return cls.from_dataframe(df, metadata_prefix, features_prefix)
1184
1185    def save_hdf5(
1186        self, output_path: str, complevel: int = 1, complib="blosc:zstd"
1187    ) -> bool:
1188        """
1189        Save the events to an HDF5 file, including metadata and features.
1190        Uses the pandas-provided HDF5 functions for ease, and external compatibility,
1191        though these files are slightly harder to view in HDFView or similar.
1192        Compression defaults remain very quick while cutting file size by 50%+.
1193        :param output_path: path to save the HDF5 file to; ".hdf5" is appended if the path has no HDF5 extension.
1194        :param complevel: see pandas.HDFStore for more details.
1195        :param complib: see pandas.HDFStore for more details.
1196        :return: True if the file was written successfully.
1197        """
1198        if not output_path.endswith(".hdf5") and not output_path.endswith(".h5"):
1199            output_path += ".hdf5"
1200        # Open the output_path as an HDF5 file
1201        with pd.HDFStore(
1202            output_path, mode="w", complevel=complevel, complib=complib
1203        ) as store:
1204            # Store the dataframes in the HDF5 file
1205            if self.info is not None:
1206                store.put("info", self.info, index=False)
1207            if self.metadata is not None:
1208                store.put("metadata", self.metadata, index=False)
1209            if self.features is not None:
1210                store.put("features", self.features, index=False)
1211        return os.path.exists(output_path)
1212
1213    @classmethod
1214    def load_hdf5(cls, input_path: str) -> Self:
1215        """
1216        Load the events from an HDF5 file, including metadata and features.
1217        :param input_path: path to the HDF5 file.
1218        :return: an EventArray loaded from the file.
1219        """
1220        # Open the input_path as an HDF5 file
1221        with pd.HDFStore(input_path, "r") as store:
1222            # Load the dataframes from the HDF5 file
1223            info = store.get("info") if "info" in store else None
1224            metadata = store.get("metadata") if "metadata" in store else None
1225            features = store.get("features") if "features" in store else None
1226        return cls(info=info, metadata=metadata, features=features)
1227
1228    def save_ocular(self, output_path: str, event_type: str = "cells"):
1229        """
1230        Save the events to an OCULAR file. Relies on the dataframe originating
1231        from an OCULAR file (same columns; duplicate metadata/info).
1232        :param output_path: the folder to save the OCULAR files in.
1233        :param event_type: "cells" or "others", selecting the output file set.
1234        :return: None
1235        """
1236        if pyreadr is None:
1237            raise ModuleNotFoundError(
1238                "pyreadr not installed! Install pyreadr directly "
1239                "or run `pip install csi-images[rds]` option to resolve."
1240            )
1241        if event_type == "cells":
1242            file_stub = "rc-final"
1243        elif event_type == "others":
1244            file_stub = "others-final"
1245        else:
1246            raise ValueError("Invalid event type. Must be cells or others.")
1247
1248        # Ensure good metadata
1249        metadata = pd.DataFrame(
1250            {
1251                "slide_id": self.info["slide_id"],
1252                "frame_id": self.info["tile"] + 1,  # Convert to 1-indexed for R
1253                "cell_id": (
1254                    self.metadata["cell_id"]
1255                    if "cell_id" in self.metadata.columns
1256                    else range(len(self.info))
1257                ),
1258                "cellx": self.info["x"],
1259                "celly": self.info["y"],
1260            }
1261        )
1262        if self.metadata is not None:
1263            metadata[self.metadata.columns] = self.metadata.copy()
1264
1265        # Check for the "ocular_interesting" column
1266        if event_type == "cells":
1267            if "ocular_interesting" in metadata.columns:
1268                interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool)
1269            elif "hcpc" in metadata.columns:
1270                # Interesting cells don't get an hcpc designation, leaving them as -1
1271                interesting_rows = (
1272                    metadata["hcpc"].to_numpy() == -1
1273                )  # interesting cells
1274            else:
1275                interesting_rows = []
1276            if sum(interesting_rows) > 0:
1277                # Split the metadata into interesting and regular
1278                interesting_events = self.rows(interesting_rows)
1279                interesting_df = pd.concat(
1280                    [interesting_events.features, interesting_events.metadata], axis=1
1281                )
1282                data_events = self.rows(~interesting_rows)
1283                data_df = pd.concat(
1284                    [data_events.features, data_events.metadata], axis=1
1285                )
1286                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")
1287
1288                # Drop particular columns for "interesting"
1289                interesting_df = interesting_df.drop(
1290                    [
1291                        "clust",
1292                        "hcpc",
1293                        "frame_id",
1294                        "cell_id",
1295                        "unique_id",
1296                        "ocular_interesting",
1297                    ],
1298                    axis=1,
1299                    errors="ignore",
1300                )
1301                # Save both .csv and .rds
1302                interesting_stub = os.path.join(output_path, "ocular_interesting")
1303                interesting_df.to_csv(f"{interesting_stub}.csv")
1304                # Suppress pandas FutureWarning
1305                with warnings.catch_warnings():
1306                    warnings.simplefilter(action="ignore", category=FutureWarning)
1307                    pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df)
1308            else:
1309                data_df = pd.concat([self.features, metadata], axis=1)
1310        else:
1311            # Get all data and reset_index (will copy it)
1312            data_df = pd.concat([self.features, metadata], axis=1)
1313
1314        # Split based on cluster number to conform to *-final[1-4].rds
1315        n_clusters = max(data_df["clust"]) + 1
1316        split_idx = [round(i * n_clusters / 4) for i in range(5)]
1317        for i in range(4):
1318            subset = (split_idx[i] <= data_df["clust"]) & (
1319                data_df["clust"] < split_idx[i + 1]
1320            )
1321            data_df.loc[subset, "hcpc"] = i + 1
1322            subset = data_df[subset].reset_index(drop=True)
1323            # Suppress pandas FutureWarning
1324            with warnings.catch_warnings():
1325                warnings.simplefilter(action="ignore", category=FutureWarning)
1326                pyreadr.write_rds(
1327                    os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
1328                )
1329
1330        # Create new example cell strings
1331        data_df["example_cell_id"] = (
1332            data_df["slide_id"]
1333            + " "
1334            + data_df["frame_id"].astype(str)
1335            + " "
1336            + data_df["cell_id"].astype(str)
1337            + " "
1338            + data_df["cellx"].astype(int).astype(str)
1339            + " "
1340            + data_df["celly"].astype(int).astype(str)
1341        )
1342        # Find averagable data columns
1343        if "cellcluster_id" in data_df.columns:
1344            end_idx = data_df.columns.get_loc("cellcluster_id")
1345        else:
1346            end_idx = data_df.columns.get_loc("slide_id")
1347        avg_cols = data_df.columns[:end_idx].tolist()
1348        # Group by cluster and average
1349        data_df = data_df.groupby("clust").agg(
1350            **{col: (col, "mean") for col in avg_cols},
1351            count=("clust", "size"),  # count rows in each cluster
1352            example_cells=("example_cell_id", lambda x: ",".join(x)),
1353            hcpc=("hcpc", lambda x: x.iloc[0]),
1354        )
1355        data_df = data_df.reset_index()  # Do NOT drop, index is "clust"
1356        # Create new columns
1357        metadata = pd.DataFrame(
1358            {
1359                "count": data_df["count"],
1360                "example_cells": data_df["example_cells"],
1361                "clust": data_df["clust"].astype(int),
1362                "hcpc": data_df["hcpc"].astype(int),
1363                "id": data_df["clust"].astype(int).astype(str),
1364                "cccluster": "0",  # Dummy value
1365                "ccdistance": 0.0,  # Dummy value
1366                "rownum": list(range(len(data_df))),
1367                "framegroup": 0,  # Dummy value
1368            }
1369        )
1370        # Need to pad the features to 761 columns, as per OCULAR report needs
1371        additional_columns = range(len(avg_cols), 761)
1372        if len(additional_columns) > 0:
1373            padding = pd.DataFrame(
1374                np.zeros((len(data_df), len(additional_columns))),
1375                columns=[f"pad{i}" for i in additional_columns],
1376            )
1377            data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1)
1378        else:
1379            data_df = pd.concat([data_df[avg_cols], metadata], axis=1)
1380
1381        # Save the cluster data
1382        data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"))
1383        # Suppress pandas FutureWarning
1384        with warnings.catch_warnings():
1385            warnings.simplefilter(action="ignore", category=FutureWarning)
1386            pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)
1387
1388    @classmethod
1389    def load_ocular(
1390        cls,
1391        input_path: str,
1392        event_type="cells",
1393        cell_data_files=(
1394            "rc-final1.rds",
1395            "rc-final2.rds",
1396            "rc-final3.rds",
1397            "rc-final4.rds",
1398            "ocular_interesting.rds",
1399        ),
1400        others_data_files=(
1401            "others-final1.rds",
1402            "others-final2.rds",
1403            "others-final3.rds",
1404            "others-final4.rds",
1405        ),
1406        atlas_data_files=(
1407            "ocular_interesting.rds",
1408            "ocular_not_interesting.rds",
1409        ),
1410        drop_common_events=True,
1411    ) -> Self:
1412        """
1413        Load events from OCULAR output files (.rds) into an EventArray.
1414        :param input_path: path to the OCULAR output directory, or to a single .rds file.
1415        :param event_type: "cells" or "others"; selects which set of files to load.
1416        :param cell_data_files: file names to load when event_type is "cells".
1417        :param others_data_files: file names to load when event_type is "others".
1418        :param atlas_data_files: atlas files that are eligible for common-event filtering.
1419        :param drop_common_events: whether to drop events classified as "common_cell".
1420        :return: an EventArray with the loaded events.
1421        """
1422        if pyreadr is None:
1423            raise ModuleNotFoundError(
1424                "pyreadr not installed! Install pyreadr directly "
1425                "or run `pip install csi-images[rds]` option to resolve."
1426            )
1427        # Check if the input path is a directory or a file
1428        if os.path.isfile(input_path):
1429            data_files = [os.path.basename(input_path)]
1430            input_path = os.path.dirname(input_path)
1431        if event_type == "cells":
1432            data_files = cell_data_files
1433        elif event_type == "others":
1434            data_files = others_data_files
1435        else:
1436            raise ValueError("Invalid event type.")
1437
1438        # Load the data from the OCULAR files
1439        file_data = {}
1440        for file in data_files:
1441            file_path = os.path.join(input_path, file)
1442            if not os.path.isfile(file_path):
1443                warnings.warn(f"{file} not found in {input_path}")
1444                continue
1445            file_data[file] = pyreadr.read_r(file_path)
1446            # Get the DataFrame associated with None (pyreadr dict quirk)
1447            file_data[file] = file_data[file][None]
1448            if len(file_data[file]) == 0:
1449                # File gets dropped from the dict
1450                file_data.pop(file)
1451                warnings.warn(f"{file} has no cells")
1452                continue
1453
1454            # Drop common cells if requested and in this file
1455            if (
1456                file in atlas_data_files
1457                and drop_common_events
1458                and "catalogue_classification" in file_data[file]
1459            ):
1460                common_cell_indices = (
1461                    file_data[file]["catalogue_classification"] == "common_cell"
1462                )
1463                file_data[file] = file_data[file][common_cell_indices == False]
1464
1465            if len(file_data[file]) == 0:
1466                # File gets dropped from the dict
1467                file_data.pop(file)
1468                warnings.warn(f"{file} has no cells after dropping common cells")
1469                continue
1470
1471            # Extract frame_id and cell_id
1472            # DAPI- events already have frame_id cell_id outside rowname
1473            if event_type == "cells" and "frame_id" not in file_data[file].columns:
1474                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
1475                # get frame_id cell_id from rownames column and split into two columns
1476                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
1477                if len(split_res.columns) != 2:
1478                    warnings.warn(
1479                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
1480                    )
1481                # then assign it back to the dataframe
1482                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
1483            # Ensure frame_id and cell_id are integers
1484            file_data[file]["frame_id"] = file_data[file]["frame_id"].astype("int")
1485            file_data[file]["cell_id"] = file_data[file]["cell_id"].astype("int")
1486            # reset indexes since they can cause NaN values in concat
1487            file_data[file] = file_data[file].reset_index(drop=True)
1488
1489        # Merge the data from all files
1490        if len(file_data) == 0:
1491            return EventArray()
1492        elif len(file_data) == 1:
1493            data = [file_data[file] for file in file_data.keys()][0]
1494        else:
1495            data = pd.concat(file_data.values())
1496
1497        # Others is missing the "slide_id". Insert it right before "frame_id" column
1498        if event_type == "others" and "slide_id" not in data.columns:
1499            if os.path.basename(input_path) == "ocular":
1500                slide_id = os.path.basename(os.path.dirname(input_path))
1501            else:
1502                slide_id = "UNKNOWN"
1503            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)
1504
1505        # Sort according to ascending cell_id to keep the original, which is in manual_df
1506        data = data.sort_values(by=["cell_id"], ascending=True)
1507        # Filter out duplicates by x & y
1508        data = data.assign(
1509            unique_id=data["slide_id"]
1510            + "_"
1511            + data["frame_id"].astype(str)
1512            + "_"
1513            + data["cellx"].astype(int).astype(str)
1514            + "_"
1515            + data["celly"].astype(int).astype(str)
1516        )
1517        data = data.drop_duplicates(subset=["unique_id"], keep="first")
1518        # Normal unique_id is with cell_id
1519        data = data.assign(
1520            unique_id=data["slide_id"]
1521            + "_"
1522            + data["frame_id"].astype(str)
1523            + "_"
1524            + data["cell_id"].astype(str)
1525        )
1526        data = data.reset_index(drop=True)
1527        # All columns up to "slide_id" are features; drop the "slide_id"
1528        features = data.loc[:, :"slide_id"].iloc[:, :-1]
1529        data = data.loc[:, "slide_id":]
1530        # Grab the info columns
1531        info = data[["slide_id", "frame_id", "cellx", "celly"]]
1532        info.columns = ["slide_id", "tile", "x", "y"]
1533        info = info.assign(roi=0)  # OCULAR only works on 1 ROI, as far as known
1534        info = info[["slide_id", "tile", "roi", "x", "y"]]
1535        # Metadata has duplicate columns for later convenience
1536        metadata = data
1537        # Certain columns tend to be problematic with mixed data formats...
1538        for col in ["TRITC", "CY5", "FITC"]:
1539            if col in metadata:
1540                labels = {
1541                    "False": False,
1542                    "True": True,
1543                    "FALSE": False,
1544                    "TRUE": True,
1545                    False: False,
1546                    True: True,
1547                }
1548                metadata[col] = metadata[col].map(labels).astype(bool)
1549        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
1550            if col in metadata:
1551                metadata[col] = metadata[col].fillna(-1).astype(int)
1552        info["tile"] = info["tile"] - 1  # Convert to 0-based indexing
1553        return EventArray(info, metadata, features)

A class that holds a large number of events' data, making it easy to analyze and manipulate many events at once. Unlike a list of Event objects, it stores the data split into separate info, metadata, and features DataFrames.

EventArray( info: pandas.core.frame.DataFrame = None, metadata: pandas.core.frame.DataFrame = None, features: pandas.core.frame.DataFrame = None)
629    def __init__(
630        self,
631        info: pd.DataFrame = None,
632        metadata: pd.DataFrame = None,
633        features: pd.DataFrame = None,
634    ):
635
636        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y"
637        self.info = info
638        if self.info is not None:
639            # Special case: "roi" is often not required, so we'll fill it in if it's missing
640            if "roi" not in info.columns:
641                self.info = self.info.assign(roi=0)
642            if set(self.info.columns) != set(self.INFO_COLUMNS):
643                raise ValueError(
644                    f"EventArray.info must have columns: "
645                    f"{self.INFO_COLUMNS}; had {list(self.info.columns)}"
646                )
647            # Ensure order and data types
648            self.info = pd.DataFrame(
649                {
650                    "slide_id": self.info["slide_id"].astype(str),
651                    "tile": self.info["tile"].astype(np.uint16),
652                    "roi": self.info["roi"].astype(np.uint8),
653                    "x": self.info["x"].round().astype(np.uint16),
654                    "y": self.info["y"].round().astype(np.uint16),
655                }
656            )
657
658        # All DataFrames must all have the same number of rows
659        if metadata is not None and (info is None or len(info) != len(metadata)):
660            raise ValueError(
661                "If EventArray.metadata is not None, it should match rows with .info"
662            )
663        if features is not None and (info is None or len(info) != len(features)):
664            raise ValueError(
665                "If EventArray.features is not None, it should match rows with .info"
666            )
667        # No columns named "metadata_", "features_", or "None"
668        column_names = []
669        if metadata is not None:
670            column_names += metadata.columns.tolist()
671        if features is not None:
672            column_names += features.columns.tolist()
673        if any([col.lower().startswith("metadata_") for col in column_names]):
674            raise ValueError("EventArray column names cannot start with 'metadata_'")
675        if any([col.lower().startswith("features_") for col in column_names]):
676            raise ValueError("EventArray column names cannot start with 'features_'")
677        if any([col.lower() == "none" for col in column_names]):
678            raise ValueError("EventArray column names cannot be 'none'")
679
680        # Add metadata and features
681        self.metadata = None
682        self.features = None
683        if metadata is not None:
684            self.add_metadata(metadata)
685        if features is not None:
686            self.add_features(features)
INFO_COLUMNS = ['slide_id', 'tile', 'roi', 'x', 'y']
info
metadata
features
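
A minimal construction sketch (illustrative values; only the info columns are required, and "roi" is filled with 0 when omitted):

    import pandas as pd
    from csi_images.csi_events import EventArray

    info = pd.DataFrame(
        {
            "slide_id": ["SLIDE01", "SLIDE01"],
            "tile": [0, 1],
            "x": [120, 540],
            "y": [88, 310],
        }
    )
    features = pd.DataFrame({"area": [52.0, 47.5]})
    events = EventArray(info=info, features=features)  # .metadata stays None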
def get_sort_order( self, by: Union[Hashable, Sequence[Hashable]], ascending: Union[bool, Sequence[bool]] = True):
733    def get_sort_order(
734        self, by: Hashable | Sequence[Hashable], ascending: bool | Sequence[bool] = True
735    ):
736        """
737        Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.
738        :param by: name of the column(s) to sort by.
739        :param ascending: whether to sort in ascending order; can be a list to match by
740        :return: the order of the indices to sort by.
741        """
742        columns = self.get(by)
743        return columns.sort_values(by=by, ascending=ascending).index

Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.

Parameters
  • by: name of the column(s) to sort by.
  • ascending: whether to sort in ascending order; can be a list to match by
Returns

the order of the indices to sort by.

def sort( self, by: Union[Hashable, Sequence[Hashable]], ascending: Union[bool, Sequence[bool]] = True) -> Self:
745    def sort(
746        self,
747        by: Hashable | Sequence[Hashable],
748        ascending: bool | Sequence[bool] = True,
749    ) -> Self:
750        """
751        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
752        :param by: name of the column(s) to sort by.
753        :param ascending: whether to sort in ascending order; can be a list to match by
754        :return: a new, sorted EventArray.
755        """
756        order = self.get_sort_order(by, ascending)
757        info = self.info.loc[order].reset_index(drop=True)
758        if self.metadata is not None:
759            metadata = self.metadata.loc[order].reset_index(drop=True)
760        else:
761            metadata = None
762        if self.features is not None:
763            features = self.features.loc[order].reset_index(drop=True)
764        else:
765            features = None
766        return EventArray(info, metadata, features)

Sort the EventArray by column(s) in the info, metadata, or features DataFrames.

Parameters
  • by: name of the column(s) to sort by.
  • ascending: whether to sort in ascending order; can be a list to match by
Returns

a new, sorted EventArray.
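
For example, continuing the construction sketch above (column names are illustrative):

    # Sort by tile ascending, then x descending; returns a new EventArray
    ordered = events.sort(["tile", "x"], ascending=[True, False])
    # get_sort_order() returns just the index order, without copying the data
    order = events.get_sort_order(["tile", "x"], ascending=[True, False])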

def get( self, column_names: Union[Hashable, Sequence[Hashable]]) -> pandas.core.frame.DataFrame:
768    def get(self, column_names: Hashable | Sequence[Hashable]) -> pd.DataFrame:
769        """
770        Get a DataFrame with the specified columns from the EventArray, by value.
771        :param column_names: the names of the columns to get.
772        :return: a DataFrame with the specified columns.
773        """
774        if isinstance(column_names, Hashable):
775            column_names = [column_names]  # Drop into a list for the loop
776        columns = []
777        for column_name in column_names:
778            if column_name in self.info.columns:
779                columns.append(self.info[column_name])
780            elif self.metadata is not None and column_name in self.metadata.columns:
781                columns.append(self.metadata[column_name])
782            elif self.features is not None and column_name in self.features.columns:
783                columns.append(self.features[column_name])
784            else:
785                raise ValueError(f"Column {column_name} not found in EventArray")
786        return pd.concat(columns, axis=1)

Get a DataFrame with the specified columns from the EventArray, by value.

Parameters
  • column_names: the names of the columns to get.
Returns

a DataFrame with the specified columns.
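
For example, with the illustrative events from the construction sketch above:

    # Columns are looked up in .info first, then .metadata, then .features
    coords_and_area = events.get(["x", "y", "area"])  # x, y from info; area from features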

def rows(self, rows: Sequence[Hashable]) -> Self:
788    def rows(self, rows: Sequence[Hashable]) -> Self:
789        """
790        Get a subset of the EventArray rows based on a boolean or integer index, by value.
791        :param rows: row labels, indices, or boolean mask; anything for .loc[]
792        :return: a new EventArray with the subset of events.
793        """
794        info = self.info.loc[rows].reset_index(drop=True)
795        if self.metadata is not None:
796            metadata = self.metadata.loc[rows].reset_index(drop=True)
797        else:
798            metadata = None
799        if self.features is not None:
800            features = self.features.loc[rows].reset_index(drop=True)
801        else:
802            features = None
803        return EventArray(info, metadata, features)

Get a subset of the EventArray rows based on a boolean or integer index, by value.

Parameters
  • rows: row labels, indices, or boolean mask; anything for .loc[]
Returns

a new EventArray with the subset of events.
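
For example, using a boolean mask built from a feature column (the threshold is illustrative):

    large = events.rows(events.get("area")["area"] > 50)  # boolean mask, applied via .loc[]
    first = events.rows([0])  # label-based selection (here the index is the default RangeIndex)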

def copy(self) -> Self:
805    def copy(self) -> Self:
806        """
807        Create a deep copy of the EventArray.
808        :return: a deep copy of the EventArray.
809        """
810        return EventArray(
811            info=self.info.copy(),
812            metadata=None if self.metadata is None else self.metadata.copy(),
813            features=None if self.features is None else self.features.copy(),
814        )

Create a deep copy of the EventArray.

Returns

a deep copy of the EventArray.

def add_metadata( self, new_metadata: pandas.core.series.Series | pandas.core.frame.DataFrame) -> None:
818    def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None:
819        """
820        Add metadata to the EventArray. Removes the need to check if metadata is None.
821        Overwrites any existing metadata with the same column names as the new metadata.
822        :param new_metadata: the metadata to add.
823        """
824        if self.info is None or len(self.info) != len(new_metadata):
825            raise ValueError("New metadata must match length of existing info")
826
827        if isinstance(new_metadata, pd.Series):
828            # Convert to a DataFrame
829            new_metadata = pd.DataFrame(new_metadata)
830
831        for col in new_metadata.columns:
832            if col in self.INFO_COLUMNS:
833                warnings.warn(
834                    f"Column name {col} is reserved for info; you can only "
835                    "access this column through the .metadata attribute"
836                )
837            elif self.features is not None and col in self.features.columns:
838                warnings.warn(
839                    f"Column name {col} also exists in the .features attribute; "
840                    f"calling this.get({col}) will return the .metadata column"
841                )
842
843        if self.metadata is None:
844            self.metadata = new_metadata
845        else:
846            self.metadata.loc[:, new_metadata.columns] = new_metadata

Add metadata to the EventArray. Removes the need to check if metadata is None. Overwrites any existing metadata with the same column names as the new metadata.

Parameters
  • new_metadata: the metadata to add.
def add_features( self, new_features: pandas.core.series.Series | pandas.core.frame.DataFrame) -> None:
848    def add_features(self, new_features: pd.Series | pd.DataFrame) -> None:
849        """
850        Add features to the EventArray. Removes the need to check if features is None.
851        Overwrites any existing features with the same column names as the new features.
852        :param new_features: the features to add.
853        """
854        if self.info is None or len(self.info) != len(new_features):
855            raise ValueError("New features must match length of existing info")
856
857        if isinstance(new_features, pd.Series):
858            # Convert to a DataFrame
859            new_features = pd.DataFrame(new_features)
860
861        for col in new_features.columns:
862            if col in self.INFO_COLUMNS:
863                warnings.warn(
864                    f"Column name {col} is reserved for info; you can only "
865                    "access this column through the .features attribute"
866                )
867            elif self.metadata is not None and col in self.metadata.columns:
868                warnings.warn(
869                    f"Column name {col} already exists in the .metadata attribute; "
870                    f"calling this.get({col}) will return the .metadata column"
871                )
872
873        if self.features is None:
874            self.features = new_features
875        else:
876            self.features.loc[:, new_features.columns] = new_features

Add features to the EventArray. Removes the need to check if features is None. Overwrites any existing features with the same column names as the new features.

Parameters
  • new_features: the features to add.
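
A brief sketch of both methods, continuing the earlier illustrative events (lengths must match .info):

    events.add_metadata(pd.DataFrame({"classification": ["cell", "debris"]}))
    events.add_features(pd.DataFrame({"area": [53.0, 47.0]}))  # overwrites the existing "area" column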
@classmethod
def merge(cls, events: Iterable[Self]) -> Self:
878    @classmethod
879    def merge(cls, events: Iterable[Self]) -> Self:
880        """
881        Combine EventArrays in a list into a single EventArray.
882        :param events: the EventArrays to combine.
883        """
884        all_info = []
885        all_metadata = []
886        all_features = []
887        for event_array in events:
888            # Skip empty EventArrays
889            if event_array.info is not None:
890                all_info.append(event_array.info)
891            if event_array.metadata is not None:
892                all_metadata.append(event_array.metadata)
893            if event_array.features is not None:
894                all_features.append(event_array.features)
895        if len(all_info) == 0:
896            return EventArray()
897        else:
898            all_info = pd.concat(all_info, ignore_index=True)
899        if len(all_metadata) == 0:
900            all_metadata = None
901        else:
902            all_metadata = pd.concat(all_metadata, ignore_index=True)
903        if len(all_features) == 0:
904            all_features = None
905        else:
906            all_features = pd.concat(all_features, ignore_index=True)
907
908        return EventArray(all_info, all_metadata, all_features)

Combine EventArrays in a list into a single EventArray.

Parameters
  • events: the EventArrays to combine.
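
A brief sketch, assuming events_a and events_b are existing EventArrays with matching columns:

    combined = EventArray.merge([events_a, events_b])  # empty EventArrays in the list are skipped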
def to_events( self, scans: Union[csi_images.csi_scans.Scan, Iterable[csi_images.csi_scans.Scan]], ignore_missing_scans=True, ignore_metadata=False, ignore_features=False) -> list[Event]:
910    def to_events(
911        self,
912        scans: Scan | Iterable[Scan],
913        ignore_missing_scans=True,
914        ignore_metadata=False,
915        ignore_features=False,
916    ) -> list[Event]:
917        """
918        Get the events in the EventArray as a list of events. Returns [] if empty.
919        :param scans: the scans that the events belong to, auto-matched by slide_id.
920        Pass an empty iterable (with ignore_missing_scans=True) if you don't care about scan metadata.
921        :param ignore_missing_scans: whether to create blank scans for events without scans.
922        :param ignore_metadata: whether to ignore metadata or not
923        :param ignore_features: whether to ignore features or not
924        :return: a list of Event objects, one per row; [] if the EventArray is empty.
925        """
926        if len(self) == 0:
927            return []
928        if isinstance(scans, Scan):
929            scans = [scans]
930        scans = {scan.slide_id: scan for scan in scans}
931        events = []
932        for i in range(len(self.info)):
933            # Determine the associated scan
934            slide_id = self.info["slide_id"][i]
935            if slide_id not in scans:
936                if ignore_missing_scans:
937                    # Create a placeholder scan if the scan is missing
938                    scan = Scan.make_placeholder(
939                        slide_id,
940                        self.info["tile"][i],
941                        self.info["roi"][i],
942                    )
943                else:
944                    raise ValueError(
945                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
946                    )
947            else:
948                scan = scans[slide_id]
949
950            # Prepare the metadata and features
951            if ignore_metadata or self.metadata is None:
952                metadata = None
953            else:
954                # This Series creation method is less efficient,
955                # but required for preserving dtypes
956                metadata = pd.Series(
957                    {col: self.metadata.loc[i, col] for col in self.metadata.columns},
958                    dtype=object,
959                )
960            if ignore_features or self.features is None:
961                features = None
962            else:
963                features = pd.Series(
964                    {col: self.features.loc[i, col] for col in self.features.columns},
965                    dtype=object,
966                )
967            # Create the event and append it to the list
968            events.append(
969                Event(
970                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
971                    self.info["x"][i],
972                    self.info["y"][i],
973                    metadata=metadata,
974                    features=features,
975                )
976            )
977        return events

Get the events in the EventArray as a list of events. Returns [] if empty.

Parameters
  • scans: the scans that the events belong to, auto-matched by slide_id. Pass an empty iterable (with ignore_missing_scans=True) if you don't care about scan metadata.
  • ignore_missing_scans: whether to create blank scans for events without scans.
  • ignore_metadata: whether to ignore metadata or not
  • ignore_features: whether to ignore features or not
Returns

a list of Event objects, one per row; [] if the EventArray is empty.
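
For example, without any real Scan objects at hand (a sketch; placeholder scans are generated per event when ignore_missing_scans=True):

    event_list = events.to_events([], ignore_missing_scans=True)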
@classmethod
def from_events(cls, events: Iterable[Event]) -> Self:
 979    @classmethod
 980    def from_events(cls, events: Iterable[Event]) -> Self:
 981        """
 982        Create an EventArray from a list of Event objects.
 983        :param events: the Event objects to convert.
 984        """
 985        info = pd.DataFrame(
 986            {
 987                "slide_id": [event.tile.scan.slide_id for event in events],
 988                "tile": [event.tile.n for event in events],
 989                "roi": [event.tile.n_roi for event in events],
 990                "x": [event.x for event in events],
 991                "y": [event.y for event in events],
 992            }
 993        )
 994        metadata_list = [event.metadata for event in events]
 995        # Iterate through and ensure that all metadata is the same shape
 996        for metadata in metadata_list:
 997            if type(metadata) != type(metadata_list[0]):
 998                raise ValueError("All metadata must be the same type.")
 999            if metadata is not None and metadata.shape != metadata_list[0].shape:
1000                raise ValueError("All metadata must be the same shape.")
1001        if metadata_list[0] is None:
1002            metadata = None
1003        else:
1004            metadata = pd.DataFrame(metadata_list)
1005        features_list = [event.features for event in events]
1006        # Iterate through and ensure that all features are the same shape
1007        for features in features_list:
1008            if type(features) != type(features_list[0]):
1009                raise ValueError("All features must be the same type.")
1010            if features is not None and features.shape != features_list[0].shape:
1011                raise ValueError("All features must be the same shape.")
1012        if features_list[0] is None:
1013            features = None
1014        else:
1015            features = pd.DataFrame(features_list)
1016        return EventArray(info=info, metadata=metadata, features=features)

Create an EventArray from a list of Event objects.

Parameters
  • events: the Event objects to convert.
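
For example, round-tripping through Event objects (event_list as produced by to_events() above):

    rebuilt = EventArray.from_events(event_list)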
def to_dataframe(self) -> pandas.core.frame.DataFrame:
1018    def to_dataframe(self) -> pd.DataFrame:
1019        """
1020        Convert all the data in the EventArray to a single DataFrame.
1021        :return: a DataFrame with all the data in the EventArray.
1022        """
1023        # Make a copy of the info DataFrame; info columns keep their names
1024        output = self.info.copy()
1025        # Combine with the metadata and prepend "metadata_" to the column names
1026        if self.metadata is not None:
1027            metadata = self.metadata.copy()
1028            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
1029            output = pd.concat([output, metadata], axis=1)
1030        # Combine with the features and prepend "features_" to the column names
1031        if self.features is not None:
1032            features = self.features.copy()
1033            features.columns = [f"features_{col}" for col in features.columns]
1034            output = pd.concat([output, features], axis=1)
1035        return output

Convert all the data in the EventArray to a single DataFrame.

Returns

a DataFrame with all the data in the EventArray.

@classmethod
def from_dataframe( cls, df, metadata_prefix: str = 'metadata_', features_prefix: str = 'features_') -> Self:
1037    @classmethod
1038    def from_dataframe(
1039        cls, df, metadata_prefix: str = "metadata_", features_prefix: str = "features_"
1040    ) -> Self:
1041        """
1042        From a single combined DataFrame (as produced by to_dataframe()), create an EventArray.
1043        :param df: the DataFrame to convert to an EventArray.
1044        :param metadata_prefix: the prefix for metadata columns.
1045        :param features_prefix: the prefix for features columns.
1046        :return: an EventArray built from the DataFrame.
1047        """
1048        # Split the columns into info, metadata, and features and strip prefix
1049        info = df[[col for col in df.columns if col in cls.INFO_COLUMNS]].copy()
1050        if info.size == 0:
1051            info = None
1052        metadata = df[[col for col in df.columns if col.startswith(metadata_prefix)]].copy()
1053        metadata.columns = [
1054            col.replace(metadata_prefix, "") for col in metadata.columns
1055        ]
1056        if metadata.size == 0:
1057            metadata = None
1058        features = df[[col for col in df.columns if col.startswith(features_prefix)]].copy()
1059        features.columns = [
1060            col.replace(features_prefix, "") for col in features.columns
1061        ]
1062        if features.size == 0:
1063            features = None
1064        return cls(info=info, metadata=metadata, features=features)

From a single combined DataFrame (as produced by to_dataframe()), create an EventArray.

Parameters
  • df: the DataFrame to convert to an EventArray.
  • metadata_prefix: the prefix for metadata columns.
  • features_prefix: the prefix for features columns.
Returns

an EventArray built from the DataFrame.
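
For example, round-tripping through the prefixed single-DataFrame form:

    df = events.to_dataframe()            # columns: slide_id, tile, roi, x, y, metadata_*, features_*
    same = EventArray.from_dataframe(df)  # prefixes are stripped back off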

@classmethod
def from_mask( cls, mask: numpy.ndarray, tile: csi_images.csi_tiles.Tile, include_cell_id: bool = True, images: list[numpy.ndarray] = None, image_labels: list[str] = None, properties: list[str] = None) -> Self:
1066    @classmethod
1067    def from_mask(
1068        cls,
1069        mask: np.ndarray,
1070        tile: Tile,
1071        include_cell_id: bool = True,
1072        images: list[np.ndarray] = None,
1073        image_labels: list[str] = None,
1074        properties: list[str] = None,
1075    ) -> Self:
1076        """
1077        Extract events from a labeled mask image, including metadata and features.
1078        :param mask: the mask to extract events from.
1079        :param tile: the Tile object associated with this mask.
1080        :param include_cell_id: whether to include the cell_id, or numerical
1081        mask label, as metadata in the EventArray.
1082        :param images: the intensity images to extract features from.
1083        :param image_labels: the labels for the intensity images.
1084        :param properties: list of properties to extract in addition to the defaults.
1085        :return: EventArray corresponding to the mask labels.
1086        """
1087        if csi_images is None:
1088            raise ModuleNotFoundError(
1089                "imageio libraries not installed! "
1090                "run `pip install csi_images[imageio]` to resolve."
1091            )
1092        # Gather mask_info
1093        if images is not None and image_labels is not None:
1094            if len(images) != len(image_labels):
1095                raise ValueError("Intensity images and labels must match lengths.")
1096
1097        mask_info = csi_images.extract_mask_info(mask, images, image_labels, properties)
1098
1099        if len(mask_info) == 0:
1100            return EventArray()
1101
1102        # Combine provided info and mask info
1103        info = pd.DataFrame(
1104            {
1105                "slide_id": tile.scan.slide_id,
1106                "tile": tile.n,
1107                "roi": tile.n_roi,
1108                "x": mask_info["x"],
1109                "y": mask_info["y"],
1110            },
1111        )
1112        # Extract a metadata column if desired
1113        if include_cell_id:
1114            metadata = pd.DataFrame({"cell_id": mask_info["id"]})
1115        else:
1116            metadata = None
1117        # If any additional properties were extracted, add them as features
1118        mask_info = mask_info.drop(columns=["id", "x", "y"], errors="ignore")
1119        if len(mask_info.columns) > 0:
1120            features = mask_info
1121            features.columns = [col.lower() for col in features.columns]
1122        else:
1123            features = None
1124        return EventArray(info, metadata, features)

Extract events from a labeled mask image, including metadata and features.

Parameters
  • mask: the mask to extract events from.
  • tile: the Tile object associated with this mask.
  • include_cell_id: whether to include the cell_id, or numerical mask label, as metadata in the EventArray.
  • images: the intensity images to extract features from.
  • image_labels: the labels for the intensity images.
  • properties: list of properties to extract in addition to the defaults.
Returns

EventArray corresponding to the mask labels.
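
A minimal sketch, assuming `tile` is an existing csi_tiles.Tile for the imaged frame and the optional image-processing dependencies are installed:

    import numpy as np

    mask = np.zeros((1024, 1024), dtype=np.uint16)
    mask[100:120, 200:230] = 1  # one labeled object
    array = EventArray.from_mask(mask, tile, include_cell_id=True)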

def save_csv(self, output_path: str) -> bool:
1126    def save_csv(self, output_path: str) -> bool:
1127        """
1128        Save the events to a CSV file, including metadata and features.
1129        :param output_path:
1130        :return: True if the file exists at output_path after writing.
1131        """
1132        if not output_path.endswith(".csv"):
1133            output_path += ".csv"
1134        self.to_dataframe().to_csv(output_path, index=False)
1135        return os.path.exists(output_path)

Save the events to a CSV file, including metadata and features.

Parameters
  • output_path:
Returns

True if the file exists at output_path after writing.
@classmethod
def load_csv( cls, input_path: str, metadata_prefix: str = 'metadata_', features_prefix: str = 'features_') -> Self:
1137    @classmethod
1138    def load_csv(
1139        cls,
1140        input_path: str,
1141        metadata_prefix: str = "metadata_",
1142        features_prefix: str = "features_",
1143    ) -> Self:
1144        """
1145        Load the events from a CSV file, including metadata and features.
1146        :param input_path:
1147        :param metadata_prefix:
1148        :param features_prefix:
1149        :return: an EventArray loaded from the CSV file.
1150        """
1151        # Load the CSV file
1152        df = pd.read_csv(input_path)
1153        return cls.from_dataframe(df, metadata_prefix, features_prefix)

Load the events from a CSV file, including metadata and features.

Parameters
  • input_path:
  • metadata_prefix:
  • features_prefix:
Returns

an EventArray loaded from the CSV file.
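
For example (the path is illustrative):

    events.save_csv("events.csv")                 # ".csv" is appended if missing
    restored = EventArray.load_csv("events.csv")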
def save_json(self, output_path: str, orient: str = 'records') -> bool:
1155    def save_json(self, output_path: str, orient: str = "records") -> bool:
1156        """
1157        Save the events to a JSON file, including metadata and features.
1158        :param output_path:
1159        :param orient: the orientation of the JSON file, see pandas.DataFrame.to_json()
1160        :return: True if the file exists at output_path after writing.
1161        """
1162        if not output_path.endswith(".json"):
1163            output_path += ".json"
1164        self.to_dataframe().to_json(output_path, orient=orient, indent=2)
1165        return os.path.exists(output_path)

Save the events to a JSON file, including metadata and features.

Parameters
  • output_path:
  • orient: the orientation of the JSON file, see pandas.DataFrame.to_json()
Returns

True if the file exists at output_path after writing.
@classmethod
def load_json( cls, input_path: str, metadata_prefix: str = 'metadata_', features_prefix: str = 'features_') -> Self:
1167    @classmethod
1168    def load_json(
1169        cls,
1170        input_path: str,
1171        metadata_prefix: str = "metadata_",
1172        features_prefix: str = "features_",
1173    ) -> Self:
1174        """
1175        Load the events from a JSON file, including metadata and features.
1176        :param input_path:
1177        :param metadata_prefix:
1178        :param features_prefix:
1179        :return: an EventArray loaded from the JSON file.
1180        """
1181        # Load the JSON file
1182        df = pd.read_json(input_path, orient="records")
1183        return cls.from_dataframe(df, metadata_prefix, features_prefix)

Load the events from a JSON file, including metadata and features.

Parameters
  • input_path:
  • metadata_prefix:
  • features_prefix:
Returns

an EventArray loaded from the JSON file.
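
For example, with the default record-oriented layout:

    events.save_json("events.json", orient="records")
    restored = EventArray.load_json("events.json")

Note that load_json always reads with orient="records", so other orientations will not round-trip.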
def save_hdf5(self, output_path: str, complevel: int = 1, complib='blosc:zstd') -> bool:
1185    def save_hdf5(
1186        self, output_path: str, complevel: int = 1, complib="blosc:zstd"
1187    ) -> bool:
1188        """
1189        Save the events to an HDF5 file, including metadata and features.
1190        Uses the pandas-provided HDF5 functions for ease, and external compatibility,
1191        though these files are slightly harder to view in HDFView or similar.
1192        Compression defaults remain very quick while cutting file size by 50%+.
1193        :param output_path:
1194        :param complevel: see pandas.HDFStore for more details.
1195        :param complib: see pandas.HDFStore for more details.
1196        :return: True if the file exists at output_path after writing.
1197        """
1198        if not output_path.endswith(".hdf5") and not output_path.endswith(".h5"):
1199            output_path += ".hdf5"
1200        # Open the output_path as an HDF5 file
1201        with pd.HDFStore(
1202            output_path, mode="w", complevel=complevel, complib=complib
1203        ) as store:
1204            # Store the dataframes in the HDF5 file
1205            if self.info is not None:
1206                store.put("info", self.info, index=False)
1207            if self.metadata is not None:
1208                store.put("metadata", self.metadata, index=False)
1209            if self.features is not None:
1210                store.put("features", self.features, index=False)
1211        return os.path.exists(output_path)

Save the events to an HDF5 file, including metadata and features. Uses the pandas-provided HDF5 functions for ease, and external compatibility, though these files are slightly harder to view in HDFView or similar. Compression defaults remain very quick while cutting file size by 50%+.

Parameters
  • output_path:
  • complevel: see pandas.HDFStore for more details.
  • complib: see pandas.HDFStore for more details.
Returns

True if the file exists at output_path after writing.
@classmethod
def load_hdf5(cls, input_path: str) -> Self:
1213    @classmethod
1214    def load_hdf5(cls, input_path: str) -> Self:
1215        """
1216        Load the events from an HDF5 file, including metadata and features.
1217        :param input_path:
1218        :return: an EventArray loaded from the HDF5 file.
1219        """
1220        # Open the input_path as an HDF5 file
1221        with pd.HDFStore(input_path, "r") as store:
1222            # Load the dataframes from the HDF5 file
1223            info = store.get("info") if "info" in store else None
1224            metadata = store.get("metadata") if "metadata" in store else None
1225            features = store.get("features") if "features" in store else None
1226        return cls(info=info, metadata=metadata, features=features)

Load the events from an HDF5 file, including metadata and features.

Parameters
  • input_path:
Returns

an EventArray loaded from the HDF5 file.
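
For example, with the default zstd-based compression (pandas.HDFStore requires the optional PyTables package):

    events.save_hdf5("events.h5")                 # complevel=1, complib="blosc:zstd"
    restored = EventArray.load_hdf5("events.h5")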
def save_ocular(self, output_path: str, event_type: str = 'cells'):
1228    def save_ocular(self, output_path: str, event_type: str = "cells"):
1229        """
1230        Save the events to an OCULAR file. Relies on the dataframe originating
1231        from an OCULAR file (same columns; duplicate metadata/info).
1232        :param output_path: directory in which to write the OCULAR .csv/.rds files.
1233        :param event_type: "cells" or "others".
1234        :return:
1235        """
1236        if pyreadr is None:
1237            raise ModuleNotFoundError(
1238                "pyreadr not installed! Install pyreadr directly "
1239                "or run `pip install csi-images[rds]` option to resolve."
1240            )
1241        if event_type == "cells":
1242            file_stub = "rc-final"
1243        elif event_type == "others":
1244            file_stub = "others-final"
1245        else:
1246            raise ValueError("Invalid event type. Must be cells or others.")
1247
1248        # Ensure good metadata
1249        metadata = pd.DataFrame(
1250            {
1251                "slide_id": self.info["slide_id"],
1252                "frame_id": self.info["tile"] + 1,  # Convert to 1-indexed for R
1253                "cell_id": (
1254                    self.metadata["cell_id"]
1255                    if "cell_id" in self.metadata.columns
1256                    else range(len(self.info))
1257                ),
1258                "cellx": self.info["x"],
1259                "celly": self.info["y"],
1260            }
1261        )
1262        if self.metadata is not None:
1263            metadata[self.metadata.columns] = self.metadata.copy()
1264
1265        # Check for the "ocular_interesting" column
1266        if event_type == "cells":
1267            if "ocular_interesting" in metadata.columns:
1268                interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool)
1269            elif "hcpc" in metadata.columns:
1270                # Interesting cells don't get an hcpc designation, leaving them as -1
1271                interesting_rows = (
1272                    metadata["hcpc"].to_numpy() == -1
1273                )  # interesting cells
1274            else:
1275                interesting_rows = []
1276            if sum(interesting_rows) > 0:
1277                # Split the metadata into interesting and regular
1278                interesting_events = self.rows(interesting_rows)
1279                interesting_df = pd.concat(
1280                    [interesting_events.features, interesting_events.metadata], axis=1
1281                )
1282                data_events = self.rows(~interesting_rows)
1283                data_df = pd.concat(
1284                    [data_events.features, data_events.metadata], axis=1
1285                )
1286                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")
1287
1288                # Drop particular columns for "interesting"
1289                interesting_df = interesting_df.drop(
1290                    [
1291                        "clust",
1292                        "hcpc",
1293                        "frame_id",
1294                        "cell_id",
1295                        "unique_id",
1296                        "ocular_interesting",
1297                    ],
1298                    axis=1,
1299                    errors="ignore",
1300                )
1301                # Save both .csv and .rds
1302                interesting_stub = os.path.join(output_path, "ocular_interesting")
1303                interesting_df.to_csv(f"{interesting_stub}.csv")
1304                # Suppress pandas FutureWarning
1305                with warnings.catch_warnings():
1306                    warnings.simplefilter(action="ignore", category=FutureWarning)
1307                    pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df)
1308            else:
1309                data_df = pd.concat([self.features, metadata], axis=1)
1310        else:
1311            # Get all data and reset_index (will copy it)
1312            data_df = pd.concat([self.features, metadata], axis=1)
1313
1314        # Split based on cluster number to conform to *-final[1-4].rds
1315        n_clusters = max(data_df["clust"]) + 1
1316        split_idx = [round(i * n_clusters / 4) for i in range(5)]
1317        for i in range(4):
1318            subset = (split_idx[i] <= data_df["clust"]) & (
1319                data_df["clust"] < split_idx[i + 1]
1320            )
1321            data_df.loc[subset, "hcpc"] = i + 1
1322            subset = data_df[subset].reset_index(drop=True)
1323            # Suppress pandas FutureWarning
1324            with warnings.catch_warnings():
1325                warnings.simplefilter(action="ignore", category=FutureWarning)
1326                pyreadr.write_rds(
1327                    os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
1328                )
1329
1330        # Create new example cell strings
1331        data_df["example_cell_id"] = (
1332            data_df["slide_id"]
1333            + " "
1334            + data_df["frame_id"].astype(str)
1335            + " "
1336            + data_df["cell_id"].astype(str)
1337            + " "
1338            + data_df["cellx"].astype(int).astype(str)
1339            + " "
1340            + data_df["celly"].astype(int).astype(str)
1341        )
1342        # Find averagable data columns
1343        if "cellcluster_id" in data_df.columns:
1344            end_idx = data_df.columns.get_loc("cellcluster_id")
1345        else:
1346            end_idx = data_df.columns.get_loc("slide_id")
1347        avg_cols = data_df.columns[:end_idx].tolist()
1348        # Group by cluster and average
1349        data_df = data_df.groupby("clust").agg(
1350            **{col: (col, "mean") for col in avg_cols},
1351            count=("clust", "size"),  # count rows in each cluster
1352            example_cells=("example_cell_id", lambda x: ",".join(x)),
1353            hcpc=("hcpc", lambda x: x.iloc[0]),
1354        )
1355        data_df = data_df.reset_index()  # Do NOT drop, index is "clust"
1356        # Create new columns
1357        metadata = pd.DataFrame(
1358            {
1359                "count": data_df["count"],
1360                "example_cells": data_df["example_cells"],
1361                "clust": data_df["clust"].astype(int),
1362                "hcpc": data_df["hcpc"].astype(int),
1363                "id": data_df["clust"].astype(int).astype(str),
1364                "cccluster": "0",  # Dummy value
1365                "ccdistance": 0.0,  # Dummy value
1366                "rownum": list(range(len(data_df))),
1367                "framegroup": 0,  # Dummy value
1368            }
1369        )
1370        # Need to pad the features to 761 columns, as per OCULAR report needs
1371        additional_columns = range(len(avg_cols), 761)
1372        if len(additional_columns) > 0:
1373            padding = pd.DataFrame(
1374                np.zeros((len(data_df), len(additional_columns))),
1375                columns=[f"pad{i}" for i in additional_columns],
1376            )
1377            data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1)
1378        else:
1379            data_df = pd.concat([data_df[avg_cols], metadata], axis=1)
1380
1381        # Save the cluster data
1382        data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"))
1383        # Suppress pandas FutureWarning
1384        with warnings.catch_warnings():
1385            warnings.simplefilter(action="ignore", category=FutureWarning)
1386            pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)

Save the events to an OCULAR file. Relies on the dataframe originating from an OCULAR file (same columns; duplicate metadata/info).

Parameters
  • output_path: directory in which to write the OCULAR .csv/.rds files.
  • event_type: "cells" or "others".
Returns
@classmethod
def load_ocular( cls, input_path: str, event_type='cells', cell_data_files=('rc-final1.rds', 'rc-final2.rds', 'rc-final3.rds', 'rc-final4.rds', 'ocular_interesting.rds'), others_data_files=('others-final1.rds', 'others-final2.rds', 'others-final3.rds', 'others-final4.rds'), atlas_data_files=('ocular_interesting.rds', 'ocular_not_interesting.rds'), drop_common_events=True) -> Self:
1388    @classmethod
1389    def load_ocular(
1390        cls,
1391        input_path: str,
1392        event_type="cells",
1393        cell_data_files=(
1394            "rc-final1.rds",
1395            "rc-final2.rds",
1396            "rc-final3.rds",
1397            "rc-final4.rds",
1398            "ocular_interesting.rds",
1399        ),
1400        others_data_files=(
1401            "others-final1.rds",
1402            "others-final2.rds",
1403            "others-final3.rds",
1404            "others-final4.rds",
1405        ),
1406        atlas_data_files=(
1407            "ocular_interesting.rds",
1408            "ocular_not_interesting.rds",
1409        ),
1410        drop_common_events=True,
1411    ) -> Self:
1412        """
1413        Load events from OCULAR output files (.rds) into an EventArray.
1414        :param input_path: path to the OCULAR output directory, or to a single .rds file.
1415        :param event_type: "cells" or "others"; selects which set of files to load.
1416        :param cell_data_files: file names to load when event_type is "cells".
1417        :param others_data_files: file names to load when event_type is "others".
1418        :param atlas_data_files: atlas files that are eligible for common-event filtering.
1419        :param drop_common_events: whether to drop events classified as "common_cell".
1420        :return: an EventArray with the loaded events.
1421        """
1422        if pyreadr is None:
1423            raise ModuleNotFoundError(
1424                "pyreadr not installed! Install pyreadr directly "
1425                "or run `pip install csi-images[rds]` option to resolve."
1426            )
1427        # Check if the input path is a directory or a file
1428        if os.path.isfile(input_path):
1429            data_files = [os.path.basename(input_path)]
1430            input_path = os.path.dirname(input_path)
1431        if event_type == "cells":
1432            data_files = cell_data_files
1433        elif event_type == "others":
1434            data_files = others_data_files
1435        else:
1436            raise ValueError("Invalid event type.")
1437
1438        # Load the data from the OCULAR files
1439        file_data = {}
1440        for file in data_files:
1441            file_path = os.path.join(input_path, file)
1442            if not os.path.isfile(file_path):
1443                warnings.warn(f"{file} not found in {input_path}")
1444                continue
1445            file_data[file] = pyreadr.read_r(file_path)
1446            # Get the DataFrame associated with None (pyreadr dict quirk)
1447            file_data[file] = file_data[file][None]
1448            if len(file_data[file]) == 0:
1449                # File gets dropped from the dict
1450                file_data.pop(file)
1451                warnings.warn(f"{file} has no cells")
1452                continue
1453
1454            # Drop common cells if requested and in this file
1455            if (
1456                file in atlas_data_files
1457                and drop_common_events
1458                and "catalogue_classification" in file_data[file]
1459            ):
1460                common_cell_indices = (
1461                    file_data[file]["catalogue_classification"] == "common_cell"
1462                )
1463                file_data[file] = file_data[file][common_cell_indices == False]
1464
1465            if len(file_data[file]) == 0:
1466                # File gets dropped from the dict
1467                file_data.pop(file)
1468                warnings.warn(f"{file} has no cells after dropping common cells")
1469                continue
1470
1471            # Extract frame_id and cell_id
1472            # DAPI- events already have frame_id cell_id outside rowname
1473            if event_type == "cells" and "frame_id" not in file_data[file].columns:
1474                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
1475                # get frame_id cell_id from rownames column and split into two columns
1476                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
1477                if len(split_res.columns) != 2:
1478                    warnings.warn(
1479                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
1480                    )
1481                # then assign it back to the dataframe
1482                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
1483            # Ensure frame_id and cell_id are integers
1484            file_data[file]["frame_id"] = file_data[file]["frame_id"].astype("int")
1485            file_data[file]["cell_id"] = file_data[file]["cell_id"].astype("int")
1486            # reset indexes since they can cause NaN values in concat
1487            file_data[file] = file_data[file].reset_index(drop=True)
1488
1489        # Merge the data from all files
1490        if len(file_data) == 0:
1491            return EventArray()
1492        elif len(file_data) == 1:
1493            data = [file_data[file] for file in file_data.keys()][0]
1494        else:
1495            data = pd.concat(file_data.values())
1496
1497        # Others is missing the "slide_id". Insert it right before "frame_id" column
1498        if event_type == "others" and "slide_id" not in data.columns:
1499            if os.path.basename(input_path) == "ocular":
1500                slide_id = os.path.basename(os.path.dirname(input_path))
1501            else:
1502                slide_id = "UNKNOWN"
1503            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)
1504
1505        # Sort according to ascending cell_id to keep the original, which is in manual_df
1506        data = data.sort_values(by=["cell_id"], ascending=True)
1507        # Filter out duplicates by x & y
1508        data = data.assign(
1509            unique_id=data["slide_id"]
1510            + "_"
1511            + data["frame_id"].astype(str)
1512            + "_"
1513            + data["cellx"].astype(int).astype(str)
1514            + "_"
1515            + data["celly"].astype(int).astype(str)
1516        )
1517        data = data.drop_duplicates(subset=["unique_id"], keep="first")
1518        # Normal unique_id is with cell_id
1519        data = data.assign(
1520            unique_id=data["slide_id"]
1521            + "_"
1522            + data["frame_id"].astype(str)
1523            + "_"
1524            + data["cell_id"].astype(str)
1525        )
1526        data = data.reset_index(drop=True)
1527        # All columns up to "slide_id" are features; drop the "slide_id"
1528        features = data.loc[:, :"slide_id"].iloc[:, :-1]
1529        data = data.loc[:, "slide_id":]
1530        # Grab the info columns
1531        info = data[["slide_id", "frame_id", "cellx", "celly"]]
1532        info.columns = ["slide_id", "tile", "x", "y"]
1533        info = info.assign(roi=0)  # OCULAR only works on 1 ROI, as far as known
1534        info = info[["slide_id", "tile", "roi", "x", "y"]]
1535        # Metadata has duplicate columns for later convenience
1536        metadata = data
1537        # Certain columns tend to be problematic with mixed data formats...
1538        for col in ["TRITC", "CY5", "FITC"]:
1539            if col in metadata:
1540                labels = {
1541                    "False": False,
1542                    "True": True,
1543                    "FALSE": False,
1544                    "TRUE": True,
1545                    False: False,
1546                    True: True,
1547                }
1548                metadata[col] = metadata[col].map(labels).astype(bool)
1549        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
1550            if col in metadata:
1551                metadata[col] = metadata[col].fillna(-1).astype(int)
1552        info["tile"] = info["tile"] - 1  # Convert to 0-based indexing
1553        return EventArray(info, metadata, features)

Load events from OCULAR output files (.rds) into an EventArray.

Parameters
  • input_path: path to the OCULAR output directory, or to a single .rds file.
  • event_type: "cells" or "others"; selects which set of files to load.
  • cell_data_files: file names to load when event_type is "cells".
  • others_data_files: file names to load when event_type is "others".
  • atlas_data_files: atlas files that are eligible for common-event filtering.
  • drop_common_events: whether to drop events classified as "common_cell".
Returns

an EventArray with the loaded events.
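
For example (paths are illustrative; requires the optional pyreadr dependency):

    cells = EventArray.load_ocular("/path/to/slide/ocular", event_type="cells")
    cells.save_ocular("/path/to/output", event_type="cells")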