csi_images.csi_events

Contains the Event and EventArray classes. An Event represents a single event in a scan and optionally holds metadata and features. Events with similar metadata or features can be combined into an EventArray, which stores them as DataFrames for analysis.
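As a minimal sketch (the slide ID, tile number, and pixel coordinates below are made up, and Scan.make_placeholder, the same helper this module uses for events without a known scan, stands in for a real scan), events can be built individually and then collected into an EventArray for DataFrame-based analysis:

    import pandas as pd

    from csi_images.csi_scans import Scan
    from csi_images.csi_tiles import Tile
    from csi_images.csi_events import Event, EventArray

    # Hypothetical scan, tile, and positions for illustration only
    scan = Scan.make_placeholder("EXAMPLE_SLIDE", 4, 0)
    tile = Tile(scan, 4, 0)
    events = [
        Event(tile, 100, 200, metadata=pd.Series({"cell_id": 1}, dtype=object)),
        Event(tile, 300, 400, metadata=pd.Series({"cell_id": 2}, dtype=object)),
    ]

    # Combine into an EventArray, then flatten everything into one DataFrame;
    # metadata columns are prefixed with "metadata_" (here: "metadata_cell_id")
    event_array = EventArray.from_events(events)
    df = event_array.to_dataframe()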

The Event class holds the position of the event in the frame, which can be converted to positions in the scanner or slide coordinate frames. See the csi_utils.csi_scans documentation page for more information on the coordinate systems.
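Continuing the sketch above, an event's in-frame pixel position can be converted to the other coordinate frames (this assumes `scan` is a fully loaded Scan rather than a placeholder, since the conversion relies on the scanner type, pixel size, and ROI origins):

    tile = Tile(scan, 100, 0)          # tile number 100, first ROI
    event = Event(tile, x=250, y=300)  # pixel position within the tile's frame

    x_um, y_um = event.get_scan_position()      # scanner coordinate frame, micrometers
    sx_um, sy_um = event.get_slide_position()   # slide coordinate frame, micrometers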

   1"""
   2Contains the Event and EventArray classes. An Event represents a single event in a
   3scan and optionally holds metadata and features. Events with similar metadata or
   4features can be combined into an EventArray, which stores them as DataFrames for analysis.
   5
   6The Event class holds the position of the event in the frame, which can be
   7converted to positions in the scanner or slide coordinate frames. See the
   8csi_utils.csi_scans documentation page for more information on the coordinate systems.
   9"""
  10
  11import os
  12import glob
  13import math
  14import warnings
  15from typing import Self, Iterable, Hashable, Sequence
  16
  17import numpy as np
  18import pandas as pd
  19
  20from .csi_scans import Scan
  21from .csi_tiles import Tile
  22from .csi_frames import Frame
  23
  24# Optional dependencies; will raise errors in particular functions if not installed
  25try:
  26    from . import csi_images
  27except ImportError:
  28    csi_images = None
  29try:
  30    import imageio.v3 as imageio
  31except ImportError:
  32    imageio = None
  33try:
  34    import pyreadr
  35except ImportError:
  36    pyreadr = None
  37
  38
  39class Event:
  40    """
  41    A class that represents a single event in a scan, making it easy to evaluate
  42    singular events. Required metadata is exposed as attributes, and optional
  43    metadata and features are stored as pandas Series objects.
  44    """
  45
  46    SCAN_TO_SLIDE_TRANSFORM = {
  47        # Axioscan zero is in the top-right corner instead of top-left
  48        Scan.Type.AXIOSCAN7: np.array(
  49            [
  50                [1, 0, 75000],
  51                [0, 1, 0],
  52                [0, 0, 1],
  53            ]
  54        ),
  55        # BZScanner coordinates are a special kind of messed up:
  56        # - The slide is upside-down.
  57        # - The slide is oriented vertically, with the barcode at the bottom.
  58        # - Tiles are numbered from the top-right
  59        Scan.Type.BZSCANNER: np.array(
  60            [
  61                [0, -1, 75000],
  62                [-1, 0, 25000],
  63                [0, 0, 1],
  64            ]
  65        ),
  66    }
  67    """
  68    Homogeneous transformation matrices for converting between scanner and slide
  69    coordinates. The matrices are 3x3, with the final column representing the
  70    translation in micrometers (um). For more information, see 
  71    [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations).
  72    
  73    Transformations are nominal, and accuracy is not guaranteed; this is due to 
  74    imperfections in slides and alignment in the scanners. Units are in micrometers.
  75    """
  76
  77    def __init__(
  78        self,
  79        tile: Tile,
  80        x: int,
  81        y: int,
  82        metadata: pd.Series = None,
  83        features: pd.Series = None,
  84    ):
  85        self.tile = tile
  86        self.x = int(x)
  87        self.y = int(y)
  88        self.metadata = metadata
  89        self.features = features
  90
  91    def __repr__(self) -> str:
  92        return f"{self.tile}-{self.x}-{self.y}"
  93
  94    def __eq__(self, other) -> bool:
  95        return self.__repr__() == other.__repr__()
  96
  97    def __lt__(self, other):
  98        return self.__repr__() < other.__repr__()
  99
 100    def get_scan_position(self) -> tuple[float, float]:
 101        """
 102        Get the position of the event in the scanner's coordinate frame.
 103        :return: the scan position of the event in micrometers (um).
 104        """
 105        # Get overall pixel position
 106        real_tile_height, real_tile_width = self.tile.scan.get_image_size()
 107        pixel_x = self.x + (real_tile_width * self.tile.x)
 108        pixel_y = self.y + (real_tile_height * self.tile.y)
 109        # Convert to micrometers
 110        x_um = pixel_x * self.tile.scan.pixel_size_um
 111        y_um = pixel_y * self.tile.scan.pixel_size_um
 112        # Add the scan's origin in the scanner frame
 113        x_um += self.tile.scan.roi[self.tile.n_roi].origin_x_um
 114        y_um += self.tile.scan.roi[self.tile.n_roi].origin_y_um
 115        return x_um, y_um
 116
 117    def get_slide_position(self) -> tuple[float, float]:
 118        """
 119        Get the slide position of the event in micrometers (um).
 120        :return: the slide position of the event.
 121        """
 122        # Turn scan_position into a 3x1 vector
 123        scan_position = self.get_scan_position()
 124        scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]])
 125
 126        # Multiply by the appropriate homogeneous matrix
 127        if self.tile.scan.scanner_id.startswith(self.tile.scan.Type.AXIOSCAN7.value):
 128            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.tile.scan.Type.AXIOSCAN7]
 129        elif self.tile.scan.scanner_id.startswith(self.tile.scan.Type.BZSCANNER.value):
 130            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.tile.scan.Type.BZSCANNER]
 131        else:
 132            raise ValueError(f"Scanner type {self.tile.scan.scanner_id} not supported.")
 133        slide_position = np.matmul(transform, scan_position)
 134        return float(slide_position[0][0]), float(slide_position[1][0])
 135
 136    def crop(
 137        self, images: Iterable[np.ndarray], crop_size: int = 100, in_pixels: bool = True
 138    ) -> list[np.ndarray]:
 139        """
 140        Crop the event from the provided frame images. Use if you have already gotten
 141        frame images; useful for cropping multiple events from the same frame image.
 142        :param images: the frame images.
 143        :param crop_size: the square size of the image crop to get for this event.
 144        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
 145        :return: crop_size x crop_size crops of the event in the provided frames. If the
 146        event is too close to the edge, the crop is padded with black to keep the event centered.
 147        """
 148        # Convert a crop size in micrometers to pixels
 149        if not in_pixels:
 150            crop_size = round(crop_size / self.tile.scan.pixel_size_um)
 151        image_height, image_width = 0, 0
 152        for image in images:
 153            if image_height == 0 and image_width == 0:
 154                image_height, image_width = image.shape
 155            else:
 156                if image_height != image.shape[0] or image_width != image.shape[1]:
 157                    raise ValueError("All images must be the same size")
 158        if image_height == 0 or image_width == 0:
 159            raise ValueError("No images provided")
 160
 161        # Find the crop bounds
 162        bounds = [
 163            self.x - (crop_size // 2) + 1,
 164            self.y - (crop_size // 2) + 1,
 165            self.x + math.ceil(crop_size / 2) + 1,
 166            self.y + math.ceil(crop_size / 2) + 1,
 167        ]
 168        # Determine how much the bounds violate the image size
 169        displacements = [
 170            max(0, -bounds[0]),
 171            max(0, -bounds[1]),
 172            max(0, bounds[2] - image_width),
 173            max(0, bounds[3] - image_height),
 174        ]
 175        # Cap off the bounds
 176        bounds = [
 177            max(0, bounds[0]),
 178            max(0, bounds[1]),
 179            min(image_width, bounds[2]),
 180            min(image_height, bounds[3]),
 181        ]
 182
 183        # Crop the images
 184        crops = []
 185        for image in images:
 186            # Create a blank image of the right size
 187            crop = np.zeros((crop_size, crop_size), dtype=image.dtype)
 188
 189            # Insert the cropped image into the blank image, leaving a black buffer
 190            # around the edges if the crop would go beyond the original image bounds
 191            crop[
 192                displacements[1] : crop_size - displacements[3],
 193                displacements[0] : crop_size - displacements[2],
 194            ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]]
 195            crops.append(crop)
 196        return crops
 197
 198    def get_crops(
 199        self,
 200        crop_size: int = 100,
 201        in_pixels: bool = True,
 202        input_path: str = None,
 203        channels: Iterable[int | str] = None,
 204        apply_gain: bool | Iterable[bool] = True,
 205    ) -> list[np.ndarray]:
 206        """
 207        Gets the frame images for this event and then crops the event from the images.
 208        Convenient for retrieving a single event's crops, but less efficient when
 209        retrieving multiple events from the same tile as it will reread the images.
 210        :param crop_size: the square size of the image crop to get for this event.
 211        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
 212        :param input_path: the path to the input images. Defaults to None (uses the scan's path).
 213        :param channels: the channels to extract images for. Defaults to all channels.
 214        :param apply_gain: whether to apply scanner-calculated gain to the images, if
 215        not already applied. If a list, matches the channels.
 216        :return: a list of cropped images from the scan in the order of the channels.
 217        """
 218        # This function validates channels
 219        frames = Frame.get_frames(self.tile, channels)
 220        # Convert individual inputs to lists of appropriate length
 221        if isinstance(apply_gain, bool):
 222            apply_gain = [apply_gain] * len(frames)
 223        images = [f.get_image(input_path, a) for f, a in zip(frames, apply_gain)]
 224        return self.crop(images, crop_size, in_pixels)
 225
 226    def save_crops(
 227        self,
 228        crops: Sequence[np.ndarray],
 229        output_path: str,
 230        labels: Sequence[str],
 231        ext: str = "auto",
 232    ):
 233        """
 234        Save the crops to image files.
 235        :param crops: the crops to save. Will save as RGB if 3 channel [h, w, 3] or
 236        grayscale if 1 channel [h, w] or [h, w, 1].
 237        :param output_path: the folder to save the crops to. Will make if needed.
 238        :param labels: the labels to append to the file name, usually the channel names
 239        associated with each crop.
 240        :param ext: the file extension to save the crops as. Defaults to "auto", which
 241        will save as .tif for grayscale images and .jpg for RGB images.
 242        :return: None
 243        """
 244        if len(crops) != len(labels):
 245            raise ValueError("Crops and labels must be the same length")
 246
 247        if csi_images is None or imageio is None:
 248            raise ModuleNotFoundError(
 249                "imageio libraries not installed! "
 250                "run `pip install csi_images[imageio]` to resolve."
 251            )
 252
 253        os.makedirs(output_path, exist_ok=True)
 254
 255        for crop, label in zip(crops, labels):
 256            if ext == "auto":
 257                if len(crop.shape) == 2 or crop.shape[2] == 1:
 258                    file_extension = ".tif"
 259                elif crop.shape[2] == 3:
 260                    file_extension = ".jpg"
 261                else:
 262                    warnings.warn(
 263                        f"Image shape {crop.shape} not recognized; saving as .tif"
 264                    )
 265                    file_extension = ".tif"
 266            else:
 267                file_extension = ext
 268            file = os.path.join(output_path, f"{self}-{label}{file_extension}")
 269            # TODO: add more file types here
 270            if file_extension == ".tif":
 271                imageio.imwrite(file, crop, compression="deflate")
 272            elif file_extension in [".jpg", ".jpeg"]:
 273                crop = csi_images.scale_bit_depth(crop, np.uint8)
 274                imageio.imwrite(file, crop, quality=80)
 275            else:
 276                imageio.imwrite(file, crop)
 277
 278    def load_crops(
 279        self, input_path: str, labels: list[str] = None
 280    ) -> dict[str, np.ndarray]:
 281        """
 282        Loads previously saved crop files from a folder.
 283        :param input_path: folder containing crop files.
 284        :param labels: optional label filter, will only return crops with these labels.
 285        :return: a dict mapping labels to the loaded crops.
 286        """
 287        crops = {}
 288        for file in glob.glob(os.path.join(input_path, f"{self}-*")):
 289            label = os.path.splitext(os.path.basename(file))[0].split("-")[-1]
 290            # Skip if we have labels to target
 291            if labels is not None and label not in labels:
 292                continue
 293            crops[label] = imageio.imread(file)
 294        return crops
 295
 296    def get_montage_channels(
 297        self,
 298        channels: Sequence[int | str] | None,
 299        composites: dict[int | str, tuple[float, float, float]] | None,
 300    ) -> tuple[list[int], list[int], dict[int, tuple[float, float, float]]]:
 301        """
 302        Determine the channel indices needed for a montage of this event.
 303        :param channels: channel indices or names for grayscale channels
 304        :param composites: dictionary of channel indices or names and RGB values
 305        :return: (1) channel indices to retrieve,
 306                 (2) relative grayscale channel indices, and
 307                 (3) relative composite channel indices mapped to RGB values.
 308        """
 309        if channels is None:
 310            channels = list(range(len(self.tile.scan.channels)))
 311        if (len(channels) == 0) and (composites is None or len(composites) == 0):
 312            raise ValueError("Must provide at least one channel type to montage")
 313
 314        channels_to_get = []
 315
 316        # Build the list of channels to retrieve
 317        if channels is not None:
 318            if isinstance(channels[0], str):
 319                channels = self.tile.scan.get_channel_indices(channels)
 320            channels_to_get += channels
 321            order = list(range(len(channels)))  # Always the first n channels
 322        else:
 323            order = None
 324
 325        if composites is not None:
 326            relative_composites = {}  # Relative indices for retrieved channels
 327            # Convert to scan indices
 328            rgb_channels = list(composites.keys())
 329            if isinstance(rgb_channels[0], str):
 330                rgb_channels = self.tile.scan.get_channel_indices(rgb_channels)
 331            # Find the index or add to the end
 332            for channel, rgb in zip(rgb_channels, composites.values()):
 333                if channel not in channels_to_get:
 334                    channels_to_get.append(channel)
 335                    relative_composites[len(channels_to_get) - 1] = rgb
 336                else:
 337                    relative_composites[channels_to_get.index(channel)] = rgb
 338        else:
 339            relative_composites = None
 340
 341        return channels_to_get, order, relative_composites
 342
 343    def get_montage(
 344        self,
 345        channels: Sequence[int | str] = None,
 346        composites: dict[int | str, tuple[float, float, float]] = None,
 347        crop_size: int = 100,
 348        in_pixels: bool = True,
 349        input_path: str = None,
 350        apply_gain: bool = True,
 351        **kwargs,
 352    ) -> np.ndarray:
 353        """
 354        Convenience function for getting frame images and creating a montage. Mirrors
 355        csi_images.make_montage(). Convenient for a single event's montage, but less
 356        efficient when retrieving multiple events from the same tile.
 357        :param channels: the channels to use for black-and-white montages.
 358        :param composites: dictionary of indices and RGB tuples for a composite.
 359        :param crop_size: the square size of the image crop to get for this event.
 360        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
 361        :param input_path: the path to the input images. Defaults to None (uses the scan's path).
 362        :param apply_gain: whether to apply scanner-calculated gain to the images, if
 363        not already applied. If a list, matches the channels.
 364        :param kwargs: montage options. See csi_images.make_montage() for more details.
 365        :return: numpy array representing the montage.
 366        """
 367        channels, order, composites = self.get_montage_channels(channels, composites)
 368        images = self.get_crops(crop_size, in_pixels, input_path, channels, apply_gain)
 369        return csi_images.make_montage(images, order, composites, **kwargs)
 370
 371    def save_montage(
 372        self,
 373        montage: np.ndarray,
 374        output_path: str,
 375        ocular_names: bool = False,
 376        tag: str = "",
 377    ):
 378        """
 379        Save the montage as a JPEG image with a set name.
 380        :param montage: the montage to save.
 381        :param output_path: the folder to save the montage in. Will make if needed.
 382        :param ocular_names: whether to use the OCULAR naming convention.
 383        :param tag: a tag to append to the file name. Ignored if ocular_names is True.
 384        :return: None
 385        """
 386        if csi_images is None or imageio is None:
 387            raise ModuleNotFoundError(
 388                "imageio libraries not installed! "
 389                "run `pip install csi_images[imageio]` to resolve."
 390            )
 391
 392        montage = csi_images.scale_bit_depth(montage, np.uint8)
 393
 394        if ocular_names:
 395            if "cell_id" not in self.metadata.index:
 396                raise ValueError(
 397                    "Event metadata must include 'cell_id' for OCULAR naming."
 398                )
 399            file = f"{self.tile.n}-{self.metadata['cell_id']}-{self.x}-{self.y}.jpeg"
 400        else:
 401            file = f"{self}{tag}.jpeg"
 402
 403        os.makedirs(output_path, exist_ok=True)
 404        imageio.imwrite(os.path.join(output_path, file), montage, quality=80)
 405
 406    def load_montage(self, input_path: str, tag: str = "") -> np.ndarray:
 407        """
 408        Loads the montage from a file saved by Event.save_montage.
 409        :param input_path: the path to the folder where the montage was saved.
 410        :param tag: a string to add to the file name, before the extension.
 411        :return: the montage image as a numpy array.
 412        """
 413        file = f"{self}{tag}.jpeg"
 414        return imageio.imread(os.path.join(input_path, file))
 415
 416    @classmethod
 417    def get_many_crops(
 418        cls,
 419        events: Sequence[Self],
 420        crop_size: int | Sequence[int] = 100,
 421        in_pixels: bool = True,
 422        input_path: str | Sequence[str] = None,
 423        channels: Sequence[int | str] = None,
 424        apply_gain: bool | Sequence[bool] = True,
 425    ) -> list[list[np.ndarray]]:
 426        """
 427        Get the crops for a list of events, ensuring that there is no wasteful reading
 428        of the same tile multiple times. This function is more efficient than calling
 429        get_crops() for each event.
 430        :param events: the events to get crops for.
 431        :param crop_size: the square size of the image crop to get for each event.
 432                          Defaults to 100; can be a list matching the events.
 433        :param in_pixels: whether the crop size is in pixels or micrometers.
 434                          Defaults to pixels.
 435        :param input_path: the path to the input images. Will only work for lists of events
 436                           from the same scan. Defaults to None (uses the scan's path).
 437        :param channels: the channels to extract images for. Defaults to all channels.
 438        :param apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. Defaults to True.
 439                           Can be supplied as a list to apply gain to individual channels.
 440        :return: a list of lists of cropped images for each event.
 441        """
 442        if len(events) == 0:
 443            return []
 444        # Adapt singular inputs to lists of appropriate length
 445        if isinstance(crop_size, int):
 446            crop_size = [crop_size] * len(events)
 447        if input_path is None or isinstance(input_path, str):
 448            input_path = [input_path] * len(events)
 449
 450        # Get the order of the events when sorted by slide/tile
 451        order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
 452
 453        # Allocate the list to size
 454        crops = [[]] * len(events)
 455        last_tile = None
 456        images = None  # Holds large numpy arrays, so expensive to compare
 457        # Iterate through in slide/tile sorted order
 458        for i in order:
 459            if last_tile != events[i].tile:
 460                # Gather the frame images, preserving them for the next event
 461                frames = Frame.get_frames(events[i].tile, channels)
 462                if isinstance(apply_gain, bool):
 463                    apply = [apply_gain] * len(frames)
 464                else:
 465                    apply = apply_gain
 466                images = [f.get_image(input_path[i], a) for f, a in zip(frames, apply)]
 467                last_tile = events[i].tile
 468            # Use the frame images to crop the event images
 469            crops[i] = events[i].crop(images, crop_size[i], in_pixels)
 470        return crops
 471
 472    @classmethod
 473    def get_many_montages(
 474        cls,
 475        events: Sequence[Self],
 476        channels: Sequence[int | str] = None,
 477        composites: dict[int | str, tuple[float, float, float]] = None,
 478        crop_size: int = 100,
 479        in_pixels: bool = True,
 480        input_path: str = None,
 481        apply_gain: bool | Iterable[bool] = True,
 482        **kwargs,
 483    ) -> list[np.ndarray]:
 484        """
 485        Convenience function for get_montage(), but for a list of events. More efficient
 486        than get_montage() when working with multiple events from the same tile.
 487        :param events: a list of Event objects.
 488        :param channels: the channels to extract images for. Defaults to all channels.
 489        :param composites: dictionary of indices and RGB tuples for a composite.
 490        :param crop_size: the square size of the image crop to get for this event.
 491        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
 492        :param input_path: the path to the input images. Defaults to None (uses the scan's path).
 493        :param apply_gain: whether to apply scanner-calculated gain to the images, if
 494        not already applied. If a list, matches the channels.
 495        :param kwargs: montage options. See csi_images.make_montage() for more details.
 496        :return: a list of numpy arrays representing the montages.
 497        """
 498        if len(events) == 0:
 499            return []
 500        # Adapt singular inputs to lists of appropriate length
 501        if isinstance(crop_size, int):
 502            crop_size = [crop_size] * len(events)
 503        if input_path is None or isinstance(input_path, str):
 504            input_path = [input_path] * len(events)
 505
 506        # Get the order of the events when sorted by slide/tile
 507        event_order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
 508
 509        # Allocate the list to size
 510        montages = [np.empty(0)] * len(events)
 511        # Placeholder variables to avoid rereading the same tile
 512        images = None  # Holds large numpy arrays, so expensive to compare
 513        order = None
 514        rel_composites = None
 515        last_tile = None
 516        # Iterate through in slide/tile sorted order
 517        for i in event_order:
 518            if last_tile != events[i].tile:
 519                channels_to_get, order, rel_composites = events[i].get_montage_channels(
 520                    channels, composites
 521                )
 522                # Gather the frame images, preserving them for the next event
 523                frames = Frame.get_frames(events[i].tile, channels_to_get)
 524                if isinstance(apply_gain, bool):
 525                    apply = [apply_gain] * len(frames)
 526                else:
 527                    apply = apply_gain
 528                images = [f.get_image(input_path[i], a) for f, a in zip(frames, apply)]
 529                last_tile = events[i].tile
 530            # Use the frame images to crop the event images and make montages
 531            crops = events[i].crop(images, crop_size[i], in_pixels)
 532            montages[i] = csi_images.make_montage(
 533                crops, order, rel_composites, **kwargs
 534            )
 535
 536        return montages
 537
 538    @classmethod
 539    def get_and_save_many_crops(
 540        cls,
 541        events: list[Self],
 542        output_path: str,
 543        labels: Sequence[str],
 544        ext: str = "auto",
 545        additional_gain: Sequence[float] = None,
 546        **kwargs,
 547    ) -> None:
 548        """
 549        Get and save the crops for a list of events, ensuring that there is no wasteful
 550        reading and limiting the image data in memory to 1 tile at a time. This function
 551        is more efficient than chaining get_crops() and save_crops() for each event or
 552        get_many_crops() and then save_crops().
 553        :param events: list of events to get, crop, and save.
 554        :param output_path: the folder to save the crops in. Will make if needed.
 555        :param labels: the labels to save the crops with. See save_crops().
 556        :param ext: the file extension to save the crops as. See save_crops().
 557        :param additional_gain: additional gain to apply to the crops. If not None, must
 558        match the length of the number of crop channels.
 559        :param kwargs: see get_many_crops() for more parameters.
 560        :return: None
 561        """
 562        unique_tiles = set([event.tile for event in events])
 563
 564        for tile in unique_tiles:
 565            # Get one tile's worth of event crops
 566            tile_events = [e for e in events if e.tile == tile]
 567            crops_list = cls.get_many_crops(tile_events, **kwargs)
 568            for event, crops in zip(tile_events, crops_list):
 569                # Apply any additional gains
 570                if additional_gain is not None:
 571                    crops = [gain * crop for gain, crop in zip(additional_gain, crops)]
 572                event.save_crops(crops, output_path, labels, ext)
 573
 574    @classmethod
 575    def get_and_save_many_montages(
 576        cls,
 577        events: list[Self],
 578        output_path: str,
 579        ocular_names: bool = False,
 580        tag: str = "",
 581        **kwargs,
 582    ) -> None:
 583        """
 584        Save montages of the events to image files.
 585        :param events: the events to get, montage, and save.
 586        :param output_path: the folder to save the montages to. Will make if needed.
 587        :param ocular_names: whether to use the OCULAR naming convention.
 588        :param tag: a tag to append to the file name. Ignored if ocular_names is True.
 589        :param kwargs: see get_many_montages() for more parameters.
 590        """
 591        unique_tiles = set([event.tile for event in events])
 592
 593        for tile in unique_tiles:
 594            # Get one tile's worth of event crops
 595            tile_events = [e for e in events if e.tile == tile]
 596            montages = cls.get_many_montages(tile_events, **kwargs)
 597            for event, montage in zip(tile_events, montages):
 598                event.save_montage(montage, output_path, ocular_names, tag)
 599
 600
 601class EventArray:
 602    """
 603    A class that holds a large number of events' data, making it easy to analyze and
 604    manipulate many events at once; a tabular counterpart to the Event class.
 605    """
 606
 607    INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y"]
 608
 609    def __init__(
 610        self,
 611        info: pd.DataFrame = None,
 612        metadata: pd.DataFrame = None,
 613        features: pd.DataFrame = None,
 614    ):
 615        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y"
 616        if info is not None:
 617            # Special case: "roi" is often not required, so we'll fill it in if it's missing
 618            if "roi" not in info.columns:
 619                info["roi"] = 0
 620            if set(info.columns) != set(self.INFO_COLUMNS):
 621                raise ValueError(
 622                    f"EventArray.info must have columns:"
 623                    f"{self.INFO_COLUMNS}; had {list(info.columns)}"
 624                )
 625            # Copy first to avoid modifying the original
 626            info = info.copy()
 627            # Ensure that the columns are the right types
 628            info["slide_id"] = info["slide_id"].astype(str)
 629            info["tile"] = info["tile"].astype(np.uint16)
 630            info["roi"] = info["roi"].astype(np.uint8)
 631            info["x"] = info["x"].round().astype(np.uint16)
 632            info["y"] = info["y"].round().astype(np.uint16)
 633            # Ensure that the columns are in the right order
 634            info = info[self.INFO_COLUMNS]
 635        # All DataFrames must all have the same number of rows
 636        if metadata is not None and (info is None or len(info) != len(metadata)):
 637            raise ValueError(
 638                "If EventArray.metadata is not None, it should match rows with .info"
 639            )
 640        if features is not None and (info is None or len(info) != len(features)):
 641            raise ValueError(
 642                "If EventArray.features is not None, it should match rows with .info"
 643            )
 644        # No column names starting with "metadata_" or "features_", and none named "none"
 645        column_names = []
 646        if metadata is not None:
 647            column_names += metadata.columns.tolist()
 648        if features is not None:
 649            column_names += features.columns.tolist()
 650        if any([col.lower().startswith("metadata_") for col in column_names]):
 651            raise ValueError("EventArray column names cannot start with 'metadata_'")
 652        if any([col.lower().startswith("features_") for col in column_names]):
 653            raise ValueError("EventArray column names cannot start with 'features_'")
 654        if any([col.lower() == "none" for col in column_names]):
 655            raise ValueError("EventArray column names cannot be 'none'")
 656
 657        self.info = info
 658        self.metadata = metadata
 659        self.features = features
 660
 661    def __len__(self) -> int:
 662        # Convenience method to get the number of events
 663        if self.info is None:
 664            return 0
 665        else:
 666            return len(self.info)
 667
 668    def __eq__(self, other):
 669        # Parse all possibilities for info
 670        if isinstance(self.info, pd.DataFrame):
 671            if isinstance(other.info, pd.DataFrame):
 672                if not self.info.equals(other.info):
 673                    return False
 674            else:
 675                return False
 676        elif self.info is None:
 677            if other.info is not None:
 678                return False
 679
 680        # Parse all possibilities for metadata
 681        if isinstance(self.metadata, pd.DataFrame):
 682            if isinstance(other.metadata, pd.DataFrame):
 683                is_equal = self.metadata.equals(other.metadata)
 684                if not is_equal:
 685                    return False
 686            else:
 687                return False
 688        elif self.metadata is None:
 689            if other.metadata is not None:
 690                return False
 691
 692        # Parse all possibilities for features
 693        if isinstance(self.features, pd.DataFrame):
 694            if isinstance(other.features, pd.DataFrame):
 695                is_equal = self.features.equals(other.features)
 696                if not is_equal:
 697                    return False
 698            else:
 699                return False
 700        elif self.features is None:
 701            if other.features is not None:
 702                return False
 703
 704        return True  # All present DataFrames matched
 705
 706    def get_sort_order(
 707        self, by: Hashable | Sequence[Hashable], ascending: bool | Sequence[bool] = True
 708    ):
 709        """
 710        Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.
 711        :param by: name of the column(s) to sort by.
 712        :param ascending: whether to sort in ascending order; can be a list to match by
 713        :return: the order of the indices to sort by.
 714        """
 715        columns = self.get(by)
 716        return columns.sort_values(by=by, ascending=ascending).index
 717
 718    def sort(
 719        self,
 720        by: Hashable | Sequence[Hashable],
 721        ascending: bool | Sequence[bool] = True,
 722    ) -> Self:
 723        """
 724        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
 725        :param by: name of the column(s) to sort by.
 726        :param ascending: whether to sort in ascending order; can be a list to match by
 727        :return: a new, sorted EventArray.
 728        """
 729        order = self.get_sort_order(by, ascending)
 730        info = self.info.loc[order].reset_index(drop=True)
 731        if self.metadata is not None:
 732            metadata = self.metadata.loc[order].reset_index(drop=True)
 733        else:
 734            metadata = None
 735        if self.features is not None:
 736            features = self.features.loc[order].reset_index(drop=True)
 737        else:
 738            features = None
 739        return EventArray(info, metadata, features)
 740
 741    def get(self, column_names: Hashable | Sequence[Hashable]) -> pd.DataFrame:
 742        """
 743        Get a DataFrame with the specified columns from the EventArray, by value.
 744        :param column_names: the names of the columns to get.
 745        :return: a DataFrame with the specified columns.
 746        """
 747        if isinstance(column_names, Hashable):
 748            column_names = [column_names]  # Drop into a list for the loop
 749        columns = []
 750        for column_name in column_names:
 751            if column_name in self.info.columns:
 752                columns.append(self.info[column_name])
 753            elif self.metadata is not None and column_name in self.metadata.columns:
 754                columns.append(self.metadata[column_name])
 755            elif self.features is not None and column_name in self.features.columns:
 756                columns.append(self.features[column_name])
 757            else:
 758                raise ValueError(f"Column {column_name} not found in EventArray")
 759        return pd.concat(columns, axis=1)
 760
 761    def rows(self, rows: Sequence[Hashable]) -> Self:
 762        """
 763        Get a subset of the EventArray rows based on row labels or a boolean mask, by value.
 764        :param rows: row labels, indices, or boolean mask; anything for .loc[]
 765        :return: a new EventArray with the subset of events.
 766        """
 767        info = self.info.loc[rows].reset_index(drop=True)
 768        if self.metadata is not None:
 769            metadata = self.metadata.loc[rows].reset_index(drop=True)
 770        else:
 771            metadata = None
 772        if self.features is not None:
 773            features = self.features.loc[rows].reset_index(drop=True)
 774        else:
 775            features = None
 776        return EventArray(info, metadata, features)
 777
 778    def copy(self) -> Self:
 779        """
 780        Create a deep copy of the EventArray.
 781        :return: a deep copy of the EventArray.
 782        """
 783        return EventArray(
 784            info=self.info.copy(),
 785            metadata=None if self.metadata is None else self.metadata.copy(),
 786            features=None if self.features is None else self.features.copy(),
 787        )
 788
 789    # TODO: add a "filter" convenience function that takes a column name and values to filter by
 790
 791    def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None:
 792        """
 793        Add metadata to the EventArray. Removes the need to check if metadata is None.
 794        Overwrites any existing metadata with the same column names as the new metadata.
 795        :param new_metadata: the metadata to add.
 796        """
 797        if len(self) != len(new_metadata):
 798            raise ValueError("New metadata must match length of existing info")
 799
 800        if self.metadata is None:
 801            self.metadata = pd.DataFrame(new_metadata)
 802        else:
 803            if isinstance(new_metadata, pd.Series):
 804                self.metadata[new_metadata.name] = new_metadata
 805            else:
 806                # It's a DataFrame
 807                self.metadata[new_metadata.columns] = new_metadata
 808
 809    def add_features(self, new_features: pd.Series | pd.DataFrame) -> None:
 810        """
 811        Add features to the EventArray. Removes the need to check if features is None.
 812        Overwrites any existing features with the same column names as the new features.
 813        :param new_features: the features to add.
 814        """
 815        if len(self) != len(new_features):
 816            raise ValueError("New features must match length of existing info")
 817
 818        if self.features is None:
 819            self.features = pd.DataFrame(new_features)
 820        else:
 821            if isinstance(new_features, pd.Series):
 822                self.features[new_features.name] = new_features
 823            else:
 824                # It's a DataFrame
 825                self.features[new_features.columns] = new_features
 826
 827    @classmethod
 828    def merge(cls, events: Iterable[Self]) -> Self:
 829        """
 830        Combine EventArrays in a list into a single EventArray.
 831        :param events: the EventArrays to merge.
 832        """
 833        all_info = []
 834        all_metadata = []
 835        all_features = []
 836        for event_array in events:
 837            # Skip empty EventArrays
 838            if event_array.info is not None:
 839                all_info.append(event_array.info)
 840            if event_array.metadata is not None:
 841                all_metadata.append(event_array.metadata)
 842            if event_array.features is not None:
 843                all_features.append(event_array.features)
 844        if len(all_info) == 0:
 845            return EventArray()
 846        else:
 847            all_info = pd.concat(all_info, ignore_index=True)
 848        if len(all_metadata) == 0:
 849            all_metadata = None
 850        else:
 851            all_metadata = pd.concat(all_metadata, ignore_index=True)
 852        if len(all_features) == 0:
 853            all_features = None
 854        else:
 855            all_features = pd.concat(all_features, ignore_index=True)
 856
 857        return EventArray(all_info, all_metadata, all_features)
 858
 859    def to_events(
 860        self,
 861        scans: Scan | Iterable[Scan],
 862        ignore_missing_scans=True,
 863        ignore_metadata=False,
 864        ignore_features=False,
 865    ) -> list[Event]:
 866        """
 867        Get the events in the EventArray as a list of events.
 868        :param scans: the scans that the events belong to, auto-matched by slide_id.
 869        Pass None if you don't care about scan metadata (leave ignore_missing_scans=True).
 870        :param ignore_missing_scans: whether to create placeholder scans for events whose scan is missing.
 871        :param ignore_metadata: whether to ignore metadata or not
 872        :param ignore_features: whether to ignore features or not
 873        :return: a list of Event objects.
 874        """
 875        if isinstance(scans, Scan):
 876            scans = [scans]
 877        scans = {scan.slide_id: scan for scan in (scans or [])}  # handles scans=None
 878        events = []
 879        for i in range(len(self.info)):
 880            # Determine the associated scan
 881            slide_id = self.info["slide_id"][i]
 882            if slide_id not in scans:
 883                if ignore_missing_scans:
 884                    # Create a placeholder scan if the scan is missing
 885                    scan = Scan.make_placeholder(
 886                        slide_id,
 887                        self.info["tile"][i],
 888                        self.info["roi"][i],
 889                    )
 890                else:
 891                    raise ValueError(
 892                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
 893                    )
 894            else:
 895                scan = scans[slide_id]
 896
 897            # Prepare the metadata and features
 898            if ignore_metadata or self.metadata is None:
 899                metadata = None
 900            else:
 901                # This Series creation method is less efficient,
 902                # but required for preserving dtypes
 903                metadata = pd.Series(
 904                    {col: self.metadata.loc[i, col] for col in self.metadata.columns},
 905                    dtype=object,
 906                )
 907            if ignore_features or self.features is None:
 908                features = None
 909            else:
 910                features = pd.Series(
 911                    {col: self.features.loc[i, col] for col in self.features.columns},
 912                    dtype=object,
 913                )
 914            # Create the event and append it to the list
 915            events.append(
 916                Event(
 917                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
 918                    self.info["x"][i],
 919                    self.info["y"][i],
 920                    metadata=metadata,
 921                    features=features,
 922                )
 923            )
 924        return events
 925
 926    @classmethod
 927    def from_events(cls, events: Iterable[Event]) -> Self:
 928        """
 929        Create an EventArray from a list of Event objects.
 930        :param events: the events to convert into an EventArray.
 931        """
 932        info = pd.DataFrame(
 933            {
 934                "slide_id": [event.tile.scan.slide_id for event in events],
 935                "tile": [event.tile.n for event in events],
 936                "roi": [event.tile.n_roi for event in events],
 937                "x": [event.x for event in events],
 938                "y": [event.y for event in events],
 939            }
 940        )
 941        metadata_list = [event.metadata for event in events]
 942        # Iterate through and ensure that all metadata is the same shape
 943        for metadata in metadata_list:
 944            if type(metadata) != type(metadata_list[0]):
 945                raise ValueError("All metadata must be the same type.")
 946            if metadata is not None and metadata.shape != metadata_list[0].shape:
 947                raise ValueError("All metadata must be the same shape.")
 948        if metadata_list[0] is None:
 949            metadata = None
 950        else:
 951            metadata = pd.DataFrame(metadata_list)
 952        features_list = [event.features for event in events]
 953        # Iterate through and ensure that all features are the same shape
 954        for features in features_list:
 955            if type(features) != type(features_list[0]):
 956                raise ValueError("All features must be the same type.")
 957            if features is not None and features.shape != features_list[0].shape:
 958                raise ValueError("All features must be the same shape.")
 959        if features_list[0] is None:
 960            features = None
 961        else:
 962            features = pd.DataFrame(features_list)
 963        return EventArray(info=info, metadata=metadata, features=features)
 964
 965    def to_dataframe(self) -> pd.DataFrame:
 966        """
 967        Convert all the data in the EventArray to a single DataFrame.
 968        :return: a DataFrame with all the data in the EventArray.
 969        """
 970        # Make a copy of the info DataFrame and prepend "info_" to the column names
 971        output = self.info.copy()
 972        # Combine with the metadata and prepend "metadata_" to the column names
 973        if self.metadata is not None:
 974            metadata = self.metadata.copy()
 975            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
 976            output = pd.concat([output, metadata], axis=1)
 977        # Combine with the features and prepend "features_" to the column names
 978        if self.features is not None:
 979            features = self.features.copy()
 980            features.columns = [f"features_{col}" for col in features.columns]
 981            output = pd.concat([output, features], axis=1)
 982        return output
 983
 984    @classmethod
 985    def from_dataframe(
 986        cls, df, metadata_prefix: str = "metadata_", features_prefix: str = "features_"
 987    ) -> Self:
 988        """
 989        Create an EventArray from a single DataFrame in the to_dataframe() format.
 990        :param df: the DataFrame to convert to an EventArray.
 991        :param metadata_prefix: the prefix for metadata columns.
 992        :param features_prefix: the prefix for features columns.
 993        :return: the reconstructed EventArray.
 994        """
 995        # Split the columns into info, metadata, and features and strip prefix
 996        info = df[[col for col in df.columns if col in cls.INFO_COLUMNS]].copy()
 997        if info.size == 0:
 998            info = None
 999        metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy()
1000        metadata.columns = [
1001            col.replace(metadata_prefix, "") for col in metadata.columns
1002        ]
1003        if metadata.size == 0:
1004            metadata = None
1005        features = df[[col for col in df.columns if col.startswith("features_")]].copy()
1006        features.columns = [
1007            col.replace(features_prefix, "") for col in features.columns
1008        ]
1009        if features.size == 0:
1010            features = None
1011        return cls(info=info, metadata=metadata, features=features)
1012
1013    @classmethod
1014    def from_mask(
1015        cls,
1016        mask: np.ndarray,
1017        slide_id: str,
1018        tile_n: int,
1019        n_roi: int = 0,
1020        include_cell_id: bool = True,
1021        images: list[np.ndarray] = None,
1022        image_labels: list[str] = None,
1023        properties: list[str] = None,
1024    ) -> Self:
1025        """
1026        Extract events from a labeled mask image, including metadata and features.
1027        :param mask: the mask to extract events from.
1028        :param slide_id: the slide ID the mask is from.
1029        :param tile_n: the tile number the mask is from.
1030        :param n_roi: the ROI number the mask is from.
1031        :param include_cell_id: whether to include the cell_id, or numerical
1032        mask label, as metadata in the EventArray.
1033        :param images: the intensity images to extract features from.
1034        :param image_labels: the labels for the intensity images.
1035        :param properties: list of properties to extract in addition to the defaults.
1036        :return: EventArray corresponding to the mask labels.
1037        """
1038        if csi_images is None:
1039            raise ModuleNotFoundError(
1040                "imageio libraries not installed! "
1041                "run `pip install csi_images[imageio]` to resolve."
1042            )
1043        # Gather mask_info
1044        if images is not None and image_labels is not None:
1045            if len(images) != len(image_labels):
1046                raise ValueError("Intensity images and labels must match lengths.")
1047
1048        mask_info = csi_images.extract_mask_info(mask, images, image_labels, properties)
1049
1050        if len(mask_info) == 0:
1051            return EventArray()
1052
1053        # Combine provided info and mask info
1054        info = pd.DataFrame(
1055            {
1056                "slide_id": slide_id,
1057                "tile": tile_n,
1058                "roi": n_roi,
1059                "x": mask_info["x"],
1060                "y": mask_info["y"],
1061            },
1062        )
1063        # Extract a metadata column if desired
1064        if include_cell_id:
1065            metadata = pd.DataFrame({"cell_id": mask_info["id"]})
1066        else:
1067            metadata = None
1068        # If any additional properties were extracted, add them as features
1069        mask_info = mask_info.drop(columns=["id", "x", "y"], errors="ignore")
1070        if len(mask_info.columns) > 0:
1071            features = mask_info
1072        else:
1073            features = None
1074        return EventArray(info, metadata, features)
1075
1076    def save_csv(self, output_path: str) -> bool:
1077        """
1078        Save the events to a CSV file, including metadata and features.
1079        :param output_path: the file path to save to; ".csv" is appended if missing.
1080        :return: whether the file exists after saving.
1081        """
1082        if not output_path.endswith(".csv"):
1083            output_path += ".csv"
1084        self.to_dataframe().to_csv(output_path, index=False)
1085        return os.path.exists(output_path)
1086
1087    @classmethod
1088    def load_csv(
1089        cls,
1090        input_path: str,
1091        metadata_prefix: str = "metadata_",
1092        features_prefix: str = "features_",
1093    ) -> Self:
1094        """
1095        Load the events from a CSV file, including metadata and features.
1096        :param input_path: the CSV file to load.
1097        :param metadata_prefix: the prefix for metadata columns.
1098        :param features_prefix: the prefix for features columns.
1099        :return: the loaded EventArray.
1100        """
1101        # Load the CSV file
1102        df = pd.read_csv(input_path)
1103        return cls.from_dataframe(df, metadata_prefix, features_prefix)
1104
1105    def save_hdf5(self, output_path: str) -> bool:
1106        """
1107        Save the events to an HDF5 file, including metadata and features.
1108        Uses the pandas-provided HDF5 functions for ease and external compatibility,
1109        though these files are slightly harder to view in HDFView or similar.
1110        :param output_path: the file path to save to; ".hdf5" is appended if needed.
1111        :return: whether the file exists after saving.
1112        """
1113        if not output_path.endswith(".hdf5") and not output_path.endswith(".h5"):
1114            output_path += ".hdf5"
1115        # Open the output_path as an HDF5 file
1116        with pd.HDFStore(output_path) as store:
1117            # Store the dataframes in the HDF5 file
1118            if self.info is not None:
1119                store.put("info", self.info, index=False)
1120            if self.metadata is not None:
1121                store.put("metadata", self.metadata, index=False)
1122            if self.features is not None:
1123                store.put("features", self.features, index=False)
1124        return os.path.exists(output_path)
1125
1126    @classmethod
1127    def load_hdf5(cls, input_path: str) -> Self:
1128        """
1129        Load the events from an HDF5 file, including metadata and features.
1130        :param input_path: the HDF5 file to load.
1131        :return: the loaded EventArray.
1132        """
1133        # Open the input_path as an HDF5 file
1134        with pd.HDFStore(input_path, "r") as store:
1135            # Load the dataframes from the HDF5 file
1136            info = store.get("info") if "info" in store else None
1137            metadata = store.get("metadata") if "metadata" in store else None
1138            features = store.get("features") if "features" in store else None
1139        return cls(info=info, metadata=metadata, features=features)
1140
1141    def save_ocular(self, output_path: str, event_type: str = "cells"):
1142        """
1143        Save the events to OCULAR files. Relies on the dataframe originating
1144        from an OCULAR file (same columns; duplicate metadata/info).
1145        :param output_path: the folder to save the OCULAR files in.
1146        :param event_type: "cells" or "others".
1147        :return: None
1148        """
1149        if pyreadr is None:
1150            raise ModuleNotFoundError(
1151                "pyreadr not installed! Install pyreadr directly "
1152                "or run `pip install csi-images[rds]` option to resolve."
1153            )
1154        if event_type == "cells":
1155            file_stub = "rc-final"
1156        elif event_type == "others":
1157            file_stub = "others-final"
1158        else:
1159            raise ValueError("Invalid event type. Must be cells or others.")
1160
1161        # Ensure good metadata
1162        metadata = pd.DataFrame(
1163            {
1164                "slide_id": self.info["slide_id"],
1165                "frame_id": self.info["tile"],
1166                "cell_id": (
1167                    self.metadata["cell_id"]
1168                    if "cell_id" in self.metadata.columns
1169                    else range(len(self.info))
1170                ),
1171                "cellx": self.info["x"],
1172                "celly": self.info["y"],
1173            }
1174        )
1175        if self.metadata is not None:
1176            metadata[self.metadata.columns] = self.metadata.copy()
1177
1178        # Check for the "ocular_interesting" column
1179        if event_type == "cells":
1180            if "ocular_interesting" in metadata.columns:
1181                interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool)
1182            elif "hcpc" in metadata.columns:
1183                # Interesting cells don't get an hcpc designation, leaving them as -1
1184                interesting_rows = (
1185                    metadata["hcpc"].to_numpy() == -1
1186                )  # interesting cells
1187            else:
1188                interesting_rows = []
1189            if sum(interesting_rows) > 0:
1190                # Split the metadata into interesting and regular
1191                interesting_events = self.rows(interesting_rows)
1192                interesting_df = pd.concat(
1193                    [interesting_events.features, interesting_events.metadata], axis=1
1194                )
1195                data_events = self.rows(~interesting_rows)
1196                data_df = pd.concat(
1197                    [data_events.features, data_events.metadata], axis=1
1198                )
1199                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")
1200
1201                # Drop particular columns for "interesting"
1202                interesting_df = interesting_df.drop(
1203                    [
1204                        "clust",
1205                        "hcpc",
1206                        "frame_id",
1207                        "cell_id",
1208                        "unique_id",
1209                        "ocular_interesting",
1210                    ],
1211                    axis=1,
1212                    errors="ignore",
1213                )
1214                # Save both .csv and .rds
1215                interesting_stub = os.path.join(output_path, "ocular_interesting")
1216                interesting_df.to_csv(f"{interesting_stub}.csv")
1217                # Suppress pandas FutureWarning
1218                with warnings.catch_warnings():
1219                    warnings.simplefilter(action="ignore", category=FutureWarning)
1220                    pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df)
1221            else:
1222                data_df = pd.concat([self.features, metadata], axis=1)
1223        else:
1224            # Get all data and reset_index (will copy it)
1225            data_df = pd.concat([self.features, metadata], axis=1)
1226
1227        # Split based on cluster number to conform to *-final[1-4].rds
1228        n_clusters = max(data_df["clust"]) + 1
1229        split_idx = [round(i * n_clusters / 4) for i in range(5)]
1230        for i in range(4):
1231            subset = (split_idx[i] <= data_df["clust"]) & (
1232                data_df["clust"] < split_idx[i + 1]
1233            )
1234            data_df.loc[subset, "hcpc"] = i + 1
1235            subset = data_df[subset].reset_index(drop=True)
1236            # Suppress pandas FutureWarning
1237            with warnings.catch_warnings():
1238                warnings.simplefilter(action="ignore", category=FutureWarning)
1239                pyreadr.write_rds(
1240                    os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
1241                )
1242
1243        # Create new example cell strings
1244        data_df["example_cell_id"] = (
1245            data_df["slide_id"]
1246            + " "
1247            + data_df["frame_id"].astype(str)
1248            + " "
1249            + data_df["cell_id"].astype(str)
1250            + " "
1251            + data_df["cellx"].astype(int).astype(str)
1252            + " "
1253            + data_df["celly"].astype(int).astype(str)
1254        )
1255        # Find averagable data columns
1256        if "cellcluster_id" in data_df.columns:
1257            end_idx = data_df.columns.get_loc("cellcluster_id")
1258        else:
1259            end_idx = data_df.columns.get_loc("slide_id")
1260        avg_cols = data_df.columns[:end_idx].tolist()
1261        # Group by cluster and average
1262        data_df = data_df.groupby("clust").agg(
1263            **{col: (col, "mean") for col in avg_cols},
1264            count=("clust", "size"),  # count rows in each cluster
1265            example_cells=("example_cell_id", lambda x: ",".join(x)),
1266            hcpc=("hcpc", lambda x: x.iloc[0]),
1267        )
1268        data_df = data_df.reset_index()  # Do NOT drop, index is "clust"
1269        # Create new columns
1270        metadata = pd.DataFrame(
1271            {
1272                "count": data_df["count"],
1273                "example_cells": data_df["example_cells"],
1274                "clust": data_df["clust"].astype(int),
1275                "hcpc": data_df["hcpc"].astype(int),
1276                "id": data_df["clust"].astype(int).astype(str),
1277                "cccluster": "0",  # Dummy value
1278                "ccdistance": 0.0,  # Dummy value
1279                "rownum": list(range(len(data_df))),
1280                "framegroup": 0,  # Dummy value
1281            }
1282        )
1283        # Need to pad the features to 761 columns, as per OCULAR report needs
1284        additional_columns = range(len(avg_cols), 761)
1285        if len(additional_columns) > 0:
1286            padding = pd.DataFrame(
1287                np.zeros((len(data_df), len(additional_columns))),
1288                columns=[f"pad{i}" for i in additional_columns],
1289            )
1290            data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1)
1291        else:
1292            data_df = pd.concat([data_df[avg_cols], metadata], axis=1)
1293
1294        # Save the cluster data
1295        data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"))
1296        # Suppress pandas FutureWarning
1297        with warnings.catch_warnings():
1298            warnings.simplefilter(action="ignore", category=FutureWarning)
1299            pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)
1300
1301    @classmethod
1302    def load_ocular(
1303        cls,
1304        input_path: str,
1305        event_type="cells",
1306        cell_data_files=(
1307            "rc-final1.rds",
1308            "rc-final2.rds",
1309            "rc-final3.rds",
1310            "rc-final4.rds",
1311            "ocular_interesting.rds",
1312        ),
1313        others_data_files=(
1314            "others-final1.rds",
1315            "others-final2.rds",
1316            "others-final3.rds",
1317            "others-final4.rds",
1318        ),
1319        atlas_data_files=(
1320            "ocular_interesting.rds",
1321            "ocular_not_interesting.rds",
1322        ),
1323        drop_common_events=True,
1324    ) -> Self:
1325        """
1326        Load events from OCULAR output files into an EventArray.
1327        :param input_path: the OCULAR output folder, or a single .rds file to load.
1328        :param event_type: "cells" (rc-final*) or "others" (others-final*).
1329        :param cell_data_files: file names to load when event_type is "cells".
1330        :param others_data_files: file names to load when event_type is "others".
1331        :param atlas_data_files: atlas files eligible for common-event filtering.
1332        :param drop_common_events: whether to drop events classified as common cells.
1333        :return: an EventArray with the loaded events.
1334        """
1335        if pyreadr is None:
1336            raise ModuleNotFoundError(
1337                "pyreadr not installed! Install pyreadr directly "
1338                "or run `pip install csi-images[rds]` to resolve."
1339            )
1340        # Check if the input path is a directory or a file
1341        if os.path.isfile(input_path):
1342            data_files = [os.path.basename(input_path)]
1343            input_path = os.path.dirname(input_path)
1344        elif event_type == "cells":
1345            data_files = cell_data_files
1346        elif event_type == "others":
1347            data_files = others_data_files
1348        else:
1349            raise ValueError("Invalid event type.")
1350
1351        # Load the data from the OCULAR files
1352        file_data = {}
1353        for file in data_files:
1354            file_path = os.path.join(input_path, file)
1355            if not os.path.isfile(file_path):
1356                warnings.warn(f"{file} not found in {input_path}")
1357                continue
1358            file_data[file] = pyreadr.read_r(file_path)
1359            # Get the DataFrame associated with None (pyreadr dict quirk)
1360            file_data[file] = file_data[file][None]
1361            if len(file_data[file]) == 0:
1362                # File gets dropped from the dict
1363                file_data.pop(file)
1364                warnings.warn(f"{file} has no cells")
1365                continue
1366
1367            # Drop common cells if requested and in this file
1368            if (
1369                file in atlas_data_files
1370                and drop_common_events
1371                and "catalogue_classification" in file_data[file]
1372            ):
1373                common_cell_indices = (
1374                    file_data[file]["catalogue_classification"] == "common_cell"
1375                )
1376                file_data[file] = file_data[file][~common_cell_indices]
1377
1378            if len(file_data[file]) == 0:
1379                # File gets dropped from the dict
1380                file_data.pop(file)
1381                warnings.warn(f"{file} has no cells after dropping common cells")
1382                continue
1383
1384            # Extract frame_id and cell_id
1385            # DAPI- events already have frame_id cell_id outside rowname
1386            if event_type == "cells" and "frame_id" not in file_data[file].columns:
1387                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
1388                # get frame_id cell_id from rownames column and split into two columns
1389                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
1390                if len(split_res.columns) != 2:
1391                    warnings.warn(
1392                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
1393                    )
1394                # then assign it back to the dataframe
1395                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
1396            # Ensure frame_id and cell_id are integers
1397            file_data[file]["frame_id"] = file_data[file]["frame_id"].astype("int")
1398            file_data[file]["cell_id"] = file_data[file]["cell_id"].astype("int")
1399            # reset indexes since they can cause NaN values in concat
1400            file_data[file] = file_data[file].reset_index(drop=True)
1401
1402        # Merge the data from all files
1403        if len(file_data) == 0:
1404            return EventArray()
1405        elif len(file_data) == 1:
1406            data = [file_data[file] for file in file_data.keys()][0]
1407        else:
1408            data = pd.concat(file_data.values())
1409
1410        # Others is missing the "slide_id". Insert it right before "frame_id" column
1411        if event_type == "others" and "slide_id" not in data.columns:
1412            if os.path.basename(input_path) == "ocular":
1413                slide_id = os.path.basename(os.path.dirname(input_path))
1414            else:
1415                slide_id = "UNKNOWN"
1416            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)
1417
1418        # Sort according to ascending cell_id to keep the original, which is in manual_df
1419        data = data.sort_values(by=["cell_id"], ascending=True)
1420        # Filter out duplicates by x & y
1421        data = data.assign(
1422            unique_id=data["slide_id"]
1423            + "_"
1424            + data["frame_id"].astype(str)
1425            + "_"
1426            + data["cellx"].astype(int).astype(str)
1427            + "_"
1428            + data["celly"].astype(int).astype(str)
1429        )
1430        data = data.drop_duplicates(subset=["unique_id"], keep="first")
1431        # Normal unique_id is with cell_id
1432        data = data.assign(
1433            unique_id=data["slide_id"]
1434            + "_"
1435            + data["frame_id"].astype(str)
1436            + "_"
1437            + data["cell_id"].astype(str)
1438        )
1439        data = data.reset_index(drop=True)
1440        # All columns up to "slide_id" are features; drop the "slide_id"
1441        features = data.loc[:, :"slide_id"].iloc[:, :-1]
1442        data = data.loc[:, "slide_id":]
1443        # Grab the info columns
1444        info = data[["slide_id", "frame_id", "cellx", "celly"]]
1445        info.columns = ["slide_id", "tile", "x", "y"]
1446        info = info.assign(roi=0)  # OCULAR only works on 1 ROI, as far as we know
1447        info = info[["slide_id", "tile", "roi", "x", "y"]]
1448        # Metadata has duplicate columns for later convenience
1449        metadata = data
1450        # Certain columns tend to be problematic with mixed data formats...
1451        for col in ["TRITC", "CY5", "FITC"]:
1452            if col in metadata:
1453                labels = {
1454                    "False": False,
1455                    "True": True,
1456                    "FALSE": False,
1457                    "TRUE": True,
1458                    False: False,
1459                    True: True,
1460                }
1461                metadata[col] = metadata[col].map(labels).astype(bool)
1462        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
1463            if col in metadata:
1464                metadata[col] = metadata[col].fillna(-1).astype(int)
1465        return EventArray(info, metadata, features)
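
A usage sketch for the OCULAR helpers above. The paths are hypothetical, and a round trip assumes the columns originated from OCULAR output, as save_ocular requires:

    from csi_images.csi_events import EventArray

    # Load an OCULAR output folder (or point directly at a single .rds file)
    events = EventArray.load_ocular("/path/to/slide/ocular", event_type="cells")
    # Write the events back out in the rc-final*/others-final* layout
    events.save_ocular("/path/to/output", event_type="cells")
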
class Event:
 40class Event:
 41    """
 42    A class that represents a single event in a scan, making it easy to evaluate
 43    singular events. Required metadata is exposed as attributes, and optional
 44    metadata and features are stored as DataFrames.
 45    """
 46
 47    SCAN_TO_SLIDE_TRANSFORM = {
 48        # Axioscan zero is in the top-right corner instead of top-left
 49        Scan.Type.AXIOSCAN7: np.array(
 50            [
 51                [1, 0, 75000],
 52                [0, 1, 0],
 53                [0, 0, 1],
 54            ]
 55        ),
 56        # BZScanner coordinates are a special kind of messed up:
 57        # - The slide is upside-down.
 58        # - The slide is oriented vertically, with the barcode at the bottom.
 59        # - Tiles are numbered from the top-right
 60        Scan.Type.BZSCANNER: np.array(
 61            [
 62                [0, -1, 75000],
 63                [-1, 0, 25000],
 64                [0, 0, 1],
 65            ]
 66        ),
 67    }
 68    """
 69    Homogeneous transformation matrices for converting between scanner and slide
 70    coordinates. The matrices are 3x3, with the final column representing the
 71    translation in micrometers (um). For more information, see 
 72    [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations).
 73    
 74    Transformations are nominal, and accuracy is not guaranteed; this is due to 
 75    imperfections in slides and alignment in the scanners. Units are in micrometers.
 76    """
 77
 78    def __init__(
 79        self,
 80        tile: Tile,
 81        x: int,
 82        y: int,
 83        metadata: pd.Series = None,
 84        features: pd.Series = None,
 85    ):
 86        self.tile = tile
 87        self.x = int(x)
 88        self.y = int(y)
 89        self.metadata = metadata
 90        self.features = features
 91
 92    def __repr__(self) -> str:
 93        return f"{self.tile}-{self.x}-{self.y}"
 94
 95    def __eq__(self, other) -> bool:
 96        return self.__repr__() == other.__repr__()
 97
 98    def __lt__(self, other):
 99        return self.__repr__() < other.__repr__()
100
101    def get_scan_position(self) -> tuple[float, float]:
102        """
103        Get the position of the event in the scanner's coordinate frame.
104        :return: the scan position of the event in micrometers (um).
105        """
106        # Get overall pixel position
107        real_tile_height, real_tile_width = self.tile.scan.get_image_size()
108        pixel_x = self.x + (real_tile_width * self.tile.x)
109        pixel_y = self.y + (real_tile_height * self.tile.y)
110        # Convert to micrometers
111        x_um = pixel_x * self.tile.scan.pixel_size_um
112        y_um = pixel_y * self.tile.scan.pixel_size_um
113        # Add the scan's origin in the scanner frame
114        x_um += self.tile.scan.roi[self.tile.n_roi].origin_x_um
115        y_um += self.tile.scan.roi[self.tile.n_roi].origin_y_um
116        return x_um, y_um
117
118    def get_slide_position(self) -> tuple[float, float]:
119        """
120        Get the slide position of the event in micrometers (um).
121        :return: the slide position of the event.
122        """
123        # Turn scan_position into a 3x1 vector
124        scan_position = self.get_scan_position()
125        scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]])
126
127        # Multiply by the appropriate homogeneous matrix
128        if self.tile.scan.scanner_id.startswith(self.tile.scan.Type.AXIOSCAN7.value):
129            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.tile.scan.Type.AXIOSCAN7]
130        elif self.tile.scan.scanner_id.startswith(self.tile.scan.Type.BZSCANNER.value):
131            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.tile.scan.Type.BZSCANNER]
132        else:
133            raise ValueError(f"Scanner type {self.tile.scan.scanner_id} not supported.")
134        slide_position = np.matmul(transform, scan_position)
135        return float(slide_position[0][0]), float(slide_position[1][0])
136
137    def crop(
138        self, images: Iterable[np.ndarray], crop_size: int = 100, in_pixels: bool = True
139    ) -> list[np.ndarray]:
140        """
141        Crop the event from the provided frame images. Use if you have already gotten
142        frame images; useful for cropping multiple events from the same frame image.
143        :param images: the frame images.
144        :param crop_size: the square size of the image crop to get for this event.
145        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
146        :return: crop_size x crop_size crops of the event in the provided frames. If
147        the event is too close to the edge, the crop is padded with black and not centered.
148        """
149        # Convert a crop size in micrometers to pixels
150        if not in_pixels:
151            crop_size = round(crop_size / self.tile.scan.pixel_size_um)
152        image_height, image_width = 0, 0
153        for image in images:
154            if image_height == 0 and image_width == 0:
155                image_height, image_width = image.shape
156            else:
157                if image_height != image.shape[0] or image_width != image.shape[1]:
158                    raise ValueError("All images must be the same size")
159        if image_height == 0 or image_width == 0:
160            raise ValueError("No images provided")
161
162        # Find the crop bounds
163        bounds = [
164            self.x - (crop_size // 2) + 1,
165            self.y - (crop_size // 2) + 1,
166            self.x + math.ceil(crop_size / 2) + 1,
167            self.y + math.ceil(crop_size / 2) + 1,
168        ]
169        # Determine how much the bounds violate the image size
170        displacements = [
171            max(0, -bounds[0]),
172            max(0, -bounds[1]),
173            max(0, bounds[2] - image_width),
174            max(0, bounds[3] - image_height),
175        ]
176        # Cap off the bounds
177        bounds = [
178            max(0, bounds[0]),
179            max(0, bounds[1]),
180            min(image_width, bounds[2]),
181            min(image_height, bounds[3]),
182        ]
183
184        # Crop the images
185        crops = []
186        for image in images:
187            # Create a blank image of the right size
188            crop = np.zeros((crop_size, crop_size), dtype=image.dtype)
189
190            # Insert the cropped image into the blank image, leaving a black buffer
191            # around the edges if the crop would go beyond the original image bounds
192            crop[
193                displacements[1] : crop_size - displacements[3],
194                displacements[0] : crop_size - displacements[2],
195            ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]]
196            crops.append(crop)
197        return crops
198
199    def get_crops(
200        self,
201        crop_size: int = 100,
202        in_pixels: bool = True,
203        input_path: str = None,
204        channels: Iterable[int | str] = None,
205        apply_gain: bool | Iterable[bool] = True,
206    ) -> list[np.ndarray]:
207        """
208        Gets the frame images for this event and then crops the event from the images.
209        Convenient for retrieving a single event's crops, but less efficient when
210        retrieving multiple events from the same tile as it will reread the images.
211        :param crop_size: the square size of the image crop to get for this event.
212        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
213        :param input_path: the path to the input images. Defaults to None (uses the scan's path).
214        :param channels: the channels to extract images for. Defaults to all channels.
215        :param apply_gain: whether to apply scanner-calculated gain to the images, if
216        not already applied. If a list, matches the channels.
217        :return: a list of cropped images from the scan in the order of the channels.
218        """
219        # This function validates channels
220        frames = Frame.get_frames(self.tile, channels)
221        # Convert individual inputs to lists of appropriate length
222        if isinstance(apply_gain, bool):
223            apply_gain = [apply_gain] * len(frames)
224        images = [f.get_image(input_path, a) for f, a in zip(frames, apply_gain)]
225        return self.crop(images, crop_size, in_pixels)
226
227    def save_crops(
228        self,
229        crops: Sequence[np.ndarray],
230        output_path: str,
231        labels: Sequence[str],
232        ext: str = "auto",
233    ):
234        """
235        Save the crops to image files.
236        :param crops: the crops to save. Will save as RGB if 3 channel [h, w, 3] or
237        grayscale if 1 channel [h, w] or [h, w, 1].
238        :param labels: the labels to append to the file name, usually the channel names
239        associated with each crop.
240        :param output_path: the folder to save the crops to. Will make if needed.
241        :param ext: the file extension to save the crops as. Defaults to "auto", which
242        will save as .tif for grayscale images and .jpg for RGB images.
243        :return: None
244        """
245        if len(crops) != len(labels):
246            raise ValueError("Crops and labels must be the same length")
247
248        if csi_images is None or imageio is None:
249            raise ModuleNotFoundError(
250                "imageio libraries not installed! "
251                "run `pip install csi_images[imageio]` to resolve."
252            )
253
254        os.makedirs(output_path, exist_ok=True)
255
256        for crop, label in zip(crops, labels):
257            if ext == "auto":
258                if len(crop.shape) == 2 or crop.shape[2] == 1:
259                    file_extension = ".tif"
260                elif crop.shape[2] == 3:
261                    file_extension = ".jpg"
262                else:
263                    warnings.warn(
264                        f"Image shape {crop.shape} not recognized; saving as .tif"
265                    )
266                    file_extension = ".tif"
267            else:
268                file_extension = ext
269            file = os.path.join(output_path, f"{self}-{label}{file_extension}")
270            # TODO: add more file types here
271            if file_extension == ".tif":
272                imageio.imwrite(file, crop, compression="deflate")
273            elif file_extension in [".jpg", ".jpeg"]:
274                crop = csi_images.scale_bit_depth(crop, np.uint8)
275                imageio.imwrite(file, crop, quality=80)
276            else:
277                imageio.imwrite(file, crop)
278
279    def load_crops(
280        self, input_path: str, labels: list[str] = None
281    ) -> dict[str, np.ndarray]:
282        """
283        Loads previously saved crop files from a folder.
284        :param input_path: folder containing crop files.
285        :param labels: optional label filter, will only return crops with these labels.
286        :return: a dictionary mapping labels to crop images.
287        """
288        crops = {}
289        for file in glob.glob(os.path.join(input_path, f"{self}-*")):
290            label = os.path.splitext(os.path.basename(file))[0].split("-")[-1]
291            # Skip if we have labels to target
292            if labels is not None and label not in labels:
293                continue
294            crops[label] = imageio.imread(file)
295        return crops
296
297    def get_montage_channels(
298        self,
299        channels: Sequence[int | str] | None,
300        composites: dict[int | str, tuple[float, float, float]] | None,
301    ) -> tuple[list[int], list[int], dict[int, tuple[float, float, float]]]:
302        """
303        Get the channel indices for the montage from the event's tile.
304        :param channels: channel indices or names for grayscale channels
305        :param composites: dictionary of channel indices or names and RGB values
306        :return: (1) channel indices to retrieve,
307                 (2) relative grayscale channel indices, and
308                 (3) composite channel indices and RGB values.
309        """
310        if channels is None:
311            channels = list(range(len(self.tile.scan.channels)))
312        if (len(channels) == 0) and (composites is None or len(composites) == 0):
313            raise ValueError("Must provide at least one channel type to montage")
314
315        channels_to_get = []
316
317        # Build the list of channels to retrieve
318        if channels is not None:
319            if isinstance(channels[0], str):
320                channels = self.tile.scan.get_channel_indices(channels)
321            channels_to_get += channels
322            order = list(range(len(channels)))  # Always the first n channels
323        else:
324            order = None
325
326        if composites is not None:
327            relative_composites = {}  # Relative indices for retrieved channels
328            # Convert to scan indices
329            rgb_channels = list(composites.keys())
330            if isinstance(rgb_channels[0], str):
331                rgb_channels = self.tile.scan.get_channel_indices(rgb_channels)
332            # Find the index or add to the end
333            for channel, rgb in zip(rgb_channels, composites.values()):
334                if channel not in channels_to_get:
335                    channels_to_get.append(channel)
336                    relative_composites[channel] = rgb
337                else:
338                    relative_composites[channels_to_get.index(channel)] = rgb
339        else:
340            relative_composites = None
341
342        return channels_to_get, order, relative_composites
343
344    def get_montage(
345        self,
346        channels: Sequence[int | str] = None,
347        composites: dict[int | str, tuple[float, float, float]] = None,
348        crop_size: int = 100,
349        in_pixels: bool = True,
350        input_path: str = None,
351        apply_gain: bool = True,
352        **kwargs,
353    ) -> np.ndarray:
354        """
355        Convenience function for getting frame images and creating a montage. Mirrors
356        csi_images.make_montage(). Convenient for a single event's montage, but less
357        efficient when used for multiple events from the same tile.
358        :param channels: the channels to use for black-and-white montages.
359        :param composites: dictionary of indices and RGB tuples for a composite.
360        :param crop_size: the square size of the image crop to get for this event.
361        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
362        :param input_path: the path to the input images. Defaults to None (uses the scan's path).
363        :param apply_gain: whether to apply scanner-calculated gain to the images, if
364        not already applied. If a list, matches the channels.
365        :param kwargs: montage options. See csi_images.make_montage() for more details.
366        :return: numpy array representing the montage.
367        """
368        channels, order, composites = self.get_montage_channels(channels, composites)
369        images = self.get_crops(crop_size, in_pixels, input_path, channels, apply_gain)
370        return csi_images.make_montage(images, order, composites, **kwargs)
371
372    def save_montage(
373        self,
374        montage: np.ndarray,
375        output_path: str,
376        ocular_names: bool = False,
377        tag: str = "",
378    ):
379        """
380        Save the montage as a JPEG image with a set name.
381        :param montage: the montage to save.
382        :param output_path: the folder to save the montage in. Will make if needed.
383        :param ocular_names: whether to use the OCULAR naming convention.
384        :param tag: a tag to append to the file name. Ignored if ocular_names is True.
385        :return: None
386        """
387        if csi_images is None or imageio is None:
388            raise ModuleNotFoundError(
389                "imageio libraries not installed! "
390                "run `pip install csi_images[imageio]` to resolve."
391            )
392
393        montage = csi_images.scale_bit_depth(montage, np.uint8)
394
395        if ocular_names:
396            if "cell_id" not in self.metadata.index:
397                raise ValueError(
398                    "Event metadata must include 'cell_id' for OCULAR naming."
399                )
400            file = f"{self.tile.n}-{self.metadata['cell_id']}-{self.x}-{self.y}.jpeg"
401        else:
402            file = f"{self}{tag}.jpeg"
403
404        os.makedirs(output_path, exist_ok=True)
405        imageio.imwrite(os.path.join(output_path, file), montage, quality=80)
406
407    def load_montage(self, input_path: str, tag: str = "") -> np.ndarray:
408        """
409        Loads the montage from a file saved by Event.save_montage.
410        :param input_path: the path to the folder where the montage was saved.
411        :param tag: a string to add to the file name, before the extension.
412        :return: the montage image as a numpy array.
413        """
414        file = f"{self}{tag}.jpeg"
415        return imageio.imread(os.path.join(input_path, file))
416
417    @classmethod
418    def get_many_crops(
419        cls,
420        events: Sequence[Self],
421        crop_size: int | Sequence[int] = 100,
422        in_pixels: bool = True,
423        input_path: str | Sequence[str] = None,
424        channels: Sequence[int | str] = None,
425        apply_gain: bool | Sequence[bool] = True,
426    ) -> list[list[np.ndarray]]:
427        """
428        Get the crops for a list of events, ensuring that there is no wasteful reading
429        of the same tile multiple times. This function is more efficient than calling
430        get_crops() for each event.
431        :param events: the events to get crops for.
432        :param crop_size: the square size of the image crop to get for each event.
433                          Defaults to 100 pixels; a sequence applies per event.
434        :param in_pixels: whether the crop size is in pixels or micrometers.
435                          Defaults to pixels.
436        :param input_path: the path to the input images. Will only work for lists of events
437                           from the same scan. Defaults to None (uses the scan's path).
438        :param channels: the channels to extract images for. Defaults to all channels.
439        :param apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. Defaults to True.
440                           Can be supplied as a list to apply gain to individual channels.
441        :return: a list of lists of cropped images for each event.
442        """
443        if len(events) == 0:
444            return []
445        # Adapt singular inputs to lists of appropriate length
446        if isinstance(crop_size, int):
447            crop_size = [crop_size] * len(events)
448        if input_path is None or isinstance(input_path, str):
449            input_path = [input_path] * len(events)
450
451        # Get the order of the events when sorted by slide/tile
452        order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
453
454        # Allocate the list to size
455        crops = [[]] * len(events)
456        last_tile = None
457        images = None  # Holds large numpy arrays, so expensive to compare
458        # Iterate through in slide/tile sorted order
459        for i in order:
460            if last_tile != events[i].tile:
461                # Gather the frame images, preserving them for the next event
462                frames = Frame.get_frames(events[i].tile, channels)
463                if isinstance(apply_gain, bool):
464                    apply = [apply_gain] * len(frames)
465                else:
466                    apply = apply_gain
467                images = [f.get_image(input_path[i], a) for f, a in zip(frames, apply)]
468                last_tile = events[i].tile
469            # Use the frame images to crop the event images
470            crops[i] = events[i].crop(images, crop_size[i], in_pixels)
471        return crops
472
473    @classmethod
474    def get_many_montages(
475        cls,
476        events: Sequence[Self],
477        channels: Sequence[int | str] = None,
478        composites: dict[int | str, tuple[float, float, float]] = None,
479        crop_size: int = 100,
480        in_pixels: bool = True,
481        input_path: str = None,
482        apply_gain: bool | Iterable[bool] = True,
483        **kwargs,
484    ) -> list[np.ndarray]:
485        """
486        Convenience function for get_montage(), but for a list of events. More efficient
487        than get_montage() when working with multiple events from the same tile.
488        :param events: a list of Event objects.
489        :param channels: the channels to extract images for. Defaults to all channels.
490        :param composites: dictionary of indices and RGB tuples for a composite.
491        :param crop_size: the square size of the image crop to get for each event.
492        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
493        :param input_path: the path to the input images. Defaults to None (uses the scan's path).
494        :param apply_gain: whether to apply scanner-calculated gain to the images, if
495        not already applied. If a list, matches the channels.
496        :param kwargs: montage options. See csi_images.make_montage() for more details.
497        :return: a list of numpy arrays representing the montages.
498        """
499        if len(events) == 0:
500            return []
501        # Adapt singular inputs to lists of appropriate length
502        if isinstance(crop_size, int):
503            crop_size = [crop_size] * len(events)
504        if input_path is None or isinstance(input_path, str):
505            input_path = [input_path] * len(events)
506
507        # Get the order of the events when sorted by slide/tile
508        event_order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
509
510        # Allocate the list to size
511        montages = [np.empty(0)] * len(events)
512        # Placeholder variables to avoid rereading the same tile
513        images = None  # Holds large numpy arrays, so expensive to compare
514        order = None
515        rel_composites = None
516        last_tile = None
517        # Iterate through in slide/tile sorted order
518        for i in event_order:
519            if last_tile != events[i].tile:
520                channels_to_get, order, rel_composites = events[i].get_montage_channels(
521                    channels, composites
522                )
523                # Gather the frame images, preserving them for the next event
524                frames = Frame.get_frames(events[i].tile, channels_to_get)
525                if isinstance(apply_gain, bool):
526                    apply = [apply_gain] * len(frames)
527                else:
528                    apply = apply_gain
529                images = [f.get_image(input_path[i], a) for f, a in zip(frames, apply)]
530                last_tile = events[i].tile
531            # Use the frame images to crop the event images and make montages
532            crops = events[i].crop(images, crop_size[i], in_pixels)
533            montages[i] = csi_images.make_montage(
534                crops, order, rel_composites, **kwargs
535            )
536
537        return montages
538
539    @classmethod
540    def get_and_save_many_crops(
541        cls,
542        events: list[Self],
543        output_path: str,
544        labels: Sequence[str],
545        ext: str = "auto",
546        additional_gain: Sequence[float] = None,
547        **kwargs,
548    ) -> None:
549        """
550        Get and save the crops for a list of events, ensuring that there is no wasteful
551        reading and limiting the image data in memory to 1 tile at a time. This function
552        is more efficient than chaining get_crops() and save_crops() for each event or
553        get_many_crops() and then save_crops().
554        :param events: list of events to get, crop, and save.
555        :param output_path: the folder to save the crops in. Will make if needed.
556        :param labels: the labels to save the crops with. See save_crops().
557        :param ext: the file extension to save the crops as. See save_crops().
558        :param additional_gain: additional gain to apply to the crops. If not None, must
559        match the length of the number of crop channels.
560        :param kwargs: see get_many_crops() for more parameters.
561        :return:
562        """
563        unique_tiles = set([event.tile for event in events])
564
565        for tile in unique_tiles:
566            # Get one tile's worth of event crops
567            tile_events = [e for e in events if e.tile == tile]
568            crops_list = cls.get_many_crops(tile_events, **kwargs)
569            for event, crops in zip(tile_events, crops_list):
570                # Apply any additional gains
571                if additional_gain is not None:
572                    crops = [gain * crop for gain, crop in zip(additional_gain, crops)]
573                event.save_crops(crops, output_path, labels, ext)
574
575    @classmethod
576    def get_and_save_many_montages(
577        cls,
578        events: list[Self],
579        output_path: str,
580        ocular_names: bool = False,
581        tag: str = "",
582        **kwargs,
583    ) -> None:
584        """
585        Save montages of the events to image files.
586        :param events: the events to get, montage, and save.
587        :param output_path: the folder to save the montages to. Will make if needed.
588        :param ocular_names: whether to use the OCULAR naming convention.
589        :param tag: a tag to append to the file name. Ignored if ocular_names is True.
590        :param kwargs: see get_many_montages() for more parameters.
591        """
592        unique_tiles = set([event.tile for event in events])
593
594        for tile in unique_tiles:
595            # Get one tile's worth of event crops
596            tile_events = [e for e in events if e.tile == tile]
597            montages = cls.get_many_montages(tile_events, **kwargs)
598            for event, montage in zip(tile_events, montages):
599                event.save_montage(montage, output_path, ocular_names, tag)

A class that represents a single event in a scan, making it easy to evaluate singular events. Required metadata is exposed as attributes, and optional metadata and features are stored as DataFrames.

Event( tile: csi_images.csi_tiles.Tile, x: int, y: int, metadata: pandas.core.series.Series = None, features: pandas.core.series.Series = None)
78    def __init__(
79        self,
80        tile: Tile,
81        x: int,
82        y: int,
83        metadata: pd.Series = None,
84        features: pd.Series = None,
85    ):
86        self.tile = tile
87        self.x = int(x)
88        self.y = int(y)
89        self.metadata = metadata
90        self.features = features
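
A minimal construction sketch. It assumes `tile` is a csi_images.csi_tiles.Tile obtained elsewhere (see the csi_tiles documentation); the pixel coordinates and metadata values are illustrative:

    import pandas as pd
    from csi_images.csi_events import Event

    # `tile` is an existing csi_tiles.Tile for the scan of interest
    event = Event(tile, x=310, y=125)
    event.metadata = pd.Series({"cell_id": 7})
    print(event)  # the repr combines the tile with the x/y position
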
SCAN_TO_SLIDE_TRANSFORM = {<Type.AXIOSCAN7: 'axioscan7'>: array([[ 1, 0, 75000], [ 0, 1, 0], [ 0, 0, 1]]), <Type.BZSCANNER: 'bzscanner'>: array([[ 0, -1, 75000], [ -1, 0, 25000], [ 0, 0, 1]])}

Homogeneous transformation matrices for converting between scanner and slide coordinates. The matrices are 3x3, with the final column representing the translation in micrometers (um). For more information, see affine transformations.

Transformations are nominal, and accuracy is not guaranteed; this is due to imperfections in slides and alignment in the scanners. Units are in micrometers.
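
A small worked example of applying one of these matrices by hand; the scan-frame point is hypothetical, and get_slide_position() below performs this step for you:

    import numpy as np
    from csi_images.csi_events import Event
    from csi_images.csi_scans import Scan

    scan_point = np.array([[10000.0], [5000.0], [1.0]])  # x, y in um, homogeneous
    transform = Event.SCAN_TO_SLIDE_TRANSFORM[Scan.Type.AXIOSCAN7]
    slide_point = transform @ scan_point
    # With the Axioscan 7 matrix this yields x = 85000.0 um, y = 5000.0 um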

tile
x
y
metadata
features
def get_scan_position(self) -> tuple[float, float]:
101    def get_scan_position(self) -> tuple[float, float]:
102        """
103        Get the position of the event in the scanner's coordinate frame.
104        :return: the scan position of the event in micrometers (um).
105        """
106        # Get overall pixel position
107        real_tile_height, real_tile_width = self.tile.scan.get_image_size()
108        pixel_x = self.x + (real_tile_width * self.tile.x)
109        pixel_y = self.y + (real_tile_height * self.tile.y)
110        # Convert to micrometers
111        x_um = pixel_x * self.tile.scan.pixel_size_um
112        y_um = pixel_y * self.tile.scan.pixel_size_um
113        # Add the scan's origin in the scanner frame
114        x_um += self.tile.scan.roi[self.tile.n_roi].origin_x_um
115        y_um += self.tile.scan.roi[self.tile.n_roi].origin_y_um
116        return x_um, y_um

Get the position of the event in the scanner's coordinate frame.

Returns

the scan position of the event in micrometers (um).

def get_slide_position(self) -> tuple[float, float]:
118    def get_slide_position(self) -> tuple[float, float]:
119        """
120        Get the slide position of the event in micrometers (um).
121        :return: the slide position of the event.
122        """
123        # Turn scan_position into a 3x1 vector
124        scan_position = self.get_scan_position()
125        scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]])
126
127        # Multiply by the appropriate homogeneous matrix
128        if self.tile.scan.scanner_id.startswith(self.tile.scan.Type.AXIOSCAN7.value):
129            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.tile.scan.Type.AXIOSCAN7]
130        elif self.tile.scan.scanner_id.startswith(self.tile.scan.Type.BZSCANNER.value):
131            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.tile.scan.Type.BZSCANNER]
132        else:
133            raise ValueError(f"Scanner type {self.tile.scan.scanner_id} not supported.")
134        slide_position = np.matmul(transform, scan_position)
135        return float(slide_position[0][0]), float(slide_position[1][0])

Get the slide position of the event in micrometers (um).

Returns

the slide position of the event.
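
A usage sketch, assuming `event` is an Event built on a real Tile (as sketched under the constructor above) so the scan metadata is available:

    x_scan, y_scan = event.get_scan_position()     # um in the scanner frame
    x_slide, y_slide = event.get_slide_position()  # um in the slide frame
    print(f"scan ({x_scan:.1f}, {y_scan:.1f}) um -> slide ({x_slide:.1f}, {y_slide:.1f}) um")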

def crop( self, images: Iterable[numpy.ndarray], crop_size: int = 100, in_pixels: bool = True) -> list[numpy.ndarray]:
137    def crop(
138        self, images: Iterable[np.ndarray], crop_size: int = 100, in_pixels: bool = True
139    ) -> list[np.ndarray]:
140        """
141        Crop the event from the provided frame images. Use if you have already gotten
142        frame images; useful for cropping multiple events from the same frame image.
143        :param images: the frame images.
144        :param crop_size: the square size of the image crop to get for this event.
145        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
146        :return: crop_size x crop_size crops of the event in the provided frames. If
147        the event is too close to the edge, the crop is padded with black and not centered.
148        """
149        # Convert a crop size in micrometers to pixels
150        if not in_pixels:
151            crop_size = round(crop_size / self.tile.scan.pixel_size_um)
152        image_height, image_width = 0, 0
153        for image in images:
154            if image_height == 0 and image_width == 0:
155                image_height, image_width = image.shape
156            else:
157                if image_height != image.shape[0] or image_width != image.shape[1]:
158                    raise ValueError("All images must be the same size")
159        if image_height == 0 or image_width == 0:
160            raise ValueError("No images provided")
161
162        # Find the crop bounds
163        bounds = [
164            self.x - (crop_size // 2) + 1,
165            self.y - (crop_size // 2) + 1,
166            self.x + math.ceil(crop_size / 2) + 1,
167            self.y + math.ceil(crop_size / 2) + 1,
168        ]
169        # Determine how much the bounds violate the image size
170        displacements = [
171            max(0, -bounds[0]),
172            max(0, -bounds[1]),
173            max(0, bounds[2] - image_width),
174            max(0, bounds[3] - image_height),
175        ]
176        # Cap off the bounds
177        bounds = [
178            max(0, bounds[0]),
179            max(0, bounds[1]),
180            min(image_width, bounds[2]),
181            min(image_height, bounds[3]),
182        ]
183
184        # Crop the images
185        crops = []
186        for image in images:
187            # Create a blank image of the right size
188            crop = np.zeros((crop_size, crop_size), dtype=image.dtype)
189
190            # Insert the cropped image into the blank image, leaving a black buffer
191            # around the edges if the crop would go beyond the original image bounds
192            crop[
193                displacements[1] : crop_size - displacements[3],
194                displacements[0] : crop_size - displacements[2],
195            ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]]
196            crops.append(crop)
197        return crops

Crop the event from the provided frame images. Use if you have already gotten frame images; useful for cropping multiple events from the same frame image.

Parameters
  • images: the frame images.
  • crop_size: the square size of the image crop to get for this event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns

crop_size x crop_size crops of the event in the provided frames. If the event is too close to the edge, the crop is padded with black and not centered.
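
A sketch of cropping several events from one set of frame images; `tile` and `events` (a list of Event objects on that tile) are assumed to exist, and the frame reads mirror what get_crops() below does internally:

    from csi_images.csi_frames import Frame

    frames = Frame.get_frames(tile, None)                # None -> all channels of the tile
    images = [f.get_image(None, True) for f in frames]   # None path -> scan's path; True -> apply gain
    crops_per_event = [e.crop(images, crop_size=50) for e in events]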

def get_crops( self, crop_size: int = 100, in_pixels: bool = True, input_path: str = None, channels: Iterable[int | str] = None, apply_gain: Union[bool, Iterable[bool]] = True) -> list[numpy.ndarray]:
199    def get_crops(
200        self,
201        crop_size: int = 100,
202        in_pixels: bool = True,
203        input_path: str = None,
204        channels: Iterable[int | str] = None,
205        apply_gain: bool | Iterable[bool] = True,
206    ) -> list[np.ndarray]:
207        """
208        Gets the frame images for this event and then crops the event from the images.
209        Convenient for retrieving a single event's crops, but less efficient when
210        retrieving multiple events from the same tile as it will reread the images.
211        :param crop_size: the square size of the image crop to get for this event.
212        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
213        :param input_path: the path to the input images. Defaults to None (uses the scan's path).
214        :param channels: the channels to extract images for. Defaults to all channels.
215        :param apply_gain: whether to apply scanner-calculated gain to the images, if
216        not already applied. If a list, matches the channels.
217        :return: a list of cropped images from the scan in the order of the channels.
218        """
219        # This function validates channels
220        frames = Frame.get_frames(self.tile, channels)
221        # Convert individual inputs to lists of appropriate length
222        if isinstance(apply_gain, bool):
223            apply_gain = [apply_gain] * len(frames)
224        images = [f.get_image(input_path, a) for f, a in zip(frames, apply_gain)]
225        return self.crop(images, crop_size, in_pixels)

Gets the frame images for this event and then crops the event from the images. Convenient for retrieving a single event's crops, but less efficient when retrieving multiple events from the same tile as it will reread the images.

Parameters
  • crop_size: the square size of the image crop to get for this event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
  • input_path: the path to the input images. Defaults to None (uses the scan's path).
  • channels: the channels to extract images for. Defaults to all channels.
  • apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. If a list, matches the channels.
Returns

a list of cropped images from the scan in the order of the channels.
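
A usage sketch; the channel names are illustrative and `event` must belong to a scan whose frame images are readable:

    # 100x100 px crops of this event, one per requested channel
    crops = event.get_crops(crop_size=100, channels=["DAPI", "CY5"])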

def save_crops( self, crops: Sequence[numpy.ndarray], output_path: str, labels: Sequence[str], ext: str = 'auto'):
227    def save_crops(
228        self,
229        crops: Sequence[np.ndarray],
230        output_path: str,
231        labels: Sequence[str],
232        ext: str = "auto",
233    ):
234        """
235        Save the crops to image files.
236        :param crops: the crops to save. Will save as RGB if 3 channel [h, w, 3] or
237        grayscale if 1 channel [h, w] or [h, w, 1].
238        :param labels: the labels to append to the file name, usually the channel names
239        associated with each crop.
240        :param output_path: the folder to save the crops to. Will make if needed.
241        :param ext: the file extension to save the crops as. Defaults to "auto", which
242        will save as .tif for grayscale images and .jpg for RGB images.
243        :return: None
244        """
245        if len(crops) != len(labels):
246            raise ValueError("Crops and labels must be the same length")
247
248        if csi_images is None or imageio is None:
249            raise ModuleNotFoundError(
250                "imageio libraries not installed! "
251                "run `pip install csi_images[imageio]` to resolve."
252            )
253
254        os.makedirs(output_path, exist_ok=True)
255
256        for crop, label in zip(crops, labels):
257            if ext == "auto":
258                if len(crop.shape) == 2 or crop.shape[2] == 1:
259                    file_extension = ".tif"
260                elif crop.shape[2] == 3:
261                    file_extension = ".jpg"
262                else:
263                    warnings.warn(
264                        f"Image shape {crop.shape} not recognized; saving as .tif"
265                    )
266                    file_extension = ".tif"
267            else:
268                file_extension = ext
269            file = os.path.join(output_path, f"{self}-{label}{file_extension}")
270            # TODO: add more file types here
271            if file_extension == ".tif":
272                imageio.imwrite(file, crop, compression="deflate")
273            elif file_extension in [".jpg", ".jpeg"]:
274                crop = csi_images.scale_bit_depth(crop, np.uint8)
275                imageio.imwrite(file, crop, quality=80)
276            else:
277                imageio.imwrite(file, crop)

Save the crops to image files.

Parameters
  • crops: the crops to save. Will save as RGB if 3 channel [h, w, 3] or grayscale if 1 channel [h, w] or [h, w, 1].
  • labels: the labels to append to the file name, usually the channel names associated with each crop.
  • output_path: the folder to save the crops to. Will make if needed.
  • ext: the file extension to save the crops as. Defaults to "auto", which will save as .tif for grayscale images and .jpg for RGB images.
Returns

None
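
A sketch chaining get_crops() and save_crops(); channel names and the output folder are illustrative:

    labels = ["DAPI", "CY5"]
    crops = event.get_crops(crop_size=100, channels=labels)
    event.save_crops(crops, "/path/to/crops", labels)  # "auto": grayscale crops saved as .tif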

def load_crops( self, input_path: str, labels: list[str] = None) -> dict[str, numpy.ndarray]:
279    def load_crops(
280        self, input_path: str, labels: list[str] = None
281    ) -> dict[str, np.ndarray]:
282        """
283        Loads previously saved crop files from a folder.
284        :param input_path: folder containing crop files.
285        :param labels: optional label filter, will only return crops with these labels.
286        :return: a dictionary mapping labels to crop images.
287        """
288        crops = {}
289        for file in glob.glob(os.path.join(input_path, f"{self}-*")):
290            label = os.path.splitext(os.path.basename(file))[0].split("-")[-1]
291            # Skip if we have labels to target
292            if labels is not None and label not in labels:
293                continue
294            crops[label] = imageio.imread(file)
295        return crops

Loads previously saved crop files from a folder.

Parameters
  • input_path: folder containing crop files.
  • labels: optional label filter, will only return crops with these labels.
Returns

a dictionary mapping labels to crop images.
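
A usage sketch; the folder must contain crops previously written by save_crops() for this event:

    crops = event.load_crops("/path/to/crops")  # {label: image}
    dapi = crops.get("DAPI")                    # None if no crop with that label was saved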

def get_montage_channels( self, channels: Optional[Sequence[int | str]], composites: dict[int | str, tuple[float, float, float]] | None) -> tuple[list[int], list[int], dict[int, tuple[float, float, float]]]:
297    def get_montage_channels(
298        self,
299        channels: Sequence[int | str] | None,
300        composites: dict[int | str, tuple[float, float, float]] | None,
301    ) -> tuple[list[int], list[int], dict[int, tuple[float, float, float]]]:
302        """
303        Get the channel indices for the montage from the event's tile.
304        :param channels: channel indices or names for grayscale channels
305        :param composites: dictionary of channel indices or names and RGB values
306        :return: (1) channel indices to retrieve,
307                 (2) relative grayscale channel indices, and
308                 (3) composite channel indices and RGB values.
309        """
310        if channels is None:
311            channels = list(range(len(self.tile.scan.channels)))
312        if (len(channels) == 0) and (composites is None or len(composites) == 0):
313            raise ValueError("Must provide at least one channel type to montage")
314
315        channels_to_get = []
316
317        # Build the list of channels to retrieve
318        if channels is not None:
319            if isinstance(channels[0], str):
320                channels = self.tile.scan.get_channel_indices(channels)
321            channels_to_get += channels
322            order = list(range(len(channels)))  # Always the first n channels
323        else:
324            order = None
325
326        if composites is not None:
327            relative_composites = {}  # Relative indices for retrieved channels
328            # Convert to scan indices
329            rgb_channels = list(composites.keys())
330            if isinstance(rgb_channels[0], str):
331                rgb_channels = self.tile.scan.get_channel_indices(rgb_channels)
332            # Find the index or add to the end
333            for channel, rgb in zip(rgb_channels, composites.values()):
334                if channel not in channels_to_get:
335                    channels_to_get.append(channel)
336                    relative_composites[channel] = rgb
337                else:
338                    relative_composites[channels_to_get.index(channel)] = rgb
339        else:
340            relative_composites = None
341
342        return channels_to_get, order, relative_composites

Get the channel indices for the montage from the event's tile.

Parameters
  • channels: channel indices or names for grayscale channels
  • composites: dictionary of channel indices or names and RGB values
Returns

(1) channel indices to retrieve, (2) relative grayscale channel indices, and (3) composite channel indices and RGB values.
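
A sketch of calling the helper directly; channel names and RGB weights are illustrative. get_montage() below uses this internally and forwards the results to csi_images.make_montage():

    to_get, gray_order, rgb_map = event.get_montage_channels(
        channels=["DAPI"],
        composites={"TRITC": (1.0, 0.0, 0.0), "FITC": (0.0, 1.0, 0.0)},
    )
    # to_get: scan channel indices to read for the montage
    # gray_order, rgb_map: passed on as the grayscale order and composite map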

def get_montage( self, channels: Sequence[int | str] = None, composites: dict[int | str, tuple[float, float, float]] = None, crop_size: int = 100, in_pixels: bool = True, input_path: str = None, apply_gain: bool = True, **kwargs) -> numpy.ndarray:
344    def get_montage(
345        self,
346        channels: Sequence[int | str] = None,
347        composites: dict[int | str, tuple[float, float, float]] = None,
348        crop_size: int = 100,
349        in_pixels: bool = True,
350        input_path: str = None,
351        apply_gain: bool = True,
352        **kwargs,
353    ) -> np.ndarray:
354        """
355        Convenience function for getting frame images and creating a montage. Mirrors
356        csi_images.make_montage(). Convenient for a single event's montage, but less
357        efficient when used for multiple events from the same tile.
358        :param channels: the channels to use for black-and-white montages.
359        :param composites: dictionary of indices and RGB tuples for a composite.
360        :param crop_size: the square size of the image crop to get for this event.
361        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
362        :param input_path: the path to the input images. Defaults to None (uses the scan's path).
363        :param apply_gain: whether to apply scanner-calculated gain to the images, if
364        not already applied. If a list, matches the channels.
365        :param kwargs: montage options. See csi_images.make_montage() for more details.
366        :return: numpy array representing the montage.
367        """
368        channels, order, composites = self.get_montage_channels(channels, composites)
369        images = self.get_crops(crop_size, in_pixels, input_path, channels, apply_gain)
370        return csi_images.make_montage(images, order, composites, **kwargs)

Convenience function for getting frame images and creating a montage. Mirrors csi_images.make_montage(). Convenient for a single event's montage, but less efficient when used for multiple events from the same tile.

Parameters
  • channels: the channels to use for black-and-white montages.
  • composites: dictionary of indices and RGB tuples for a composite.
  • crop_size: the square size of the image crop to get for this event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
  • input_path: the path to the input images. Defaults to None (uses the scan's path).
  • apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. If a list, matches the channels.
  • kwargs: montage options. See csi_images.make_montage() for more details.
Returns

numpy array representing the montage.
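
A hedged usage sketch: the placeholder scan below is only a syntactic stand-in, and actually producing a montage requires a real Scan whose frame images exist on disk (or an input_path pointing at them). Integer channel indices are used so no channel names are assumed, and the crop size is arbitrary.

    from csi_images.csi_scans import Scan
    from csi_images.csi_tiles import Tile
    from csi_images.csi_events import Event

    # Swap in a real Scan to actually read images; this placeholder has no files.
    scan = Scan.make_placeholder("EXAMPLE_SLIDE", 100, 0)
    event = Event(Tile(scan, 100, 0), x=512, y=512)

    montage = event.get_montage(
        channels=[0],                                         # grayscale panel for channel 0
        composites={0: (0.0, 0.0, 1.0), 1: (1.0, 0.0, 0.0)},  # blue + red overlay
        crop_size=75,
    )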

def save_montage( self, montage: numpy.ndarray, output_path: str, ocular_names: bool = False, tag: str = ''):
372    def save_montage(
373        self,
374        montage: np.ndarray,
375        output_path: str,
376        ocular_names: bool = False,
377        tag: str = "",
378    ):
379        """
380        Save the montage as a JPEG image with a set name.
381        :param montage: the montage to save.
382        :param output_path: the folder to save the montage in. Will make if needed.
383        :param ocular_names: whether to use the OCULAR naming convention.
384        :param tag: a tag to append to the file name. Ignored if ocular_names is True.
385        :return: None
386        """
387        if csi_images is None or imageio is None:
388            raise ModuleNotFoundError(
389                "imageio libraries not installed! "
390                "run `pip install csi_images[imageio]` to resolve."
391            )
392
393        montage = csi_images.scale_bit_depth(montage, np.uint8)
394
395        if ocular_names:
396            if "cell_id" not in self.metadata.index:
397                raise ValueError(
398                    "Event metadata must include 'cell_id' for OCULAR naming."
399                )
400            file = f"{self.tile.n}-{self.metadata['cell_id']}-{self.x}-{self.y}.jpeg"
401        else:
402            file = f"{self}{tag}.jpeg"
403
404        os.makedirs(output_path, exist_ok=True)
405        imageio.imwrite(os.path.join(output_path, file), montage, quality=80)

Save the montage as a JPEG image with a set name.

Parameters
  • montage: the montage to save.
  • output_path: the folder to save the montage in. Will make if needed.
  • ocular_names: whether to use the OCULAR naming convention.
  • tag: a tag to append to the file name. Ignored if ocular_names is True.
Returns

None
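
A short sketch of saving a montage. It assumes the optional imageio extra is installed (`pip install csi_images[imageio]`), uses a toy array in place of a real get_montage() result, and the "/tmp/montages" folder and "_example" tag are arbitrary.

    import numpy as np
    from csi_images.csi_scans import Scan
    from csi_images.csi_tiles import Tile
    from csi_images.csi_events import Event

    event = Event(Tile(Scan.make_placeholder("EXAMPLE_SLIDE", 100, 0), 100, 0), 512, 512)
    montage = np.zeros((75, 300, 3), dtype=np.uint8)  # stand-in for a real montage
    # Writes a JPEG named after the event's string representation plus the tag.
    event.save_montage(montage, output_path="/tmp/montages", tag="_example")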

def load_montage(self, input_path: str, tag: str = '') -> numpy.ndarray:
407    def load_montage(self, input_path: str, tag: str = "") -> np.ndarray:
408        """
409        Loads the montage from a file saved by Event.save_montage.
410        :param input_path: the path to the folder where the montage was saved.
411        :param tag: a string to add to the file name, before the extension.
412        :return: the montage image as a numpy array.
413        """
414        file = f"{self}{tag}.jpeg"
415        return imageio.imread(os.path.join(input_path, file))

Loads the montage from a file saved by Event.save_montage.

Parameters
  • input_path: the path to the folder where the montage was saved.
  • tag: a string to add to the file name, before the extension.
Returns

the montage image as a numpy array.

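Continuing the save_montage() sketch above, the saved image can be read back with the same folder and tag:

    loaded = event.load_montage("/tmp/montages", tag="_example")
    # loaded is the JPEG decoded as a numpy array (here 75 x 300 x 3).
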
@classmethod
def get_many_crops( cls, events: Sequence[Self], crop_size: Union[int, Sequence[int]] = 100, in_pixels: bool = True, input_path: Union[str, Sequence[str]] = None, channels: Sequence[int | str] = None, apply_gain: Union[bool, Sequence[bool]] = True) -> list[list[numpy.ndarray]]:
417    @classmethod
418    def get_many_crops(
419        cls,
420        events: Sequence[Self],
421        crop_size: int | Sequence[int] = 100,
422        in_pixels: bool = True,
423        input_path: str | Sequence[str] = None,
424        channels: Sequence[int | str] = None,
425        apply_gain: bool | Sequence[bool] = True,
426    ) -> list[list[np.ndarray]]:
427        """
428        Get the crops for a list of events, ensuring that there is no wasteful reading
429        of the same tile multiple times. This function is more efficient than calling
430        get_crops() for each event.
431        :param events: the events to get crops for.
432        :param crop_size: the square size of the image crop to get for each event.
433                          Defaults to 100; if None, four times the size of the event.
434        :param in_pixels: whether the crop size is in pixels or micrometers.
435                          Defaults to pixels, and is ignored if crop_size is None.
436        :param input_path: the path to the input images. Will only work for lists of events
437                           from the same scan. Defaults to None (uses the scan's path).
438        :param channels: the channels to extract images for. Defaults to all channels.
439        :param apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. Defaults to True.
440                           Can be supplied as a list to apply gain to individual channels.
441        :return: a list of lists of cropped images for each event.
442        """
443        if len(events) == 0:
444            return []
445        # Adapt singular inputs to lists of appropriate length
446        if isinstance(crop_size, int):
447            crop_size = [crop_size] * len(events)
448        if input_path is None or isinstance(input_path, str):
449            input_path = [input_path] * len(events)
450
451        # Get the order of the events when sorted by slide/tile
452        order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
453
454        # Allocate the list to size
455        crops = [[]] * len(events)
456        last_tile = None
457        images = None  # Holds large numpy arrays, so expensive to compare
458        # Iterate through in slide/tile sorted order
459        for i in order:
460            if last_tile != events[i].tile:
461                # Gather the frame images, preserving them for the next event
462                frames = Frame.get_frames(events[i].tile, channels)
463                if isinstance(apply_gain, bool):
464                    apply = [apply_gain] * len(frames)
465                else:
466                    apply = apply_gain
467                images = [f.get_image(input_path[i], a) for f, a in zip(frames, apply)]
468                last_tile = events[i].tile
469            # Use the frame images to crop the event images
470            crops[i] = events[i].crop(images, crop_size[i], in_pixels)
471        return crops

Get the crops for a list of events, ensuring that there is no wasteful reading of the same tile multiple times. This function is more efficient than calling get_crops() for each event.

Parameters
  • events: the events to get crops for.
  • crop_size: the square size of the image crop to get for each event. Defaults to 100; if None, four times the size of the event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels, and is ignored if crop_size is None.
  • input_path: the path to the input images. Will only work for lists of events from the same scan. Defaults to None (uses the scan's path).
  • channels: the channels to extract images for. Defaults to all channels.
  • apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. Defaults to True. Can be supplied as a list to apply gain to individual channels.
Returns

a list of lists of cropped images for each event.
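
A hedged sketch of batching crops for events that share a tile, so the tile's frames are read only once. As above, the placeholder scan is a stand-in; real image reads need a real scan on disk, and the integer channels and crop size are arbitrary.

    from csi_images.csi_scans import Scan
    from csi_images.csi_tiles import Tile
    from csi_images.csi_events import Event

    scan = Scan.make_placeholder("EXAMPLE_SLIDE", 0, 0)  # stand-in scan
    tile = Tile(scan, 0, 0)
    events = [Event(tile, 100, 100), Event(tile, 900, 400)]

    crops = Event.get_many_crops(events, crop_size=50, channels=[0, 1])
    # crops[i] is a list of 50x50 arrays (one per requested channel) for events[i].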

@classmethod
def get_many_montages( cls, events: Sequence[Self], channels: Sequence[int | str] = None, composites: dict[int | str, tuple[float, float, float]] = None, crop_size: int = 100, in_pixels: bool = True, input_path: str = None, apply_gain: Union[bool, Iterable[bool]] = True, **kwargs) -> list[numpy.ndarray]:
473    @classmethod
474    def get_many_montages(
475        cls,
476        events: Sequence[Self],
477        channels: Sequence[int | str] = None,
478        composites: dict[int | str, tuple[float, float, float]] = None,
479        crop_size: int = 100,
480        in_pixels: bool = True,
481        input_path: str = None,
482        apply_gain: bool | Iterable[bool] = True,
483        **kwargs,
484    ) -> list[np.ndarray]:
485        """
486        Convenience function for get_montage(), but for a list of events. More efficient
487        than get_montage() when working with multiple events from the same tile.
488        :param events: a list of Event objects.
489        :param channels: the channels to extract images for. Defaults to all channels.
490        :param composites: dictionary of indices and RGB tuples for a composite.
491        :param crop_size: the square size of the image crop to get for this event.
492        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
493        :param input_path: the path to the input images. Defaults to None (uses the scan's path).
494        :param apply_gain: whether to apply scanner-calculated gain to the images, if
495        not already applied. If a list, matches the channels.
496        :param kwargs: montage options. See csi_images.make_montage() for more details.
497        :return: a list of numpy arrays representing the montages.
498        """
499        if len(events) == 0:
500            return []
501        # Adapt singular inputs to lists of appropriate length
502        if isinstance(crop_size, int):
503            crop_size = [crop_size] * len(events)
504        if input_path is None or isinstance(input_path, str):
505            input_path = [input_path] * len(events)
506
507        # Get the order of the events when sorted by slide/tile
508        event_order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
509
510        # Allocate the list to size
511        montages = [np.empty(0)] * len(events)
512        # Placeholder variables to avoid rereading the same tile
513        images = None  # Holds large numpy arrays, so expensive to compare
514        order = None
515        rel_composites = None
516        last_tile = None
517        # Iterate through in slide/tile sorted order
518        for i in event_order:
519            if last_tile != events[i].tile:
520                channels_to_get, order, rel_composites = events[i].get_montage_channels(
521                    channels, composites
522                )
523                # Gather the frame images, preserving them for the next event
524                frames = Frame.get_frames(events[i].tile, channels_to_get)
525                if isinstance(apply_gain, bool):
526                    apply = [apply_gain] * len(frames)
527                else:
528                    apply = apply_gain
529                images = [f.get_image(input_path[i], a) for f, a in zip(frames, apply)]
530                last_tile = events[i].tile
531            # Use the frame images to crop the event images and make montages
532            crops = events[i].crop(images, crop_size[i], in_pixels)
533            montages[i] = csi_images.make_montage(
534                crops, order, rel_composites, **kwargs
535            )
536
537        return montages

Convenience function for get_montage(), but for a list of events. More efficient than get_montage() when working with multiple events from the same tile.

Parameters
  • events: a list of Event objects.
  • channels: the channels to extract images for. Defaults to all channels.
  • composites: dictionary of indices and RGB tuples for a composite.
  • crop_size: the square size of the image crop to get for this event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
  • input_path: the path to the input images. Defaults to None (uses the scan's path).
  • apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. If a list, matches the channels.
  • kwargs: montage options. See csi_images.make_montage() for more details.
Returns

a list of numpy arrays representing the montages.
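
Continuing the get_many_crops() sketch above, the same list of events can be turned into montages in one pass; keyword arguments beyond those shown are forwarded to csi_images.make_montage().

    montages = Event.get_many_montages(
        events,                                               # events from the sketch above
        channels=[0],
        composites={0: (0.0, 0.0, 1.0), 1: (1.0, 0.0, 0.0)},
        crop_size=50,
    )
    # One montage array per event, built from a single read of the shared tile.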

@classmethod
def get_and_save_many_crops( cls, events: list[typing.Self], output_path: str, labels: Sequence[str], ext: str = 'auto', additional_gain: Sequence[float] = None, **kwargs) -> None:
539    @classmethod
540    def get_and_save_many_crops(
541        cls,
542        events: list[Self],
543        output_path: str,
544        labels: Sequence[str],
545        ext: str = "auto",
546        additional_gain: Sequence[float] = None,
547        **kwargs,
548    ) -> None:
549        """
550        Get and save the crops for a list of events, ensuring that there is no wasteful
551        reading and limiting the image data in memory to 1 tile at a time. This function
552        is more efficient than chaining get_crops() and save_crops() for each event or
553        get_many_crops() and then save_crops().
554        :param events: list of events to get, crop, and save.
555        :param output_path: the folder to save the crops in. Will make if needed.
556        :param labels: the labels to save the crops with. See save_crops().
557        :param ext: the file extension to save the crops as. See save_crops().
558        :param additional_gain: additional gain to apply to the crops. If not None, must
559        match the number of crop channels.
560        :param kwargs: see get_many_crops() for more parameters.
561        :return: None
562        """
563        unique_tiles = set([event.tile for event in events])
564
565        for tile in unique_tiles:
566            # Get one tile's worth of event crops
567            tile_events = [e for e in events if e.tile == tile]
568            crops_list = cls.get_many_crops(tile_events, **kwargs)
569            for event, crops in zip(tile_events, crops_list):
570                # Apply any additional gains
571                if additional_gain is not None:
572                    crops = [gain * crop for gain, crop in zip(additional_gain, crops)]
573                event.save_crops(crops, output_path, labels, ext)

Get and save the crops for a list of events, ensuring that there is no wasteful reading and limiting the image data in memory to 1 tile at a time. This function is more efficient than chaining get_crops() and save_crops() for each event or get_many_crops() and then save_crops().

Parameters
  • events: list of events to get, crop, and save.
  • output_path: the folder to save the crops in. Will make if needed.
  • labels: the labels to save the crops with. See save_crops().
  • ext: the file extension to save the crops as. See save_crops().
  • additional_gain: additional gain to apply to the crops. If not None, must match the number of crop channels.
  • kwargs: see get_many_crops() for more parameters.
Returns

None

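Continuing the sketch above, crops for all events can be fetched and written in one call; the labels are assumed to be one per requested channel, and the output folder is arbitrary.

    Event.get_and_save_many_crops(
        events,
        output_path="/tmp/crops",
        labels=["ch0", "ch1"],          # assumed: one label per crop channel
        crop_size=50,
        channels=[0, 1],
    )
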
@classmethod
def get_and_save_many_montages( cls, events: list[typing.Self], output_path: str, ocular_names: bool = False, tag: str = '', **kwargs) -> None:
575    @classmethod
576    def get_and_save_many_montages(
577        cls,
578        events: list[Self],
579        output_path: str,
580        ocular_names: bool = False,
581        tag: str = "",
582        **kwargs,
583    ) -> None:
584        """
585        Save montages of the events to image files.
586        :param events: the events to get, montage, and save.
587        :param output_path: the folder to save the montages to. Will make if needed.
588        :param ocular_names: whether to use the OCULAR naming convention.
589        :param tag: a tag to append to the file name. Ignored if ocular_names is True.
590        :param kwargs: see get_many_montages() for more parameters.
591        """
592        unique_tiles = set([event.tile for event in events])
593
594        for tile in unique_tiles:
595            # Get one tile's worth of event crops
596            tile_events = [e for e in events if e.tile == tile]
597            montages = cls.get_many_montages(tile_events, **kwargs)
598            for event, montage in zip(tile_events, montages):
599                event.save_montage(montage, output_path, ocular_names, tag)

Save montages of the events to image files.

Parameters
  • events: the events to get, montage, and save.
  • output_path: the folder to save the montages to. Will make if needed.
  • ocular_names: whether to use the OCULAR naming convention.
  • tag: a tag to append to the file name. Ignored if ocular_names is True.
  • kwargs: see get_many_montages() for more parameters.
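
Continuing the earlier sketches, montages for all events can be generated and saved in one call; extra keyword arguments are passed through to get_many_montages(), and the output folder and tag are arbitrary.

    Event.get_and_save_many_montages(
        events,
        output_path="/tmp/montages",
        tag="_gallery",
        channels=[0],
        composites={0: (0.0, 0.0, 1.0)},
        crop_size=50,
    )
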
class EventArray:
 602class EventArray:
 603    """
 604    A class that holds a large number of events' data, making it easy to analyze and
 605    manipulate many events at once; a tabular counterpart to the Event class.
 606    """
 607
 608    INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y"]
 609
 610    def __init__(
 611        self,
 612        info: pd.DataFrame = None,
 613        metadata: pd.DataFrame = None,
 614        features: pd.DataFrame = None,
 615    ):
 616        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y"
 617        if info is not None:
 618            # Special case: "roi" is often not required, so we'll fill in if it's missing
 619            if "roi" not in info.columns:
 620                info["roi"] = 0
 621            if set(info.columns) != set(self.INFO_COLUMNS):
 622                raise ValueError(
 623                    f"EventArray.info must have columns:"
 624                    f"{self.INFO_COLUMNS}; had {list(info.columns)}"
 625                )
 626            # Copy first to avoid modifying the original
 627            info = info.copy()
 628            # Ensure that the columns are the right types
 629            info["slide_id"] = info["slide_id"].astype(str)
 630            info["tile"] = info["tile"].astype(np.uint16)
 631            info["roi"] = info["roi"].astype(np.uint8)
 632            info["x"] = info["x"].round().astype(np.uint16)
 633            info["y"] = info["y"].round().astype(np.uint16)
 634            # Ensure that the columns are in the right order
 635            info = info[self.INFO_COLUMNS]
 636        # All DataFrames must all have the same number of rows
 637        if metadata is not None and (info is None or len(info) != len(metadata)):
 638            raise ValueError(
 639                "If EventArray.metadata is not None, it should match rows with .info"
 640            )
 641        if features is not None and (info is None or len(info) != len(features)):
 642            raise ValueError(
 643                "If EventArray.features is not None, it should match rows with .info"
 644            )
 645        # No columns named "metadata_", "features_", or "None"
 646        column_names = []
 647        if metadata is not None:
 648            column_names += metadata.columns.tolist()
 649        if features is not None:
 650            column_names += features.columns.tolist()
 651        if any([col.lower().startswith("metadata_") for col in column_names]):
 652            raise ValueError("EventArray column names cannot start with 'metadata_'")
 653        if any([col.lower().startswith("features_") for col in column_names]):
 654            raise ValueError("EventArray column names cannot start with 'features_'")
 655        if any([col.lower() == "none" for col in column_names]):
 656            raise ValueError("EventArray column names cannot be 'none'")
 657
 658        self.info = info
 659        self.metadata = metadata
 660        self.features = features
 661
 662    def __len__(self) -> int:
 663        # Convenience method to get the number of events
 664        if self.info is None:
 665            return 0
 666        else:
 667            return len(self.info)
 668
 669    def __eq__(self, other):
 670        # Parse all possibilities for info
 671        if isinstance(self.info, pd.DataFrame):
 672            if isinstance(other.info, pd.DataFrame):
 673                if not self.info.equals(other.info):
 674                    return False
 675            else:
 676                return False
 677        elif self.info is None:
 678            if other.info is not None:
 679                return False
 680
 681        # Parse all possibilities for metadata
 682        if isinstance(self.metadata, pd.DataFrame):
 683            if isinstance(other.metadata, pd.DataFrame):
 684                is_equal = self.metadata.equals(other.metadata)
 685                if not is_equal:
 686                    return False
 687            else:
 688                return False
 689        elif self.metadata is None:
 690            if other.metadata is not None:
 691                return False
 692
 693        # Parse all possibilities for features
 694        if isinstance(self.features, pd.DataFrame):
 695            if isinstance(other.features, pd.DataFrame):
 696                is_equal = self.features.equals(other.features)
 697                if not is_equal:
 698                    return False
 699            else:
 700                return False
 701        elif self.features is None:
 702            if other.features is not None:
 703                return False
 704
 705        return True
 706
 707    def get_sort_order(
 708        self, by: Hashable | Sequence[Hashable], ascending: bool | Sequence[bool] = True
 709    ):
 710        """
 711        Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.
 712        :param by: name of the column(s) to sort by.
 713        :param ascending: whether to sort in ascending order; can be a list to match by
 714        :return: the order of the indices to sort by.
 715        """
 716        columns = self.get(by)
 717        return columns.sort_values(by=by, ascending=ascending).index
 718
 719    def sort(
 720        self,
 721        by: Hashable | Sequence[Hashable],
 722        ascending: bool | Sequence[bool] = True,
 723    ) -> Self:
 724        """
 725        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
 726        :param by: name of the column(s) to sort by.
 727        :param ascending: whether to sort in ascending order; can be a list to match by
 728        :return: a new, sorted EventArray.
 729        """
 730        order = self.get_sort_order(by, ascending)
 731        info = self.info.loc[order].reset_index(drop=True)
 732        if self.metadata is not None:
 733            metadata = self.metadata.loc[order].reset_index(drop=True)
 734        else:
 735            metadata = None
 736        if self.features is not None:
 737            features = self.features.loc[order].reset_index(drop=True)
 738        else:
 739            features = None
 740        return EventArray(info, metadata, features)
 741
 742    def get(self, column_names: Hashable | Sequence[Hashable]) -> pd.DataFrame:
 743        """
 744        Get a DataFrame with the specified columns from the EventArray, by value.
 745        :param column_names: the names of the columns to get.
 746        :return: a DataFrame with the specified columns.
 747        """
 748        if isinstance(column_names, Hashable):
 749            column_names = [column_names]  # Drop into a list for the loop
 750        columns = []
 751        for column_name in column_names:
 752            if column_name in self.info.columns:
 753                columns.append(self.info[column_name])
 754            elif self.metadata is not None and column_name in self.metadata.columns:
 755                columns.append(self.metadata[column_name])
 756            elif self.features is not None and column_name in self.features.columns:
 757                columns.append(self.features[column_name])
 758            else:
 759                raise ValueError(f"Column {column_name} not found in EventArray")
 760        return pd.concat(columns, axis=1)
 761
 762    def rows(self, rows: Sequence[Hashable]) -> Self:
 763        """
 764        Get a subset of the EventArray rows based on a boolean or integer index, by value.
 765        :param rows: row labels, indices, or boolean mask; anything for .loc[]
 766        :return: a new EventArray with the subset of events.
 767        """
 768        info = self.info.loc[rows].reset_index(drop=True)
 769        if self.metadata is not None:
 770            metadata = self.metadata.loc[rows].reset_index(drop=True)
 771        else:
 772            metadata = None
 773        if self.features is not None:
 774            features = self.features.loc[rows].reset_index(drop=True)
 775        else:
 776            features = None
 777        return EventArray(info, metadata, features)
 778
 779    def copy(self) -> Self:
 780        """
 781        Create a deep copy of the EventArray.
 782        :return: a deep copy of the EventArray.
 783        """
 784        return EventArray(
 785            info=self.info.copy(),
 786            metadata=None if self.metadata is None else self.metadata.copy(),
 787            features=None if self.features is None else self.features.copy(),
 788        )
 789
 790    # TODO: add a "filter" convenience function that takes a column name and values to filter by
 791
 792    def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None:
 793        """
 794        Add metadata to the EventArray. Removes the need to check if metadata is None.
 795        Overwrites any existing metadata with the same column names as the new metadata.
 796        :param new_metadata: the metadata to add.
 797        """
 798        if len(self) != len(new_metadata):
 799            raise ValueError("New metadata must match length of existing info")
 800
 801        if self.metadata is None:
 802            self.metadata = new_metadata
 803        else:
 804            if isinstance(new_metadata, pd.Series):
 805                self.metadata[new_metadata.name] = new_metadata
 806            else:
 807                # It's a DataFrame
 808                self.metadata[new_metadata.columns] = new_metadata
 809
 810    def add_features(self, new_features: pd.Series | pd.DataFrame) -> None:
 811        """
 812        Add features to the EventArray. Removes the need to check if features is None.
 813        Overwrites any existing features with the same column names as the new features.
 814        :param new_features: the features to add.
 815        """
 816        if len(self) != len(new_features):
 817            raise ValueError("New features must match length of existing info")
 818
 819        if self.features is None:
 820            self.features = new_features
 821        else:
 822            if isinstance(new_features, pd.Series):
 823                self.features[new_features.name] = new_features
 824            else:
 825                # It's a DataFrame
 826                self.features[new_features.columns] = new_features
 827
 828    @classmethod
 829    def merge(cls, events: Iterable[Self]) -> Self:
 830        """
 831        Combine EventArrays in a list into a single EventArray.
 832        :param events: the EventArrays to combine.
 833        """
 834        all_info = []
 835        all_metadata = []
 836        all_features = []
 837        for event_array in events:
 838            # Skip empty EventArrays
 839            if event_array.info is not None:
 840                all_info.append(event_array.info)
 841            if event_array.metadata is not None:
 842                all_metadata.append(event_array.metadata)
 843            if event_array.features is not None:
 844                all_features.append(event_array.features)
 845        if len(all_info) == 0:
 846            return EventArray()
 847        else:
 848            all_info = pd.concat(all_info, ignore_index=True)
 849        if len(all_metadata) == 0:
 850            all_metadata = None
 851        else:
 852            all_metadata = pd.concat(all_metadata, ignore_index=True)
 853        if len(all_features) == 0:
 854            all_features = None
 855        else:
 856            all_features = pd.concat(all_features, ignore_index=True)
 857
 858        return EventArray(all_info, all_metadata, all_features)
 859
 860    def to_events(
 861        self,
 862        scans: Scan | Iterable[Scan],
 863        ignore_missing_scans=True,
 864        ignore_metadata=False,
 865        ignore_features=False,
 866    ) -> list[Event]:
 867        """
 868        Get the events in the EventArray as a list of events.
 869        :param scans: the scans that the events belong to, auto-matched by slide_id.
 870        Pass None if you don't care about scan metadata (and set ignore_missing_scans=True).
 871        :param ignore_missing_scans: whether to create blank scans for events without scans.
 872        :param ignore_metadata: whether to ignore metadata or not
 873        :param ignore_features: whether to ignore features or not
 874        :return: a list of Event objects.
 875        """
 876        if isinstance(scans, Scan):
 877            scans = [scans]
 878        scans = {} if scans is None else {scan.slide_id: scan for scan in scans}
 879        events = []
 880        for i in range(len(self.info)):
 881            # Determine the associated scan
 882            slide_id = self.info["slide_id"][i]
 883            if slide_id not in scans:
 884                if ignore_missing_scans:
 885                    # Create a placeholder scan if the scan is missing
 886                    scan = Scan.make_placeholder(
 887                        slide_id,
 888                        self.info["tile"][i],
 889                        self.info["roi"][i],
 890                    )
 891                else:
 892                    raise ValueError(
 893                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
 894                    )
 895            else:
 896                scan = scans[slide_id]
 897
 898            # Prepare the metadata and features
 899            if ignore_metadata or self.metadata is None:
 900                metadata = None
 901            else:
 902                # This Series creation method is less efficient,
 903                # but required for preserving dtypes
 904                metadata = pd.Series(
 905                    {col: self.metadata.loc[i, col] for col in self.metadata.columns},
 906                    dtype=object,
 907                )
 908            if ignore_features or self.features is None:
 909                features = None
 910            else:
 911                features = pd.Series(
 912                    {col: self.features.loc[i, col] for col in self.features.columns},
 913                    dtype=object,
 914                )
 915            # Create the event and append it to the list
 916            events.append(
 917                Event(
 918                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
 919                    self.info["x"][i],
 920                    self.info["y"][i],
 921                    metadata=metadata,
 922                    features=features,
 923                )
 924            )
 925        return events
 926
 927    @classmethod
 928    def from_events(cls, events: Iterable[Event]) -> Self:
 929        """
 930        Create an EventArray from a list of events.
 931        :param events: the list of events to convert.
 932        """
 933        info = pd.DataFrame(
 934            {
 935                "slide_id": [event.tile.scan.slide_id for event in events],
 936                "tile": [event.tile.n for event in events],
 937                "roi": [event.tile.n_roi for event in events],
 938                "x": [event.x for event in events],
 939                "y": [event.y for event in events],
 940            }
 941        )
 942        metadata_list = [event.metadata for event in events]
 943        # Iterate through and ensure that all metadata is the same shape
 944        for metadata in metadata_list:
 945            if type(metadata) != type(metadata_list[0]):
 946                raise ValueError("All metadata must be the same type.")
 947            if metadata is not None and metadata.shape != metadata_list[0].shape:
 948                raise ValueError("All metadata must be the same shape.")
 949        if metadata_list[0] is None:
 950            metadata = None
 951        else:
 952            metadata = pd.DataFrame(metadata_list)
 953        features_list = [event.features for event in events]
 954        # Iterate through and ensure that all features are the same shape
 955        for features in features_list:
 956            if type(features) != type(features_list[0]):
 957                raise ValueError("All features must be the same type.")
 958            if features is not None and features.shape != features_list[0].shape:
 959                raise ValueError("All features must be the same shape.")
 960        if features_list[0] is None:
 961            features = None
 962        else:
 963            features = pd.DataFrame(features_list)
 964        return EventArray(info=info, metadata=metadata, features=features)
 965
 966    def to_dataframe(self) -> pd.DataFrame:
 967        """
 968        Convert all the data in the EventArray to a single DataFrame.
 969        :return: a DataFrame with all the data in the EventArray.
 970        """
 971        # Make a copy of the info DataFrame; info column names are kept as-is
 972        output = self.info.copy()
 973        # Combine with the metadata and prepend "metadata_" to the column names
 974        if self.metadata is not None:
 975            metadata = self.metadata.copy()
 976            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
 977            output = pd.concat([output, metadata], axis=1)
 978        # Combine with the features and prepend "features_" to the column names
 979        if self.features is not None:
 980            features = self.features.copy()
 981            features.columns = [f"features_{col}" for col in features.columns]
 982            output = pd.concat([output, features], axis=1)
 983        return output
 984
 985    @classmethod
 986    def from_dataframe(
 987        cls, df, metadata_prefix: str = "metadata_", features_prefix: str = "features_"
 988    ) -> Self:
 989        """
 990        From a single, special DataFrame, create an EventArray.
 991        :param df: the DataFrame to convert to an EventArray.
 992        :param metadata_prefix: the prefix for metadata columns.
 993        :param features_prefix: the prefix for features columns.
 994        :return: an EventArray built from the DataFrame's columns.
 995        """
 996        # Split the columns into info, metadata, and features and strip prefix
 997        info = df[[col for col in df.columns if col in cls.INFO_COLUMNS]].copy()
 998        if info.size == 0:
 999            info = None
1000        metadata = df[[col for col in df.columns if col.startswith(metadata_prefix)]].copy()
1001        metadata.columns = [
1002            col.replace(metadata_prefix, "") for col in metadata.columns
1003        ]
1004        if metadata.size == 0:
1005            metadata = None
1006        features = df[[col for col in df.columns if col.startswith(features_prefix)]].copy()
1007        features.columns = [
1008            col.replace(features_prefix, "") for col in features.columns
1009        ]
1010        if features.size == 0:
1011            features = None
1012        return cls(info=info, metadata=metadata, features=features)
1013
1014    @classmethod
1015    def from_mask(
1016        cls,
1017        mask: np.ndarray,
1018        slide_id: str,
1019        tile_n: int,
1020        n_roi: int = 0,
1021        include_cell_id: bool = True,
1022        images: list[np.ndarray] = None,
1023        image_labels: list[str] = None,
1024        properties: list[str] = None,
1025    ) -> Self:
1026        """
1027        Extract events from a labeled mask image, including metadata and features.
1028        :param mask: the mask to extract events from.
1029        :param slide_id: the slide ID the mask is from.
1030        :param tile_n: the tile number the mask is from.
1031        :param n_roi: the ROI number the mask is from.
1032        :param include_cell_id: whether to include the cell_id, or numerical
1033        mask label, as metadata in the EventArray.
1034        :param images: the intensity images to extract features from.
1035        :param image_labels: the labels for the intensity images.
1036        :param properties: list of properties to extract in addition to the defaults.
1037        :return: EventArray corresponding to the mask labels.
1038        """
1039        if csi_images is None:
1040            raise ModuleNotFoundError(
1041                "imageio libraries not installed! "
1042                "run `pip install csi_images[imageio]` to resolve."
1043            )
1044        # Gather mask_info
1045        if images is not None and image_labels is not None:
1046            if len(images) != len(image_labels):
1047                raise ValueError("Intensity images and labels must match lengths.")
1048
1049        mask_info = csi_images.extract_mask_info(mask, images, image_labels, properties)
1050
1051        if len(mask_info) == 0:
1052            return EventArray()
1053
1054        # Combine provided info and mask info
1055        info = pd.DataFrame(
1056            {
1057                "slide_id": slide_id,
1058                "tile": tile_n,
1059                "roi": n_roi,
1060                "x": mask_info["x"],
1061                "y": mask_info["y"],
1062            },
1063        )
1064        # Extract a metadata column if desired
1065        if include_cell_id:
1066            metadata = pd.DataFrame({"cell_id": mask_info["id"]})
1067        else:
1068            metadata = None
1069        # If any additional properties were extracted, add them as features
1070        mask_info = mask_info.drop(columns=["id", "x", "y"], errors="ignore")
1071        if len(mask_info.columns) > 0:
1072            features = mask_info
1073        else:
1074            features = None
1075        return EventArray(info, metadata, features)
1076
1077    def save_csv(self, output_path: str) -> bool:
1078        """
1079        Save the events to a CSV file, including metadata and features.
1080        :param output_path: the file path to save to; ".csv" is appended if missing.
1081        :return: True if the file exists after saving.
1082        """
1083        if not output_path.endswith(".csv"):
1084            output_path += ".csv"
1085        self.to_dataframe().to_csv(output_path, index=False)
1086        return os.path.exists(output_path)
1087
1088    @classmethod
1089    def load_csv(
1090        cls,
1091        input_path: str,
1092        metadata_prefix: str = "metadata_",
1093        features_prefix: str = "features_",
1094    ) -> Self:
1095        """
1096        Load the events from a CSV file, including metadata and features.
1097        :param input_path: the path to the CSV file to load.
1098        :param metadata_prefix: the prefix marking metadata columns.
1099        :param features_prefix: the prefix marking feature columns.
1100        :return: an EventArray with the loaded events.
1101        """
1102        # Load the CSV file
1103        df = pd.read_csv(input_path)
1104        return cls.from_dataframe(df, metadata_prefix, features_prefix)
1105
1106    def save_hdf5(self, output_path: str) -> bool:
1107        """
1108        Save the events to an HDF5 file, including metadata and features.
1109        Uses the pandas-provided HDF5 functions for ease, and external compatibility,
1110        though these files are slightly harder to view in HDFView or similar.
1111        :param output_path: the file path to save to; ".hdf5" is appended if missing.
1112        :return: True if the file exists after saving.
1113        """
1114        if not output_path.endswith(".hdf5") and not output_path.endswith(".h5"):
1115            output_path += ".hdf5"
1116        # Open the output_path as an HDF5 file
1117        with pd.HDFStore(output_path) as store:
1118            # Store the dataframes in the HDF5 file
1119            if self.info is not None:
1120                store.put("info", self.info, index=False)
1121            if self.metadata is not None:
1122                store.put("metadata", self.metadata, index=False)
1123            if self.features is not None:
1124                store.put("features", self.features, index=False)
1125        return os.path.exists(output_path)
1126
1127    @classmethod
1128    def load_hdf5(cls, input_path: str) -> Self:
1129        """
1130        Load the events from an HDF5 file, including metadata and features.
1131        :param input_path: the path to the HDF5 file to load.
1132        :return: an EventArray with the loaded events.
1133        """
1134        # Open the input_path as an HDF5 file
1135        with pd.HDFStore(input_path, "r") as store:
1136            # Load the dataframes from the HDF5 file
1137            info = store.get("info") if "info" in store else None
1138            metadata = store.get("metadata") if "metadata" in store else None
1139            features = store.get("features") if "features" in store else None
1140        return cls(info=info, metadata=metadata, features=features)
1141
1142    def save_ocular(self, output_path: str, event_type: str = "cells"):
1143        """
1144        Save the events to an OCULAR file. Relies on the dataframe originating
1145        from an OCULAR file (same columns; duplicate metadata/info).
1146        :param output_path: the folder to save the OCULAR .csv/.rds files in.
1147        :param event_type: "cells" or "others"; determines the output file names.
1148        :return: None
1149        """
1150        if pyreadr is None:
1151            raise ModuleNotFoundError(
1152                "pyreadr not installed! Install pyreadr directly "
1153                "or run `pip install csi-images[rds]` option to resolve."
1154            )
1155        if event_type == "cells":
1156            file_stub = "rc-final"
1157        elif event_type == "others":
1158            file_stub = "others-final"
1159        else:
1160            raise ValueError("Invalid event type. Must be cells or others.")
1161
1162        # Ensure good metadata
1163        metadata = pd.DataFrame(
1164            {
1165                "slide_id": self.info["slide_id"],
1166                "frame_id": self.info["tile"],
1167                "cell_id": (
1168                    self.metadata["cell_id"]
1169                    if "cell_id" in self.metadata.columns
1170                    else range(len(self.info))
1171                ),
1172                "cellx": self.info["x"],
1173                "celly": self.info["y"],
1174            }
1175        )
1176        if self.metadata is not None:
1177            metadata[self.metadata.columns] = self.metadata.copy()
1178
1179        # Check for the "ocular_interesting" column
1180        if event_type == "cells":
1181            if "ocular_interesting" in metadata.columns:
1182                interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool)
1183            elif "hcpc" in metadata.columns:
1184                # Interesting cells don't get an hcpc designation, leaving them as -1
1185                interesting_rows = (
1186                    metadata["hcpc"].to_numpy() == -1
1187                )  # interesting cells
1188            else:
1189                interesting_rows = []
1190            if sum(interesting_rows) > 0:
1191                # Split the metadata into interesting and regular
1192                interesting_events = self.rows(interesting_rows)
1193                interesting_df = pd.concat(
1194                    [interesting_events.features, interesting_events.metadata], axis=1
1195                )
1196                data_events = self.rows(~interesting_rows)
1197                data_df = pd.concat(
1198                    [data_events.features, data_events.metadata], axis=1
1199                )
1200                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")
1201
1202                # Drop particular columns for "interesting"
1203                interesting_df = interesting_df.drop(
1204                    [
1205                        "clust",
1206                        "hcpc",
1207                        "frame_id",
1208                        "cell_id",
1209                        "unique_id",
1210                        "ocular_interesting",
1211                    ],
1212                    axis=1,
1213                    errors="ignore",
1214                )
1215                # Save both .csv and .rds
1216                interesting_stub = os.path.join(output_path, "ocular_interesting")
1217                interesting_df.to_csv(f"{interesting_stub}.csv")
1218                # Suppress pandas FutureWarning
1219                with warnings.catch_warnings():
1220                    warnings.simplefilter(action="ignore", category=FutureWarning)
1221                    pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df)
1222            else:
1223                data_df = pd.concat([self.features, metadata], axis=1)
1224        else:
1225            # Get all data and reset_index (will copy it)
1226            data_df = pd.concat([self.features, metadata], axis=1)
1227
1228        # Split based on cluster number to conform to *-final[1-4].rds
1229        n_clusters = max(data_df["clust"]) + 1
1230        split_idx = [round(i * n_clusters / 4) for i in range(5)]
1231        for i in range(4):
1232            subset = (split_idx[i] <= data_df["clust"]) & (
1233                data_df["clust"] < split_idx[i + 1]
1234            )
1235            data_df.loc[subset, "hcpc"] = i + 1
1236            subset = data_df[subset].reset_index(drop=True)
1237            # Suppress pandas FutureWarning
1238            with warnings.catch_warnings():
1239                warnings.simplefilter(action="ignore", category=FutureWarning)
1240                pyreadr.write_rds(
1241                    os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
1242                )
1243
1244        # Create new example cell strings
1245        data_df["example_cell_id"] = (
1246            data_df["slide_id"]
1247            + " "
1248            + data_df["frame_id"].astype(str)
1249            + " "
1250            + data_df["cell_id"].astype(str)
1251            + " "
1252            + data_df["cellx"].astype(int).astype(str)
1253            + " "
1254            + data_df["celly"].astype(int).astype(str)
1255        )
1256        # Find averagable data columns
1257        if "cellcluster_id" in data_df.columns:
1258            end_idx = data_df.columns.get_loc("cellcluster_id")
1259        else:
1260            end_idx = data_df.columns.get_loc("slide_id")
1261        avg_cols = data_df.columns[:end_idx].tolist()
1262        # Group by cluster and average
1263        data_df = data_df.groupby("clust").agg(
1264            **{col: (col, "mean") for col in avg_cols},
1265            count=("clust", "size"),  # count rows in each cluster
1266            example_cells=("example_cell_id", lambda x: ",".join(x)),
1267            hcpc=("hcpc", lambda x: x.iloc[0]),
1268        )
1269        data_df = data_df.reset_index()  # Do NOT drop, index is "clust"
1270        # Create new columns
1271        metadata = pd.DataFrame(
1272            {
1273                "count": data_df["count"],
1274                "example_cells": data_df["example_cells"],
1275                "clust": data_df["clust"].astype(int),
1276                "hcpc": data_df["hcpc"].astype(int),
1277                "id": data_df["clust"].astype(int).astype(str),
1278                "cccluster": "0",  # Dummy value
1279                "ccdistance": 0.0,  # Dummy value
1280                "rownum": list(range(len(data_df))),
1281                "framegroup": 0,  # Dummy value
1282            }
1283        )
1284        # Need to pad the features to 761 columns, as per OCULAR report needs
1285        additional_columns = range(len(avg_cols), 761)
1286        if len(additional_columns) > 0:
1287            padding = pd.DataFrame(
1288                np.zeros((len(data_df), len(additional_columns))),
1289                columns=[f"pad{i}" for i in additional_columns],
1290            )
1291            data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1)
1292        else:
1293            data_df = pd.concat([data_df[avg_cols], metadata], axis=1)
1294
1295        # Save the cluster data
1296        data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"))
1297        # Suppress pandas FutureWarning
1298        with warnings.catch_warnings():
1299            warnings.simplefilter(action="ignore", category=FutureWarning)
1300            pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)
1301
1302    @classmethod
1303    def load_ocular(
1304        cls,
1305        input_path: str,
1306        event_type="cells",
1307        cell_data_files=(
1308            "rc-final1.rds",
1309            "rc-final2.rds",
1310            "rc-final3.rds",
1311            "rc-final4.rds",
1312            "ocular_interesting.rds",
1313        ),
1314        others_data_files=(
1315            "others-final1.rds",
1316            "others-final2.rds",
1317            "others-final3.rds",
1318            "others-final4.rds",
1319        ),
1320        atlas_data_files=(
1321            "ocular_interesting.rds",
1322            "ocular_not_interesting.rds",
1323        ),
1324        drop_common_events=True,
1325    ) -> Self:
1326        """
1327        Load events from OCULAR .rds files, including metadata and features.
1328        :param input_path: the OCULAR output folder, or a single .rds file within it.
1329        :param event_type: "cells" or "others"; selects which data files to load.
1330        :param cell_data_files: file names to load when event_type is "cells".
1331        :param others_data_files: file names to load when event_type is "others".
1332        :param atlas_data_files: files whose common (atlas-classified) events may be dropped.
1333        :param drop_common_events: whether to drop events classified as common cells.
1334        :return: an EventArray with the loaded events.
1335        """
1336        if pyreadr is None:
1337            raise ModuleNotFoundError(
1338                "pyreadr not installed! Install pyreadr directly "
1339                "or run `pip install csi-images[rds]` option to resolve."
1340            )
1341        # Check if the input path is a directory or a file
1342        if os.path.isfile(input_path):
1343            data_files = [os.path.basename(input_path)]
1344            input_path = os.path.dirname(input_path)
1345        elif event_type == "cells":
1346            data_files = cell_data_files
1347        elif event_type == "others":
1348            data_files = others_data_files
1349        else:
1350            raise ValueError("Invalid event type.")
1351
1352        # Load the data from the OCULAR files
1353        file_data = {}
1354        for file in data_files:
1355            file_path = os.path.join(input_path, file)
1356            if not os.path.isfile(file_path):
1357                warnings.warn(f"{file} not found in {input_path}")
1358                continue
1359            file_data[file] = pyreadr.read_r(file_path)
1360            # Get the DataFrame associated with None (pyreadr dict quirk)
1361            file_data[file] = file_data[file][None]
1362            if len(file_data[file]) == 0:
1363                # File gets dropped from the dict
1364                file_data.pop(file)
1365                warnings.warn(f"{file} has no cells")
1366                continue
1367
1368            # Drop common cells if requested and in this file
1369            if (
1370                file in atlas_data_files
1371                and drop_common_events
1372                and "catalogue_classification" in file_data[file]
1373            ):
1374                common_cell_indices = (
1375                    file_data[file]["catalogue_classification"] == "common_cell"
1376                )
1377                file_data[file] = file_data[file][common_cell_indices == False]
1378
1379            if len(file_data[file]) == 0:
1380                # File gets dropped from the dict
1381                file_data.pop(file)
1382                warnings.warn(f"{file} has no cells after dropping common cells")
1383                continue
1384
1385            # Extract frame_id and cell_id
1386            # DAPI- events already have frame_id cell_id outside rowname
1387            if event_type == "cells" and "frame_id" not in file_data[file].columns:
1388                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
1389                # get frame_id cell_id from rownames column and split into two columns
1390                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
1391                if len(split_res.columns) != 2:
1392                    warnings.warn(
1393                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
1394                    )
1395                # then assign it back to the dataframe
1396                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
1397            # Ensure frame_id and cell_id are integers
1398            file_data[file]["frame_id"] = file_data[file]["frame_id"].astype("int")
1399            file_data[file]["cell_id"] = file_data[file]["cell_id"].astype("int")
1400            # reset indexes since they can cause NaN values in concat
1401            file_data[file] = file_data[file].reset_index(drop=True)
1402
1403        # Merge the data from all files
1404        if len(file_data) == 0:
1405            return EventArray()
1406        elif len(file_data) == 1:
1407            data = [file_data[file] for file in file_data.keys()][0]
1408        else:
1409            data = pd.concat(file_data.values())
1410
1411        # Others is missing the "slide_id". Insert it right before "frame_id" column
1412        if event_type == "others" and "slide_id" not in data.columns:
1413            if os.path.basename(input_path) == "ocular":
1414                slide_id = os.path.basename(os.path.dirname(input_path))
1415            else:
1416                slide_id = "UNKNOWN"
1417            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)
1418
1419        # Sort according to ascending cell_id to keep the original, which is in manual_df
1420        data = data.sort_values(by=["cell_id"], ascending=True)
1421        # Filter out duplicates by x & y
1422        data = data.assign(
1423            unique_id=data["slide_id"]
1424            + "_"
1425            + data["frame_id"].astype(str)
1426            + "_"
1427            + data["cellx"].astype(int).astype(str)
1428            + "_"
1429            + data["celly"].astype(int).astype(str)
1430        )
1431        data = data.drop_duplicates(subset=["unique_id"], keep="first")
1432        # Normal unique_id is with cell_id
1433        data = data.assign(
1434            unique_id=data["slide_id"]
1435            + "_"
1436            + data["frame_id"].astype(str)
1437            + "_"
1438            + data["cell_id"].astype(str)
1439        )
1440        data = data.reset_index(drop=True)
1441        # All columns up to "slide_id" are features; drop the "slide_id"
1442        features = data.loc[:, :"slide_id"].iloc[:, :-1]
1443        data = data.loc[:, "slide_id":]
1444        # Grab the info columns
1445        info = data[["slide_id", "frame_id", "cellx", "celly"]]
1446        info.columns = ["slide_id", "tile", "x", "y"]
1447        info = info.assign(roi=0)  # OCULAR only works on 1 ROI, as far as known
1448        info = info[["slide_id", "tile", "roi", "x", "y"]]
1449        # Metadata has duplicate columns for later convenience
1450        metadata = data
1451        # Certain columns tend to be problematic with mixed data formats...
1452        for col in ["TRITC", "CY5", "FITC"]:
1453            if col in metadata:
1454                labels = {
1455                    "False": False,
1456                    "True": True,
1457                    "FALSE": False,
1458                    "TRUE": True,
1459                    False: False,
1460                    True: True,
1461                }
1462                metadata[col] = metadata[col].map(labels).astype(bool)
1463        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
1464            if col in metadata:
1465                metadata[col] = metadata[col].fillna(-1).astype(int)
1466        return EventArray(info, metadata, features)

A class that holds a large number of events' data, making it easy to analyze and manipulate many events at once; a tabular counterpart to the Event class.
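
A minimal, self-contained sketch of building an EventArray from plain DataFrames and round-tripping it through CSV; the slide ID, coordinates, and "area"/"label" columns are made up for illustration, and the file path is arbitrary.

    import pandas as pd
    from csi_images.csi_events import EventArray

    info = pd.DataFrame(
        {
            "slide_id": ["EXAMPLE_SLIDE"] * 3,
            "tile": [10, 10, 42],
            "x": [100, 250, 380],
            "y": [120, 300, 64],
            # "roi" may be omitted; it is filled with 0 automatically.
        }
    )
    metadata = pd.DataFrame({"label": ["a", "b", "c"]})
    features = pd.DataFrame({"area": [50.0, 75.0, 62.5]})

    event_array = EventArray(info=info, metadata=metadata, features=features)
    print(len(event_array))  # 3

    event_array.save_csv("/tmp/events.csv")
    round_trip = EventArray.load_csv("/tmp/events.csv")
    print(round_trip == event_array)  # True if all dtypes round-trip cleanly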

EventArray( info: pandas.core.frame.DataFrame = None, metadata: pandas.core.frame.DataFrame = None, features: pandas.core.frame.DataFrame = None)
610    def __init__(
611        self,
612        info: pd.DataFrame = None,
613        metadata: pd.DataFrame = None,
614        features: pd.DataFrame = None,
615    ):
616        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y"
617        if info is not None:
618            # Special case: "roi" is often not required, so we'll fill in if it's missing
619            if "roi" not in info.columns:
620                info["roi"] = 0
621            if set(info.columns) != set(self.INFO_COLUMNS):
622                raise ValueError(
623                    f"EventArray.info must have columns:"
624                    f"{self.INFO_COLUMNS}; had {list(info.columns)}"
625                )
626            # Copy first to avoid modifying the original
627            info = info.copy()
628            # Ensure that the columns are the right types
629            info["slide_id"] = info["slide_id"].astype(str)
630            info["tile"] = info["tile"].astype(np.uint16)
631            info["roi"] = info["roi"].astype(np.uint8)
632            info["x"] = info["x"].round().astype(np.uint16)
633            info["y"] = info["y"].round().astype(np.uint16)
634            # Ensure that the columns are in the right order
635            info = info[self.INFO_COLUMNS]
636        # All DataFrames must all have the same number of rows
637        if metadata is not None and (info is None or len(info) != len(metadata)):
638            raise ValueError(
639                "If EventArray.metadata is not None, it should match rows with .info"
640            )
641        if features is not None and (info is None or len(info) != len(features)):
642            raise ValueError(
643                "If EventArray.features is not None, it should match rows with .info"
644            )
645        # No columns named "metadata_", "features_", or "None"
646        column_names = []
647        if metadata is not None:
648            column_names += metadata.columns.tolist()
649        if features is not None:
650            column_names += features.columns.tolist()
651        if any([col.lower().startswith("metadata_") for col in column_names]):
652            raise ValueError("EventArray column names cannot start with 'metadata_'")
653        if any([col.lower().startswith("features_") for col in column_names]):
654            raise ValueError("EventArray column names cannot start with 'features_'")
655        if any([col.lower() == "none" for col in column_names]):
656            raise ValueError("EventArray column names cannot be 'none'")
657
658        self.info = info
659        self.metadata = metadata
660        self.features = features
INFO_COLUMNS = ['slide_id', 'tile', 'roi', 'x', 'y']
info: required positional data as a DataFrame with columns slide_id, tile, roi, x, y.
metadata: optional per-event metadata DataFrame; must have the same number of rows as info.
features: optional per-event features DataFrame; must have the same number of rows as info.
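
A minimal construction sketch; the slide ID, tile number, coordinates, and feature column below are illustrative, and "roi" is filled with 0 when omitted:

    import pandas as pd
    from csi_images.csi_events import EventArray

    # Illustrative values; slide ID, tile number, and coordinates are made up
    info = pd.DataFrame({
        "slide_id": ["EXAMPLE_SLIDE", "EXAMPLE_SLIDE"],
        "tile": [12, 12],
        "x": [1024, 2048],
        "y": [512, 768],
    })
    features = pd.DataFrame({"area": [150.0, 230.5]})  # hypothetical feature column
    events = EventArray(info=info, features=features)  # "roi" defaults to 0
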
def get_sort_order( self, by: Union[Hashable, Sequence[Hashable]], ascending: Union[bool, Sequence[bool]] = True):
707    def get_sort_order(
708        self, by: Hashable | Sequence[Hashable], ascending: bool | Sequence[bool] = True
709    ):
710        """
711        Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.
712        :param by: name of the column(s) to sort by.
713        :param ascending: whether to sort in ascending order; can be a list to match by
714        :return: the order of the indices to sort by.
715        """
716        columns = self.get(by)
717        return columns.sort_values(by=by, ascending=ascending).index

Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.

Parameters
  • by: name of the column(s) to sort by.
  • ascending: whether to sort in ascending order; can be a list matching the columns in by
Returns

the order of the indices to sort by.

def sort( self, by: Union[Hashable, Sequence[Hashable]], ascending: Union[bool, Sequence[bool]] = True) -> Self:
719    def sort(
720        self,
721        by: Hashable | Sequence[Hashable],
722        ascending: bool | Sequence[bool] = True,
723    ) -> Self:
724        """
725        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
726        :param by: name of the column(s) to sort by.
727        :param ascending: whether to sort in ascending order; can be a list to match by
728        :return: a new, sorted EventArray.
729        """
730        order = self.get_sort_order(by, ascending)
731        info = self.info.loc[order].reset_index(drop=True)
732        if self.metadata is not None:
733            metadata = self.metadata.loc[order].reset_index(drop=True)
734        else:
735            metadata = None
736        if self.features is not None:
737            features = self.features.loc[order].reset_index(drop=True)
738        else:
739            features = None
740        return EventArray(info, metadata, features)

Sort the EventArray by column(s) in the info, metadata, or features DataFrames.

Parameters
  • by: name of the column(s) to sort by.
  • ascending: whether to sort in ascending order; can be a list matching the columns in by
Returns

a new, sorted EventArray.
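
A short usage sketch, continuing the illustrative `events` and `area` feature from the construction example above:

    # Sort by a feature column, descending; returns a new EventArray
    by_area = events.sort("area", ascending=False)
    # Or compute only the row order, without reordering the data
    order = events.get_sort_order(["slide_id", "tile"], ascending=[True, True])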

def get( self, column_names: Union[Hashable, Sequence[Hashable]]) -> pandas.core.frame.DataFrame:
742    def get(self, column_names: Hashable | Sequence[Hashable]) -> pd.DataFrame:
743        """
744        Get a DataFrame with the specified columns from the EventArray, by value.
745        :param column_names: the names of the columns to get.
746        :return: a DataFrame with the specified columns.
747        """
748        if isinstance(column_names, Hashable):
749            column_names = [column_names]  # Drop into a list for the loop
750        columns = []
751        for column_name in column_names:
752            if column_name in self.info.columns:
753                columns.append(self.info[column_name])
754            elif self.metadata is not None and column_name in self.metadata.columns:
755                columns.append(self.metadata[column_name])
756            elif self.features is not None and column_name in self.features.columns:
757                columns.append(self.features[column_name])
758            else:
759                raise ValueError(f"Column {column_name} not found in EventArray")
760        return pd.concat(columns, axis=1)

Get a DataFrame with the specified columns from the EventArray, by value.

Parameters
  • column_names: the names of the columns to get.
Returns

a DataFrame with the specified columns.
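
A short usage sketch (column names follow the earlier illustrative example); requested columns may come from any of the three DataFrames:

    # Mixes info columns ("x", "y") with a feature column ("area")
    coords_and_area = events.get(["x", "y", "area"])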

def rows(self, rows: Sequence[Hashable]) -> Self:
762    def rows(self, rows: Sequence[Hashable]) -> Self:
763        """
764        Get a subset of the EventArray rows based on a boolean or integer index, by value.
765        :param rows: row labels, indices, or boolean mask; anything for .loc[]
766        :return: a new EventArray with the subset of events.
767        """
768        info = self.info.loc[rows].reset_index(drop=True)
769        if self.metadata is not None:
770            metadata = self.metadata.loc[rows].reset_index(drop=True)
771        else:
772            metadata = None
773        if self.features is not None:
774            features = self.features.loc[rows].reset_index(drop=True)
775        else:
776            features = None
777        return EventArray(info, metadata, features)

Get a subset of the EventArray rows based on a boolean or integer index, by value.

Parameters
  • rows: row labels, indices, or boolean mask; anything for .loc[]
Returns

a new EventArray with the subset of events.
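
A sketch of row selection, continuing the illustrative `events`; any boolean mask or label list accepted by `.loc[]` works:

    # Boolean mask built from a feature column
    large = events.rows(events.get("area")["area"] > 200)
    # Row labels (a 0..N-1 RangeIndex in the construction example above)
    first_two = events.rows([0, 1])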

def copy(self) -> Self:
779    def copy(self) -> Self:
780        """
781        Create a deep copy of the EventArray.
782        :return: a deep copy of the EventArray.
783        """
784        return EventArray(
785            info=self.info.copy(),
786            metadata=None if self.metadata is None else self.metadata.copy(),
787            features=None if self.features is None else self.features.copy(),
788        )

Create a deep copy of the EventArray.

Returns

a deep copy of the EventArray.

def add_metadata( self, new_metadata: pandas.core.series.Series | pandas.core.frame.DataFrame) -> None:
792    def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None:
793        """
794        Add metadata to the EventArray. Removes the need to check if metadata is None.
795        Overwrites any existing metadata with the same column names as the new metadata.
796        :param new_metadata: the metadata to add.
797        """
798        if len(self) != len(new_metadata):
799            raise ValueError("New metadata must match length of existing info")
800
801        if self.metadata is None:
802            self.metadata = new_metadata
803        else:
804            if isinstance(new_metadata, pd.Series):
805                self.metadata[new_metadata.name] = new_metadata
806            else:
807                # It's a DataFrame
808                self.metadata[new_metadata.columns] = new_metadata

Add metadata to the EventArray. Removes the need to check if metadata is None. Overwrites any existing metadata with the same column names as the new metadata.

Parameters
  • new_metadata: the metadata to add.
def add_features( self, new_features: pandas.core.series.Series | pandas.core.frame.DataFrame) -> None:
810    def add_features(self, new_features: pd.Series | pd.DataFrame) -> None:
811        """
812        Add features to the EventArray. Removes the need to check if features is None.
813        Overwrites any existing features with the same column names as the new features.
814        :param new_features: the features to add.
815        """
816        if len(self) != len(new_features):
817            raise ValueError("New features must match length of existing info")
818
819        if self.features is None:
820            self.features = new_features
821        else:
822            if isinstance(new_features, pd.Series):
823                self.features[new_features.name] = new_features
824            else:
825                # It's a DataFrame
826                self.features[new_features.columns] = new_features

Add features to the EventArray. Removes the need to check if features is None. Overwrites any existing features with the same column names as the new features.

Parameters
  • new_features: the features to add.
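
A sketch of adding columns in place; the column names here are hypothetical. Lengths must match the existing rows, and columns with the same names are overwritten:

    import pandas as pd

    # Add a metadata column and a feature column to the illustrative `events`
    events.add_metadata(pd.DataFrame({"qc_flag": ["ok", "review"]}))
    events.add_features(pd.DataFrame({"eccentricity": [0.4, 0.7]}))
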
@classmethod
def merge(cls, events: Iterable[Self]) -> Self:
828    @classmethod
829    def merge(cls, events: Iterable[Self]) -> Self:
830        """
831        Combine EventArrays in a list into a single EventArray.
832        :param events: the new list of events.
833        """
834        all_info = []
835        all_metadata = []
836        all_features = []
837        for event_array in events:
838            # Skip empty EventArrays
839            if event_array.info is not None:
840                all_info.append(event_array.info)
841            if event_array.metadata is not None:
842                all_metadata.append(event_array.metadata)
843            if event_array.features is not None:
844                all_features.append(event_array.features)
845        if len(all_info) == 0:
846            return EventArray()
847        else:
848            all_info = pd.concat(all_info, ignore_index=True)
849        if len(all_metadata) == 0:
850            all_metadata = None
851        else:
852            all_metadata = pd.concat(all_metadata, ignore_index=True)
853        if len(all_features) == 0:
854            all_features = None
855        else:
856            all_features = pd.concat(all_features, ignore_index=True)
857
858        return EventArray(all_info, all_metadata, all_features)

Combine EventArrays in a list into a single EventArray.

Parameters
  • events: the EventArrays to combine.
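
A merging sketch; `tile_events` is assumed to be an existing list of EventArray objects (for example, one per tile), and empty EventArrays are skipped:

    combined = EventArray.merge(tile_events)
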
def to_events( self, scans: Union[csi_images.csi_scans.Scan, Iterable[csi_images.csi_scans.Scan]], ignore_missing_scans=True, ignore_metadata=False, ignore_features=False) -> list[Event]:
860    def to_events(
861        self,
862        scans: Scan | Iterable[Scan],
863        ignore_missing_scans=True,
864        ignore_metadata=False,
865        ignore_features=False,
866    ) -> list[Event]:
867        """
868        Get the events in the EventArray as a list of events.
869        :param scans: the scans that the events belong to, auto-matched by slide_id.
870        Pass None if you don't care about scan metadata (pass ignore_missing_scans).
871        :param ignore_missing_scans: whether to create blank scans for events without scans.
872        :param ignore_metadata: whether to ignore metadata or not
873        :param ignore_features: whether to ignore features or not
874        :return:
875        """
876        if isinstance(scans, Scan):
877            scans = [scans]
878        scans = {scan.slide_id: scan for scan in scans}
879        events = []
880        for i in range(len(self.info)):
881            # Determine the associated scan
882            slide_id = self.info["slide_id"][i]
883            if slide_id not in scans:
884                if ignore_missing_scans:
885                    # Create a placeholder scan if the scan is missing
886                    scan = Scan.make_placeholder(
887                        slide_id,
888                        self.info["tile"][i],
889                        self.info["roi"][i],
890                    )
891                else:
892                    raise ValueError(
893                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
894                    )
895            else:
896                scan = scans[slide_id]
897
898            # Prepare the metadata and features
899            if ignore_metadata or self.metadata is None:
900                metadata = None
901            else:
902                # This Series creation method is less efficient,
903                # but required for preserving dtypes
904                metadata = pd.Series(
905                    {col: self.metadata.loc[i, col] for col in self.metadata.columns},
906                    dtype=object,
907                )
908            if ignore_features or self.features is None:
909                features = None
910            else:
911                features = pd.Series(
912                    {col: self.features.loc[i, col] for col in self.features.columns},
913                    dtype=object,
914                )
915            # Create the event and append it to the list
916            events.append(
917                Event(
918                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
919                    self.info["x"][i],
920                    self.info["y"][i],
921                    metadata=metadata,
922                    features=features,
923                )
924            )
925        return events

Get the events in the EventArray as a list of events.

Parameters
  • scans: the scans that the events belong to, auto-matched by slide_id; a single Scan or an iterable of Scans.
  • ignore_missing_scans: whether to create placeholder scans for events whose slide_id has no matching scan (otherwise a ValueError is raised).
  • ignore_metadata: whether to ignore metadata or not
  • ignore_features: whether to ignore features or not
Returns

a list of Event objects corresponding to the rows of the EventArray.
@classmethod
def from_events(cls, events: Iterable[Event]) -> Self:
927    @classmethod
928    def from_events(cls, events: Iterable[Event]) -> Self:
929        """
930        Create an EventArray from a list of Event objects.
931        :param events: the new list of events.
932        """
933        info = pd.DataFrame(
934            {
935                "slide_id": [event.tile.scan.slide_id for event in events],
936                "tile": [event.tile.n for event in events],
937                "roi": [event.tile.n_roi for event in events],
938                "x": [event.x for event in events],
939                "y": [event.y for event in events],
940            }
941        )
942        metadata_list = [event.metadata for event in events]
943        # Iterate through and ensure that all metadata is the same shape
944        for metadata in metadata_list:
945            if type(metadata) != type(metadata_list[0]):
946                raise ValueError("All metadata must be the same type.")
947            if metadata is not None and metadata.shape != metadata_list[0].shape:
948                raise ValueError("All metadata must be the same shape.")
949        if metadata_list[0] is None:
950            metadata = None
951        else:
952            metadata = pd.DataFrame(metadata_list)
953        features_list = [event.features for event in events]
954        # Iterate through and ensure that all features are the same shape
955        for features in features_list:
956            if type(features) != type(features_list[0]):
957                raise ValueError("All features must be the same type.")
958            if features is not None and features.shape != features_list[0].shape:
959                raise ValueError("All features must be the same shape.")
960        if features_list[0] is None:
961            features = None
962        else:
963            features = pd.DataFrame(features_list)
964        return EventArray(info=info, metadata=metadata, features=features)

Create an EventArray from a list of Event objects.

Parameters
  • events: the Event objects to combine into an EventArray.
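
A round-trip sketch between the two representations; `scan` is assumed to be a csi_images.csi_scans.Scan whose slide_id matches the illustrative `events`:

    # Expand into individual Event objects, carrying metadata and features
    event_list = events.to_events(scan)
    # Rebuild a column-oriented EventArray from those objects
    rebuilt = EventArray.from_events(event_list)
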
def to_dataframe(self) -> pandas.core.frame.DataFrame:
966    def to_dataframe(self) -> pd.DataFrame:
967        """
968        Convert all the data in the EventArray to a single DataFrame.
969        :return: a DataFrame with all the data in the EventArray.
970        """
971        # Make a copy of the info DataFrame and prepend "info_" to the column names
972        output = self.info.copy()
973        # Combine with the metadata and prepend "metadata_" to the column names
974        if self.metadata is not None:
975            metadata = self.metadata.copy()
976            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
977            output = pd.concat([output, metadata], axis=1)
978        # Combine with the features and prepend "features_" to the column names
979        if self.features is not None:
980            features = self.features.copy()
981            features.columns = [f"features_{col}" for col in features.columns]
982            output = pd.concat([output, features], axis=1)
983        return output

Convert all the data in the EventArray to a single DataFrame.

Returns

a DataFrame with all the data in the EventArray.

@classmethod
def from_dataframe( cls, df, metadata_prefix: str = 'metadata_', features_prefix: str = 'features_') -> Self:
 985    @classmethod
 986    def from_dataframe(
 987        cls, df, metadata_prefix: str = "metadata_", features_prefix: str = "features_"
 988    ) -> Self:
 989        """
 990        From a single, special DataFrame, create an EventArray.
 991        :param df: the DataFrame to convert to an EventArray.
 992        :param metadata_prefix: the prefix for metadata columns.
 993        :param features_prefix: the prefix for features columns.
 994        :return: an EventArray built from the DataFrame.
 995        """
 996        # Split the columns into info, metadata, and features and strip prefix
 997        info = df[[col for col in df.columns if col in cls.INFO_COLUMNS]].copy()
 998        if info.size == 0:
 999            info = None
1000        metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy()
1001        metadata.columns = [
1002            col.replace(metadata_prefix, "") for col in metadata.columns
1003        ]
1004        if metadata.size == 0:
1005            metadata = None
1006        features = df[[col for col in df.columns if col.startswith("features_")]].copy()
1007        features.columns = [
1008            col.replace(features_prefix, "") for col in features.columns
1009        ]
1010        if features.size == 0:
1011            features = None
1012        return cls(info=info, metadata=metadata, features=features)

Create an EventArray from a single combined DataFrame, such as one produced by to_dataframe(), splitting columns into info, metadata, and features by prefix.

Parameters
  • df: the DataFrame to convert to an EventArray.
  • metadata_prefix: the prefix for metadata columns.
  • features_prefix: the prefix for features columns.
Returns

an EventArray built from the DataFrame's info, metadata, and features columns.
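
A DataFrame round-trip sketch, continuing the illustrative `events`; metadata and features columns are identified by their prefixes on reload:

    df = events.to_dataframe()              # columns: info names, metadata_*, features_*
    restored = EventArray.from_dataframe(df)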

@classmethod
def from_mask( cls, mask: numpy.ndarray, slide_id: str, tile_n: int, n_roi: int = 0, include_cell_id: bool = True, images: list[numpy.ndarray] = None, image_labels: list[str] = None, properties: list[str] = None) -> Self:
1014    @classmethod
1015    def from_mask(
1016        cls,
1017        mask: np.ndarray,
1018        slide_id: str,
1019        tile_n: int,
1020        n_roi: int = 0,
1021        include_cell_id: bool = True,
1022        images: list[np.ndarray] = None,
1023        image_labels: list[str] = None,
1024        properties: list[str] = None,
1025    ) -> Self:
1026        """
1027        Extract events from a labeled mask image, including metadata and features.
1028        :param mask: the mask to extract events from.
1029        :param slide_id: the slide ID the mask is from.
1030        :param tile_n: the tile number the mask is from.
1031        :param n_roi: the ROI number the mask is from.
1032        :param include_cell_id: whether to include the cell_id, or numerical
1033        mask label, as metadata in the EventArray.
1034        :param images: the intensity images to extract features from.
1035        :param image_labels: the labels for the intensity images.
1036        :param properties: list of properties to extract in addition to the defaults:
1037        :return: EventArray corresponding to the mask labels.
1038        """
1039        if csi_images is None:
1040            raise ModuleNotFoundError(
1041                "imageio libraries not installed! "
1042                "run `pip install csi_images[imageio]` to resolve."
1043            )
1044        # Gather mask_info
1045        if images is not None and image_labels is not None:
1046            if len(images) != len(image_labels):
1047                raise ValueError("Intensity images and labels must match lengths.")
1048
1049        mask_info = csi_images.extract_mask_info(mask, images, image_labels, properties)
1050
1051        if len(mask_info) == 0:
1052            return EventArray()
1053
1054        # Combine provided info and mask info
1055        info = pd.DataFrame(
1056            {
1057                "slide_id": slide_id,
1058                "tile": tile_n,
1059                "roi": n_roi,
1060                "x": mask_info["x"],
1061                "y": mask_info["y"],
1062            },
1063        )
1064        # Extract a metadata column if desired
1065        if include_cell_id:
1066            metadata = pd.DataFrame({"cell_id": mask_info["id"]})
1067        else:
1068            metadata = None
1069        # If any additional properties were extracted, add them as features
1070        mask_info = mask_info.drop(columns=["id", "x", "y"], errors="ignore")
1071        if len(mask_info.columns) > 0:
1072            features = mask_info
1073        else:
1074            features = None
1075        return EventArray(info, metadata, features)

Extract events from a labeled mask image, including metadata and features.

Parameters
  • mask: the mask to extract events from.
  • slide_id: the slide ID the mask is from.
  • tile_n: the tile number the mask is from.
  • n_roi: the ROI number the mask is from.
  • include_cell_id: whether to include the cell_id, or numerical mask label, as metadata in the EventArray.
  • images: the intensity images to extract features from.
  • image_labels: the labels for the intensity images.
  • properties: list of additional properties to extract beyond the defaults.
Returns

EventArray corresponding to the mask labels.
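
A sketch using a synthetic labeled mask; the slide ID and tile number are illustrative, and the optional imaging extra is required (e.g. `pip install csi_images[imageio]`):

    import numpy as np
    from csi_images.csi_events import EventArray

    mask = np.zeros((1024, 1024), dtype=np.uint16)
    mask[100:120, 200:220] = 1  # one labeled object; 0 is background
    tile_events = EventArray.from_mask(mask, slide_id="EXAMPLE_SLIDE", tile_n=12)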

def save_csv(self, output_path: str) -> bool:
1077    def save_csv(self, output_path: str) -> bool:
1078        """
1079        Save the events to a CSV file, including metadata and features.
1080        :param output_path:
1081        :return:
1082        """
1083        if not output_path.endswith(".csv"):
1084            output_path += ".csv"
1085        self.to_dataframe().to_csv(output_path, index=False)
1086        return os.path.exists(output_path)

Save the events to a CSV file, including metadata and features.

Parameters
  • output_path: path to write the CSV file; ".csv" is appended if missing.
Returns

whether the file exists at the output path after writing.
@classmethod
def load_csv( cls, input_path: str, metadata_prefix: str = 'metadata_', features_prefix: str = 'features_') -> Self:
1088    @classmethod
1089    def load_csv(
1090        cls,
1091        input_path: str,
1092        metadata_prefix: str = "metadata_",
1093        features_prefix: str = "features_",
1094    ) -> Self:
1095        """
1096        Load the events from a CSV file, including metadata and features.
1097        :param input_path:
1098        :param metadata_prefix:
1099        :param features_prefix:
1100        :return:
1101        """
1102        # Load the CSV file
1103        df = pd.read_csv(input_path)
1104        return cls.from_dataframe(df, metadata_prefix, features_prefix)

Load the events from a CSV file, including metadata and features.

Parameters
  • input_path: path of the CSV file to load.
  • metadata_prefix: the prefix identifying metadata columns.
  • features_prefix: the prefix identifying features columns.
Returns

an EventArray loaded from the CSV file.
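
A CSV round-trip sketch, continuing the illustrative `events`; the file name is illustrative:

    events.save_csv("events.csv")                    # returns True if the file was written
    events_from_csv = EventArray.load_csv("events.csv")
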
def save_hdf5(self, output_path: str) -> bool:
1106    def save_hdf5(self, output_path: str) -> bool:
1107        """
1108        Save the events to an HDF5 file, including metadata and features.
1109        Uses the pandas-provided HDF5 functions for ease, and external compatibility,
1110        though these files are slightly harder to view in HDFView or similar.
1111        :param output_path:
1112        :return:
1113        """
1114        if not output_path.endswith(".hdf5") and not output_path.endswith(".h5"):
1115            output_path += ".hdf5"
1116        # Open the output_path as an HDF5 file
1117        with pd.HDFStore(output_path) as store:
1118            # Store the dataframes in the HDF5 file
1119            if self.info is not None:
1120                store.put("info", self.info, index=False)
1121            if self.metadata is not None:
1122                store.put("metadata", self.metadata, index=False)
1123            if self.features is not None:
1124                store.put("features", self.features, index=False)
1125        return os.path.exists(output_path)

Save the events to an HDF5 file, including metadata and features. Uses the pandas-provided HDF5 functions for ease and external compatibility, though these files are slightly harder to view in HDFView or similar tools.

Parameters
  • output_path: path to write the HDF5 file; ".hdf5" is appended if no HDF5 extension is given.
Returns

whether the file exists at the output path after writing.
@classmethod
def load_hdf5(cls, input_path: str) -> Self:
1127    @classmethod
1128    def load_hdf5(cls, input_path: str) -> Self:
1129        """
1130        Load the events from an HDF5 file, including metadata and features.
1131        :param input_path:
1132        :return:
1133        """
1134        # Open the input_path as an HDF5 file
1135        with pd.HDFStore(input_path, "r") as store:
1136            # Load the dataframes from the HDF5 file
1137            info = store.get("info") if "info" in store else None
1138            metadata = store.get("metadata") if "metadata" in store else None
1139            features = store.get("features") if "features" in store else None
1140        return cls(info=info, metadata=metadata, features=features)

Load the events from an HDF5 file, including metadata and features.

Parameters
  • input_path: path of the HDF5 file to load.
Returns

an EventArray loaded from the HDF5 file.
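
An HDF5 round-trip sketch, continuing the illustrative `events`; the file name is illustrative, and pandas' HDFStore requires the PyTables package to be installed:

    events.save_hdf5("events.hdf5")
    events_from_hdf5 = EventArray.load_hdf5("events.hdf5")
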
def save_ocular(self, output_path: str, event_type: str = 'cells'):
1142    def save_ocular(self, output_path: str, event_type: str = "cells"):
1143        """
1144        Save the events to an OCULAR file. Relies on the dataframe originating
1145        from an OCULAR file (same columns; duplicate metadata/info).
1146        :param output_path:
1147        :param event_type:
1148        :return:
1149        """
1150        if pyreadr is None:
1151            raise ModuleNotFoundError(
1152                "pyreadr not installed! Install pyreadr directly "
1153                "or run `pip install csi-images[rds]` option to resolve."
1154            )
1155        if event_type == "cells":
1156            file_stub = "rc-final"
1157        elif event_type == "others":
1158            file_stub = "others-final"
1159        else:
1160            raise ValueError("Invalid event type. Must be cells or others.")
1161
1162        # Ensure good metadata
1163        metadata = pd.DataFrame(
1164            {
1165                "slide_id": self.info["slide_id"],
1166                "frame_id": self.info["tile"],
1167                "cell_id": (
1168                    self.metadata["cell_id"]
1169                    if "cell_id" in self.metadata.columns
1170                    else range(len(self.info))
1171                ),
1172                "cellx": self.info["x"],
1173                "celly": self.info["y"],
1174            }
1175        )
1176        if self.metadata is not None:
1177            metadata[self.metadata.columns] = self.metadata.copy()
1178
1179        # Check for the "ocular_interesting" column
1180        if event_type == "cells":
1181            if "ocular_interesting" in metadata.columns:
1182                interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool)
1183            elif "hcpc" in metadata.columns:
1184                # Interesting cells don't get an hcpc designation, leaving them as -1
1185                interesting_rows = (
1186                    metadata["hcpc"].to_numpy() == -1
1187                )  # interesting cells
1188            else:
1189                interesting_rows = []
1190            if sum(interesting_rows) > 0:
1191                # Split the metadata into interesting and regular
1192                interesting_events = self.rows(interesting_rows)
1193                interesting_df = pd.concat(
1194                    [interesting_events.features, interesting_events.metadata], axis=1
1195                )
1196                data_events = self.rows(~interesting_rows)
1197                data_df = pd.concat(
1198                    [data_events.features, data_events.metadata], axis=1
1199                )
1200                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")
1201
1202                # Drop particular columns for "interesting"
1203                interesting_df = interesting_df.drop(
1204                    [
1205                        "clust",
1206                        "hcpc",
1207                        "frame_id",
1208                        "cell_id",
1209                        "unique_id",
1210                        "ocular_interesting",
1211                    ],
1212                    axis=1,
1213                    errors="ignore",
1214                )
1215                # Save both .csv and .rds
1216                interesting_stub = os.path.join(output_path, "ocular_interesting")
1217                interesting_df.to_csv(f"{interesting_stub}.csv")
1218                # Suppress pandas FutureWarning
1219                with warnings.catch_warnings():
1220                    warnings.simplefilter(action="ignore", category=FutureWarning)
1221                    pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df)
1222            else:
1223                data_df = pd.concat([self.features, metadata], axis=1)
1224        else:
1225            # Get all data and reset_index (will copy it)
1226            data_df = pd.concat([self.features, metadata], axis=1)
1227
1228        # Split based on cluster number to conform to *-final[1-4].rds
1229        n_clusters = max(data_df["clust"]) + 1
1230        split_idx = [round(i * n_clusters / 4) for i in range(5)]
1231        for i in range(4):
1232            subset = (split_idx[i] <= data_df["clust"]) & (
1233                data_df["clust"] < split_idx[i + 1]
1234            )
1235            data_df.loc[subset, "hcpc"] = i + 1
1236            subset = data_df[subset].reset_index(drop=True)
1237            # Suppress pandas FutureWarning
1238            with warnings.catch_warnings():
1239                warnings.simplefilter(action="ignore", category=FutureWarning)
1240                pyreadr.write_rds(
1241                    os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
1242                )
1243
1244        # Create new example cell strings
1245        data_df["example_cell_id"] = (
1246            data_df["slide_id"]
1247            + " "
1248            + data_df["frame_id"].astype(str)
1249            + " "
1250            + data_df["cell_id"].astype(str)
1251            + " "
1252            + data_df["cellx"].astype(int).astype(str)
1253            + " "
1254            + data_df["celly"].astype(int).astype(str)
1255        )
1256        # Find averagable data columns
1257        if "cellcluster_id" in data_df.columns:
1258            end_idx = data_df.columns.get_loc("cellcluster_id")
1259        else:
1260            end_idx = data_df.columns.get_loc("slide_id")
1261        avg_cols = data_df.columns[:end_idx].tolist()
1262        # Group by cluster and average
1263        data_df = data_df.groupby("clust").agg(
1264            **{col: (col, "mean") for col in avg_cols},
1265            count=("clust", "size"),  # count rows in each cluster
1266            example_cells=("example_cell_id", lambda x: ",".join(x)),
1267            hcpc=("hcpc", lambda x: x.iloc[0]),
1268        )
1269        data_df = data_df.reset_index()  # Do NOT drop, index is "clust"
1270        # Create new columns
1271        metadata = pd.DataFrame(
1272            {
1273                "count": data_df["count"],
1274                "example_cells": data_df["example_cells"],
1275                "clust": data_df["clust"].astype(int),
1276                "hcpc": data_df["hcpc"].astype(int),
1277                "id": data_df["clust"].astype(int).astype(str),
1278                "cccluster": "0",  # Dummy value
1279                "ccdistance": 0.0,  # Dummy value
1280                "rownum": list(range(len(data_df))),
1281                "framegroup": 0,  # Dummy value
1282            }
1283        )
1284        # Need to pad the features to 761 columns, as per OCULAR report needs
1285        additional_columns = range(len(avg_cols), 761)
1286        if len(additional_columns) > 0:
1287            padding = pd.DataFrame(
1288                np.zeros((len(data_df), len(additional_columns))),
1289                columns=[f"pad{i}" for i in additional_columns],
1290            )
1291            data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1)
1292        else:
1293            data_df = pd.concat([data_df[avg_cols], metadata], axis=1)
1294
1295        # Save the cluster data
1296        data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"))
1297        # Suppress pandas FutureWarning
1298        with warnings.catch_warnings():
1299            warnings.simplefilter(action="ignore", category=FutureWarning)
1300            pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)

Save the events to an OCULAR file. Relies on the dataframe originating from an OCULAR file (same columns; duplicate metadata/info).

Parameters
  • output_path: directory in which to write the OCULAR .csv and .rds files.
  • event_type: "cells" or "others", selecting the rc-final or others-final file names.
Returns

None; the files are written under output_path.
@classmethod
def load_ocular( cls, input_path: str, event_type='cells', cell_data_files=('rc-final1.rds', 'rc-final2.rds', 'rc-final3.rds', 'rc-final4.rds', 'ocular_interesting.rds'), others_data_files=('others-final1.rds', 'others-final2.rds', 'others-final3.rds', 'others-final4.rds'), atlas_data_files=('ocular_interesting.rds', 'ocular_not_interesting.rds'), drop_common_events=True) -> Self:
1302    @classmethod
1303    def load_ocular(
1304        cls,
1305        input_path: str,
1306        event_type="cells",
1307        cell_data_files=(
1308            "rc-final1.rds",
1309            "rc-final2.rds",
1310            "rc-final3.rds",
1311            "rc-final4.rds",
1312            "ocular_interesting.rds",
1313        ),
1314        others_data_files=(
1315            "others-final1.rds",
1316            "others-final2.rds",
1317            "others-final3.rds",
1318            "others-final4.rds",
1319        ),
1320        atlas_data_files=(
1321            "ocular_interesting.rds",
1322            "ocular_not_interesting.rds",
1323        ),
1324        drop_common_events=True,
1325    ) -> Self:
1326        """
1327
1328        :param input_path:
1329        :param event_type:
1330        :param cell_data_files:
1331        :param others_data_files:
1332        :param atlas_data_files:
1333        :param drop_common_events:
1334        :return:
1335        """
1336        if pyreadr is None:
1337            raise ModuleNotFoundError(
1338                "pyreadr not installed! Install pyreadr directly "
1339                "or run `pip install csi-images[rds]` option to resolve."
1340            )
1341        # Check if the input path is a directory or a file
1342        if os.path.isfile(input_path):
1343            data_files = [os.path.basename(input_path)]
1344            input_path = os.path.dirname(input_path)
1345        if event_type == "cells":
1346            data_files = cell_data_files
1347        elif event_type == "others":
1348            data_files = others_data_files
1349        else:
1350            raise ValueError("Invalid event type.")
1351
1352        # Load the data from the OCULAR files
1353        file_data = {}
1354        for file in data_files:
1355            file_path = os.path.join(input_path, file)
1356            if not os.path.isfile(file_path):
1357                warnings.warn(f"{file} not found in {input_path}")
1358                continue
1359            file_data[file] = pyreadr.read_r(file_path)
1360            # Get the DataFrame associated with None (pyreadr dict quirk)
1361            file_data[file] = file_data[file][None]
1362            if len(file_data[file]) == 0:
1363                # File gets dropped from the dict
1364                file_data.pop(file)
1365                warnings.warn(f"{file} has no cells")
1366                continue
1367
1368            # Drop common cells if requested and in this file
1369            if (
1370                file in atlas_data_files
1371                and drop_common_events
1372                and "catalogue_classification" in file_data[file]
1373            ):
1374                common_cell_indices = (
1375                    file_data[file]["catalogue_classification"] == "common_cell"
1376                )
1377                file_data[file] = file_data[file][common_cell_indices == False]
1378
1379            if len(file_data[file]) == 0:
1380                # File gets dropped from the dict
1381                file_data.pop(file)
1382                warnings.warn(f"{file} has no cells after dropping common cells")
1383                continue
1384
1385            # Extract frame_id and cell_id
1386            # DAPI- events already have frame_id cell_id outside rowname
1387            if event_type == "cells" and "frame_id" not in file_data[file].columns:
1388                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
1389                # get frame_id cell_id from rownames column and split into two columns
1390                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
1391                if len(split_res.columns) != 2:
1392                    warnings.warn(
1393                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
1394                    )
1395                # then assign it back to the dataframe
1396                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
1397            # Ensure frame_id and cell_id are integers
1398            file_data[file]["frame_id"] = file_data[file]["frame_id"].astype("int")
1399            file_data[file]["cell_id"] = file_data[file]["cell_id"].astype("int")
1400            # reset indexes since they can cause NaN values in concat
1401            file_data[file] = file_data[file].reset_index(drop=True)
1402
1403        # Merge the data from all files
1404        if len(file_data) == 0:
1405            return EventArray()
1406        elif len(file_data) == 1:
1407            data = [file_data[file] for file in file_data.keys()][0]
1408        else:
1409            data = pd.concat(file_data.values())
1410
1411        # Others is missing the "slide_id". Insert it right before "frame_id" column
1412        if event_type == "others" and "slide_id" not in data.columns:
1413            if os.path.basename(input_path) == "ocular":
1414                slide_id = os.path.basename(os.path.dirname(input_path))
1415            else:
1416                slide_id = "UNKNOWN"
1417            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)
1418
1419        # Sort according to ascending cell_id to keep the original, which is in manual_df
1420        data = data.sort_values(by=["cell_id"], ascending=True)
1421        # Filter out duplicates by x & y
1422        data = data.assign(
1423            unique_id=data["slide_id"]
1424            + "_"
1425            + data["frame_id"].astype(str)
1426            + "_"
1427            + data["cellx"].astype(int).astype(str)
1428            + "_"
1429            + data["celly"].astype(int).astype(str)
1430        )
1431        data = data.drop_duplicates(subset=["unique_id"], keep="first")
1432        # Normal unique_id is with cell_id
1433        data = data.assign(
1434            unique_id=data["slide_id"]
1435            + "_"
1436            + data["frame_id"].astype(str)
1437            + "_"
1438            + data["cell_id"].astype(str)
1439        )
1440        data = data.reset_index(drop=True)
1441        # All columns up to "slide_id" are features; drop the "slide_id"
1442        features = data.loc[:, :"slide_id"].iloc[:, :-1]
1443        data = data.loc[:, "slide_id":]
1444        # Grab the info columns
1445        info = data[["slide_id", "frame_id", "cellx", "celly"]]
1446        info.columns = ["slide_id", "tile", "x", "y"]
1447        info = info.assign(roi=0)  # OCULAR only works on 1 ROI, as far as known
1448        info = info[["slide_id", "tile", "roi", "x", "y"]]
1449        # Metadata has duplicate columns for later convenience
1450        metadata = data
1451        # Certain columns tend to be problematic with mixed data formats...
1452        for col in ["TRITC", "CY5", "FITC"]:
1453            if col in metadata:
1454                labels = {
1455                    "False": False,
1456                    "True": True,
1457                    "FALSE": False,
1458                    "TRUE": True,
1459                    False: False,
1460                    True: True,
1461                }
1462                metadata[col] = metadata[col].map(labels).astype(bool)
1463        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
1464            if col in metadata:
1465                metadata[col] = metadata[col].fillna(-1).astype(int)
1466        return EventArray(info, metadata, features)
Load the events from OCULAR output files (.rds), including metadata and features.

Parameters
  • input_path: directory containing the OCULAR output files, or a path to a single .rds file.
  • event_type: "cells" or "others", selecting which set of files to load.
  • cell_data_files: file names to load when event_type is "cells".
  • others_data_files: file names to load when event_type is "others".
  • atlas_data_files: file names treated as atlas files, eligible for common-event dropping.
  • drop_common_events: whether to drop events classified as "common_cell" in atlas files.
Returns

an EventArray loaded from the OCULAR files.
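
A sketch of the OCULAR round trip; the paths are illustrative, and pyreadr is required (e.g. `pip install csi-images[rds]`):

    # Load "cells" events from a directory of OCULAR .rds outputs
    ocular_events = EventArray.load_ocular("/path/to/slide/ocular")
    # Write them back out in the rc-final*.rds / .csv layout
    ocular_events.save_ocular("/path/to/output", event_type="cells")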