csi_images.csi_events

Contains the Event class, which represents a single event in a scan. The Event class optionally holds metadata and features. Lists of events with similar metadata or features can be combined into DataFrames for analysis.

The Event class holds the position of the event in the frame, which can be converted to positions in the scanner or slide coordinate systems. See the csi_utils.csi_scans documentation page for more information on the coordinate systems.
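For orientation, here is a minimal usage sketch (illustrative only; it assumes a scan and tile have already been loaded via the csi_scans and csi_tiles modules):

    from csi_images.csi_scans import Scan
    from csi_images.csi_tiles import Tile
    from csi_images.csi_events import Event, EventArray

    scan = ...  # a Scan loaded via csi_images.csi_scans
    tile = Tile(scan, 0)
    event = Event(scan, tile, x=512, y=384)
    x_um, y_um = event.get_slide_position()  # slide coordinates in micrometers

    array = EventArray.from_events([event])
    array.save_csv("events.csv")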

   1"""
   2Contains the Event class, which represents a single event in a scan.
   3The Event class optionally holds metadata and features. Lists of events with
   4similar metadata or features can be combined into DataFrames for analysis.
   5
   6The Event class holds the position of the event in the frame, which can be converted
   7to the position in the scanner or slide coordinate positions. See the
   8csi_utils.csi_scans documentation page for more information on the coordinate systems.
   9"""

import os
import math
import warnings
from typing import Self, Iterable, Hashable, Sequence

import numpy as np
import pandas as pd

from .csi_scans import Scan
from .csi_tiles import Tile
from .csi_frames import Frame

# Optional dependencies; will raise errors in particular functions if not installed
try:
    from .csi_images import extract_mask_info
except ImportError:
    extract_mask_info = None
try:
    import pyreadr
except ImportError:
    pyreadr = None


class Event:
    """
    A class that represents a single event in a scan, making it easy to evaluate
    singular events. Required metadata is exposed as attributes, and optional
    metadata and features are stored as pandas Series.
    """

    SCAN_TO_SLIDE_TRANSFORM = {
        # Axioscan zero is in the top-right corner instead of top-left
        Scan.Type.AXIOSCAN7: np.array(
            [
                [1, 0, 75000],
                [0, 1, 0],
                [0, 0, 1],
            ]
        ),
        # BZScanner coordinates are a special kind of messed up:
        # - The slide is upside-down.
        # - The slide is oriented vertically, with the barcode at the bottom.
        # - Tiles are numbered from the top-right.
        Scan.Type.BZSCANNER: np.array(
            [
                [0, -1, 75000],
                [-1, 0, 25000],
                [0, 0, 1],
            ]
        ),
    }
    """
    Homogeneous transformation matrices for converting between scanner and slide
    coordinates. The matrices are 3x3, with the final column representing the
    translation in micrometers (um). For more information, see
    [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations).

    Transformations are nominal, and accuracy is not guaranteed; this is due to
    imperfections in slides and alignment in the scanners. Units are in micrometers.
    """

    def __init__(
        self,
        scan: Scan,
        tile: Tile,
        x: int,
        y: int,
        metadata: pd.Series = None,
        features: pd.Series = None,
    ):
        self.scan = scan
        self.tile = tile
        self.x = int(x)
        self.y = int(y)
        self.metadata = metadata
        self.features = features

    def __repr__(self) -> str:
        return f"{self.scan.slide_id}-{self.tile.n}-{self.x}-{self.y}"

    def __eq__(self, other) -> bool:
        return self.__repr__() == other.__repr__()

    def __lt__(self, other):
        return self.__repr__() < other.__repr__()

    def get_scan_position(self) -> tuple[float, float]:
        """
        Get the position of the event in the scanner's coordinate frame.
        :return: the scan position of the event in micrometers (um).
        """
        # Get overall pixel position
        pixel_x = self.x + (self.scan.tile_width_px * self.tile.x)
        pixel_y = self.y + (self.scan.tile_height_px * self.tile.y)
        # Convert to micrometers
        x_um = pixel_x * self.scan.pixel_size_um
        y_um = pixel_y * self.scan.pixel_size_um
        # Add the scan's origin in the scanner frame
        x_um += self.scan.roi[self.tile.n_roi].origin_x_um
        y_um += self.scan.roi[self.tile.n_roi].origin_y_um
        return x_um, y_um

    def get_slide_position(self) -> tuple[float, float]:
        """
        Get the slide position of the event in micrometers (um).
        :return: the slide position of the event.
        """
        # Turn scan_position into a 3x1 vector
        scan_position = self.get_scan_position()
        scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]])

        # Multiply by the appropriate homogeneous matrix
        if self.scan.scanner_id.startswith(self.scan.Type.AXIOSCAN7.value):
            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.AXIOSCAN7]
        elif self.scan.scanner_id.startswith(self.scan.Type.BZSCANNER.value):
            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.BZSCANNER]
        else:
            raise ValueError(f"Scanner type {self.scan.scanner_id} not supported.")
        slide_position = np.matmul(transform, scan_position)
        return float(slide_position[0][0]), float(slide_position[1][0])
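
    # A worked example of the transform math above (illustrative, not executed):
    # for a BZScanner scan position of (x_um, y_um) = (1000, 2000), the
    # homogeneous multiply gives
    #     [ 0 -1 75000]   [1000]   [73000]
    #     [-1  0 25000] @ [2000] = [24000]
    #     [ 0  0     1]   [   1]   [    1]
    # i.e. slide coordinates (73000, 24000) um: the axes are swapped and
    # flipped, then translated.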

    def crop_images(
        self, images: Sequence[np.ndarray], crop_size: int = 100, in_pixels: bool = True
    ) -> list[np.ndarray]:
        """
        Crop the event out of already-loaded frame images. This does not read
        anything from file, so it is very quick for extracting multiple events
        from the same tile.
        Use this if you're interested in many events.
        :param images: the frame images.
        :param crop_size: the square size of the image crop to get for this event.
        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
        :return: crop_size x crop_size crops of the event in the provided frames. If
        the event is too close to the edge, the crop is zero-padded (black) and the
        event will not be centered.
        """
        # Convert a crop size in micrometers to pixels
        if not in_pixels:
            crop_size = round(crop_size / self.scan.pixel_size_um)
        # Find the crop bounds
        bounds = [
            self.x - (crop_size // 2) + 1,
            self.y - (crop_size // 2) + 1,
            self.x + math.ceil(crop_size / 2) + 1,
            self.y + math.ceil(crop_size / 2) + 1,
        ]
        # Determine how much the bounds violate the image size
        displacements = [
            max(0, -bounds[0]),
            max(0, -bounds[1]),
            max(0, bounds[2] - images[0].shape[1]),
            max(0, bounds[3] - images[0].shape[0]),
        ]
        # Cap off the bounds
        bounds = [
            max(0, bounds[0]),
            max(0, bounds[1]),
            min(images[0].shape[1], bounds[2]),
            min(images[0].shape[0], bounds[3]),
        ]

        # Crop the images
        crops = []
        for image in images:
            # Create a blank image of the right size
            crop = np.zeros((crop_size, crop_size), dtype=image.dtype)

            # Insert the cropped image into the blank image, leaving a black buffer
            # around the edges if the crop would go beyond the original image bounds
            crop[
                displacements[1] : crop_size - displacements[3],
                displacements[0] : crop_size - displacements[2],
            ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]]
            crops.append(crop)
        return crops
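
    # Edge-behavior note (illustrative): an event at (x, y) = (5, 5) with
    # crop_size=100 would need pixels outside the frame. The returned crop is
    # still 100x100, but the out-of-frame regions are left as zeros (black) and
    # the event is not centered.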

    def extract_images(
        self,
        crop_size: int = 100,
        in_pixels: bool = True,
        input_path: str = None,
        channels: Iterable[int | str] = None,
        apply_gain: bool | Iterable[bool] = True,
    ) -> list[np.ndarray]:
        """
        Extract the images from the scan and tile, reading from the file. Called
        "extract" because it must read and extract the images from file, which is slow.
        Use this if you're interested in only a few events, as it is inefficient when
        reading multiple events from the same tile.
        :param crop_size: the square size of the image crop to get for this event.
        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
        :param input_path: the path to the input images. Defaults to None (uses the scan's path).
        :param channels: the channels to extract images for. Defaults to all channels.
        :param apply_gain: whether to apply scanner-calculated gain to the images, if not
                           already applied. Defaults to True. Can be supplied as a list to
                           apply gain to individual channels.
        :return: a list of cropped images from the scan in the order of the channels.
        """
        frames = Frame.get_frames(self.tile, channels)
        if isinstance(apply_gain, bool):
            apply_gain = [apply_gain] * len(frames)
        images = [f.get_image(input_path, a) for f, a in zip(frames, apply_gain)]
        return self.crop_images(images, crop_size, in_pixels)
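
    # Usage sketch (illustrative; the channel names are assumptions, use your
    # scan's actual channels):
    #
    #     event = Event(scan, tile, x=512, y=384)
    #     crops = event.extract_images(crop_size=50, channels=["DAPI", "TRITC"])
    #     # crops[0] is the 50x50 DAPI crop, crops[1] is the TRITC crop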

    @classmethod
    def extract_images_for_list(
        cls,
        events: list[Self],
        crop_size: int | list[int] = 75,
        in_pixels: bool = True,
        input_path: str = None,
        channels: Iterable[int | str] = None,
        apply_gain: bool | Iterable[bool] = True,
    ) -> list[list[np.ndarray]]:
        """
        Get the images for a list of events, ensuring that there is no wasteful reading
        of the same tile multiple times. This function is more efficient than calling
        extract_images for each event.
        :param events: the events to extract images for.
        :param crop_size: the square size of the image crop to get for each event.
                          Defaults to 75 pixels; can be a list with one size per event.
        :param in_pixels: whether the crop size is in pixels or micrometers.
                          Defaults to pixels.
        :param input_path: the path to the input images. Will only work for lists of events
                           from the same scan. Defaults to None (uses the scan's path).
        :param channels: the channels to extract images for. Defaults to all channels.
        :param apply_gain: whether to apply scanner-calculated gain to the images, if not
                           already applied. Defaults to True. Can be supplied as a list to
                           apply gain to individual channels.
        :return: a list of lists of cropped images for each event.
        """
        # Validation
        if len(events) == 0:
            return []
        if isinstance(crop_size, int):
            crop_size = [crop_size] * len(events)

        # Get the order of the events when sorted by slide/tile
        order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))

        # Allocate the list to size
        crops = [[]] * len(events)
        last_tile = None
        images = None  # Holds large numpy arrays, so expensive to compare
        # Iterate through in slide/tile sorted order
        for i in order:
            if last_tile != events[i].tile:
                # Gather the frame images, preserving them for the next event
                frames = Frame.get_frames(events[i].tile, channels)
                if isinstance(apply_gain, bool):
                    gain_list = [apply_gain] * len(frames)
                else:
                    gain_list = apply_gain
                images = [f.get_image(input_path, a) for f, a in zip(frames, gain_list)]
                last_tile = events[i].tile
            # Use the frame images to crop the event images
            crops[i] = events[i].crop_images(images, crop_size[i], in_pixels)
        return crops
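
    # Usage sketch (illustrative): extracting crops for many events at once.
    # Events are visited in slide/tile-sorted order so each tile's frames are
    # only read once:
    #
    #     crops = Event.extract_images_for_list(events, crop_size=75)
    #     first_event_crops = crops[0]  # per-channel crops, in input order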


class EventArray:
    """
    A class that holds a large number of events' data, making it easy to analyze and
    manipulate many events at once. Stores the same data as a list of Events, but
    split into aligned info, metadata, and features DataFrames.
    """

    INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y"]

    def __init__(
        self,
        info: pd.DataFrame = None,
        metadata: pd.DataFrame = None,
        features: pd.DataFrame = None,
    ):
        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y"
        if info is not None:
            if list(info.columns) != self.INFO_COLUMNS:
                raise ValueError(
                    'EventArray.info must have columns "slide_id", "tile", "roi", "x", "y"'
                )
            # Copy first to avoid modifying the original
            info = info.copy()
            # Ensure that the columns are the right types
            info["slide_id"] = info["slide_id"].astype(str)
            info["tile"] = info["tile"].astype(np.uint16)
            info["roi"] = info["roi"].astype(np.uint8)
            info["x"] = info["x"].round().astype(np.uint16)
            info["y"] = info["y"].round().astype(np.uint16)
        # All DataFrames must have the same number of rows
        if metadata is not None and (info is None or len(info) != len(metadata)):
            raise ValueError(
                "If EventArray.metadata is not None, it should match rows with .info"
            )
        if features is not None and (info is None or len(info) != len(features)):
            raise ValueError(
                "If EventArray.features is not None, it should match rows with .info"
            )
        # No column names starting with "metadata_" or "features_", and none named "none"
        column_names = []
        if metadata is not None:
            column_names += metadata.columns.tolist()
        if features is not None:
            column_names += features.columns.tolist()
        if any([col.lower().startswith("metadata_") for col in column_names]):
            raise ValueError("EventArray column names cannot start with 'metadata_'")
        if any([col.lower().startswith("features_") for col in column_names]):
            raise ValueError("EventArray column names cannot start with 'features_'")
        if any([col.lower() == "none" for col in column_names]):
            raise ValueError("EventArray column names cannot be 'none'")

        self.info = info
        self.metadata = metadata
        self.features = features

    def __len__(self) -> int:
        # Convenience method to get the number of events
        if self.info is None:
            return 0
        else:
            return len(self.info)

    def __eq__(self, other):
        is_equal = True
        # Parse all possibilities for info
        if isinstance(self.info, pd.DataFrame):
            if isinstance(other.info, pd.DataFrame):
                is_equal = self.info.equals(other.info)
                if not is_equal:
                    return False
            else:
                return False
        elif self.info is None:
            if other.info is not None:
                return False

        # Parse all possibilities for metadata
        if isinstance(self.metadata, pd.DataFrame):
            if isinstance(other.metadata, pd.DataFrame):
                is_equal = self.metadata.equals(other.metadata)
                if not is_equal:
                    return False
            else:
                return False
        elif self.metadata is None:
            if other.metadata is not None:
                return False

        # Parse all possibilities for features
        if isinstance(self.features, pd.DataFrame):
            if isinstance(other.features, pd.DataFrame):
                is_equal = self.features.equals(other.features)
                if not is_equal:
                    return False
            else:
                return False
        elif self.features is None:
            if other.features is not None:
                return False

        return is_equal

    def get_sort_order(
        self, by: Hashable | Sequence[Hashable], ascending: bool | Sequence[bool] = True
    ):
        """
        Get the sort order for the EventArray by column(s) in the info, metadata,
        or features DataFrames.
        :param by: name of the column(s) to sort by.
        :param ascending: whether to sort in ascending order; can be a list matching by.
        :return: the order of the indices to sort by.
        """
        columns = self.get(by)
        return columns.sort_values(by=by, ascending=ascending).index

    def sort(
        self,
        by: Hashable | Sequence[Hashable],
        ascending: bool | Sequence[bool] = True,
    ) -> Self:
        """
        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
        :param by: name of the column(s) to sort by.
        :param ascending: whether to sort in ascending order; can be a list matching by.
        :return: a new, sorted EventArray.
        """
        order = self.get_sort_order(by, ascending)
        info = self.info.loc[order].reset_index(drop=True)
        if self.metadata is not None:
            metadata = self.metadata.loc[order].reset_index(drop=True)
        else:
            metadata = None
        if self.features is not None:
            features = self.features.loc[order].reset_index(drop=True)
        else:
            features = None
        return EventArray(info, metadata, features)
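
    # Usage sketch (illustrative; assumes a features column named "dapi_mean"):
    #
    #     ordered = array.sort(["slide_id", "dapi_mean"], ascending=[True, False])
    #     top_ten = ordered.get("dapi_mean").head(10)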

    def get(self, column_names: Hashable | Sequence[Hashable]) -> pd.DataFrame:
        """
        Get a DataFrame with the specified columns from the EventArray, by value.
        :param column_names: the names of the columns to get.
        :return: a DataFrame with the specified columns.
        """
        if isinstance(column_names, Hashable):
            column_names = [column_names]  # Drop into a list for the loop
        columns = []
        for column_name in column_names:
            if column_name in self.info.columns:
                columns.append(self.info[column_name])
            elif self.metadata is not None and column_name in self.metadata.columns:
                columns.append(self.metadata[column_name])
            elif self.features is not None and column_name in self.features.columns:
                columns.append(self.features[column_name])
            else:
                raise ValueError(f"Column {column_name} not found in EventArray")
        return pd.concat(columns, axis=1)

    def rows(self, rows: Sequence[Hashable]) -> Self:
        """
        Get a subset of the EventArray's rows, by value.
        :param rows: row labels or a boolean mask; anything valid for .loc[]
        :return: a new EventArray with the subset of events.
        """
        info = self.info.loc[rows].reset_index(drop=True)
        if self.metadata is not None:
            metadata = self.metadata.loc[rows].reset_index(drop=True)
        else:
            metadata = None
        if self.features is not None:
            features = self.features.loc[rows].reset_index(drop=True)
        else:
            features = None
        return EventArray(info, metadata, features)

    def copy(self) -> Self:
        """
        Create a deep copy of the EventArray.
        :return: a deep copy of the EventArray.
        """
        return EventArray(
            info=self.info.copy(),
            metadata=None if self.metadata is None else self.metadata.copy(),
            features=None if self.features is None else self.features.copy(),
        )

    # TODO: add a "filter" convenience function that takes a column name and values to filter by

    def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None:
        """
        Add metadata to the EventArray. Removes the need to check if metadata is None.
        Overwrites any existing metadata with the same column names as the new metadata.
        :param new_metadata: the metadata to add.
        """
        if len(self) != len(new_metadata):
            raise ValueError("New metadata must match length of existing info")

        if self.metadata is None:
            # Keep metadata as a DataFrame, even if a Series was passed
            if isinstance(new_metadata, pd.Series):
                new_metadata = new_metadata.to_frame()
            self.metadata = new_metadata
        else:
            if isinstance(new_metadata, pd.Series):
                self.metadata[new_metadata.name] = new_metadata
            else:
                # It's a DataFrame
                self.metadata[new_metadata.columns] = new_metadata

    def add_features(self, new_features: pd.Series | pd.DataFrame) -> None:
        """
        Add features to the EventArray. Removes the need to check if features is None.
        Overwrites any existing features with the same column names as the new features.
        :param new_features: the features to add.
        """
        if len(self) != len(new_features):
            raise ValueError("New features must match length of existing info")

        if self.features is None:
            # Keep features as a DataFrame, even if a Series was passed
            if isinstance(new_features, pd.Series):
                new_features = new_features.to_frame()
            self.features = new_features
        else:
            if isinstance(new_features, pd.Series):
                self.features[new_features.name] = new_features
            else:
                # It's a DataFrame
                self.features[new_features.columns] = new_features
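
    # Usage sketch (illustrative): attaching new columns to an existing array.
    #
    #     array.add_metadata(pd.Series(["unsure"] * len(array), name="label"))
    #     array.add_features(pd.DataFrame({"score": np.zeros(len(array))}))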

    @classmethod
    def merge(cls, events: Iterable[Self]) -> Self:
        """
        Combine EventArrays in a list into a single EventArray.
        :param events: the EventArrays to merge.
        :return: a single, merged EventArray.
        """
        all_info = []
        all_metadata = []
        all_features = []
        for event_array in events:
            # Skip empty EventArrays
            if event_array.info is not None:
                all_info.append(event_array.info)
            if event_array.metadata is not None:
                all_metadata.append(event_array.metadata)
            if event_array.features is not None:
                all_features.append(event_array.features)
        if len(all_info) == 0:
            return EventArray()
        else:
            all_info = pd.concat(all_info, ignore_index=True)
        if len(all_metadata) == 0:
            all_metadata = None
        else:
            all_metadata = pd.concat(all_metadata, ignore_index=True)
        if len(all_features) == 0:
            all_features = None
        else:
            all_features = pd.concat(all_features, ignore_index=True)

        return EventArray(all_info, all_metadata, all_features)
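
    # Usage sketch (illustrative): combining per-tile results into one array.
    #
    #     combined = EventArray.merge([tile_0_events, tile_1_events])
    #     assert len(combined) == len(tile_0_events) + len(tile_1_events)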

    def to_events(
        self,
        scans: Scan | Iterable[Scan] | None,
        ignore_missing_scans: bool = True,
        ignore_metadata: bool = False,
        ignore_features: bool = False,
    ) -> list[Event]:
        """
        Get the events in the EventArray as a list of events.
        :param scans: the scans that the events belong to, auto-matched by slide_id.
        Pass None if you don't care about scan metadata (requires
        ignore_missing_scans=True).
        :param ignore_missing_scans: whether to create placeholder scans for events
        without a matching scan.
        :param ignore_metadata: whether to ignore metadata or not
        :param ignore_features: whether to ignore features or not
        :return: a list of Event objects.
        """
        if isinstance(scans, Scan):
            scans = [scans]
        elif scans is None:
            scans = []
        scans = {scan.slide_id: scan for scan in scans}
        events = []
        for i in range(len(self.info)):
            # Determine the associated scan
            slide_id = self.info["slide_id"][i]
            if slide_id not in scans:
                if ignore_missing_scans:
                    # Create a placeholder scan if the scan is missing
                    scan = Scan.make_placeholder(
                        slide_id,
                        self.info["tile"][i],
                        self.info["roi"][i],
                    )
                else:
                    raise ValueError(
                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
                    )
            else:
                scan = scans[slide_id]

            # Prepare the metadata and features
            if ignore_metadata or self.metadata is None:
                metadata = None
            else:
                # This Series creation method is less efficient,
                # but required for preserving dtypes
                metadata = pd.Series(
                    {col: self.metadata.loc[i, col] for col in self.metadata.columns},
                    dtype=object,
                )
            if ignore_features or self.features is None:
                features = None
            else:
                features = pd.Series(
                    {col: self.features.loc[i, col] for col in self.features.columns},
                    dtype=object,
                )
            # Create the event and append it to the list
            events.append(
                Event(
                    scan,
                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
                    self.info["x"][i],
                    self.info["y"][i],
                    metadata=metadata,
                    features=features,
                )
            )
        return events

    @classmethod
    def from_events(cls, events: Iterable[Event]) -> Self:
        """
        Create an EventArray from a list of events.
        :param events: the events to gather into an EventArray.
        :return: an EventArray with the events' data.
        """
        # Materialize the iterable; it is traversed multiple times below
        events = list(events)
        if len(events) == 0:
            return cls()
        info = pd.DataFrame(
            {
                "slide_id": [event.scan.slide_id for event in events],
                "tile": [event.tile.n for event in events],
                "roi": [event.tile.n_roi for event in events],
                "x": [event.x for event in events],
                "y": [event.y for event in events],
            }
        )
        metadata_list = [event.metadata for event in events]
        # Iterate through and ensure that all metadata is the same shape
        for metadata in metadata_list:
            if type(metadata) != type(metadata_list[0]):
                raise ValueError("All metadata must be the same type.")
            if metadata is not None and metadata.shape != metadata_list[0].shape:
                raise ValueError("All metadata must be the same shape.")
        if metadata_list[0] is None:
            metadata = None
        else:
            metadata = pd.DataFrame(metadata_list)
        features_list = [event.features for event in events]
        # Iterate through and ensure that all features are the same shape
        for features in features_list:
            if type(features) != type(features_list[0]):
                raise ValueError("All features must be the same type.")
            if features is not None and features.shape != features_list[0].shape:
                raise ValueError("All features must be the same shape.")
        if features_list[0] is None:
            features = None
        else:
            features = pd.DataFrame(features_list)
        return EventArray(info=info, metadata=metadata, features=features)

    def to_dataframe(self) -> pd.DataFrame:
        """
        Convert all the data in the EventArray to a single DataFrame.
        :return: a DataFrame with all the data in the EventArray.
        """
        # Copy the info DataFrame; its columns keep their names
        output = self.info.copy()
        # Combine with the metadata, prepending "metadata_" to the column names
        if self.metadata is not None:
            metadata = self.metadata.copy()
            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
            output = pd.concat([output, metadata], axis=1)
        # Combine with the features, prepending "features_" to the column names
        if self.features is not None:
            features = self.features.copy()
            features.columns = [f"features_{col}" for col in features.columns]
            output = pd.concat([output, features], axis=1)
        return output
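
    # Round-trip sketch (illustrative): to_dataframe() prefixes metadata and
    # features columns, and from_dataframe() strips the prefixes back off.
    #
    #     df = array.to_dataframe()  # columns like "metadata_cell_id"
    #     assert EventArray.from_dataframe(df) == array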

    @classmethod
    def from_dataframe(cls, df) -> Self:
        """
        Create an EventArray from a single, special DataFrame, as produced by
        to_dataframe().
        :param df: the DataFrame to split back into info, metadata, and features.
        :return: an EventArray with the data from the DataFrame.
        """
        # Split the columns into info, metadata, and features and strip prefix
        info = df[[col for col in df.columns if col in cls.INFO_COLUMNS]].copy()
        if info.size == 0:
            info = None
        metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy()
        metadata.columns = [col.removeprefix("metadata_") for col in metadata.columns]
        if metadata.size == 0:
            metadata = None
        features = df[[col for col in df.columns if col.startswith("features_")]].copy()
        features.columns = [col.removeprefix("features_") for col in features.columns]
        if features.size == 0:
            features = None
        return cls(info=info, metadata=metadata, features=features)

    @classmethod
    def from_mask(
        cls,
        mask: np.ndarray,
        slide_id: str,
        tile_n: int,
        n_roi: int = 0,
        include_cell_id: bool = True,
        images: list[np.ndarray] = None,
        image_labels: list[str] = None,
        properties: list[str] = None,
    ) -> Self:
        """
        Extract events from a segmentation mask, including metadata and features.
        :param mask: the mask to extract events from.
        :param slide_id: the slide ID the mask is from.
        :param tile_n: the tile number the mask is from.
        :param n_roi: the ROI number the mask is from.
        :param include_cell_id: whether to include the cell_id, or numerical
        mask label, as metadata in the EventArray.
        :param images: the intensity images to extract features from.
        :param image_labels: the labels for the intensity images.
        :param properties: list of properties to extract in addition to the defaults.
        :return: an EventArray corresponding to the mask labels.
        """
        if extract_mask_info is None:
            raise ModuleNotFoundError(
                "csi_images.csi_images dependencies not installed. Install csi-images "
                "with [imageio] option to resolve."
            )
        # Gather mask_info
        if images is not None and image_labels is not None:
            if len(images) != len(image_labels):
                raise ValueError("Intensity images and labels must match lengths.")

        mask_info = extract_mask_info(mask, images, image_labels, properties)

        if len(mask_info) == 0:
            return EventArray()

        # Combine provided info and mask info
        info = pd.DataFrame(
            {
                "slide_id": slide_id,
                "tile": tile_n,
                "roi": n_roi,
                "x": mask_info["x"],
                "y": mask_info["y"],
            },
        )
        # Extract a metadata column if desired
        if include_cell_id:
            metadata = pd.DataFrame({"cell_id": mask_info["id"]})
        else:
            metadata = None
        # If any additional properties were extracted, add them as features
        mask_info = mask_info.drop(columns=["id", "x", "y"], errors="ignore")
        if len(mask_info.columns) > 0:
            features = mask_info
        else:
            features = None
        return EventArray(info, metadata, features)
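
    # Usage sketch (illustrative; requires the [imageio] extra for
    # extract_mask_info):
    #
    #     mask = np.zeros((100, 100), dtype=np.uint16)
    #     mask[10:20, 30:40] = 1  # one labeled object
    #     array = EventArray.from_mask(mask, slide_id="SLIDE1", tile_n=0)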

    def save_csv(self, output_path: str) -> bool:
        """
        Save the events to a CSV file, including metadata and features.
        :param output_path: the file path to save the CSV file to.
        :return: True if the file was saved successfully.
        """
        self.to_dataframe().to_csv(output_path, index=False)
        return os.path.exists(output_path)

    @classmethod
    def load_csv(cls, input_path: str) -> Self:
        """
        Load the events from a CSV file, including metadata and features.
        :param input_path: the file path to load the CSV file from.
        :return: an EventArray with the loaded events.
        """
        # Load the CSV file
        df = pd.read_csv(input_path)
        return cls.from_dataframe(df)

    def save_hdf5(self, output_path: str) -> bool:
        """
        Save the events to an HDF5 file, including metadata and features.
        Uses the pandas-provided HDF5 functions for ease, and external compatibility,
        though these files are slightly harder to view in HDFView or similar.
        :param output_path: the file path to save the HDF5 file to.
        :return: True if the file was saved successfully.
        """
        # Open the output_path as an HDF5 file
        with pd.HDFStore(output_path) as store:
            # Store the dataframes in the HDF5 file
            if self.info is not None:
                store.put("info", self.info, index=False)
            if self.metadata is not None:
                store.put("metadata", self.metadata, index=False)
            if self.features is not None:
                store.put("features", self.features, index=False)
        return os.path.exists(output_path)

    @classmethod
    def load_hdf5(cls, input_path: str) -> Self:
        """
        Load the events from an HDF5 file, including metadata and features.
        :param input_path: the file path to load the HDF5 file from.
        :return: an EventArray with the loaded events.
        """
        # Open the input_path as an HDF5 file
        with pd.HDFStore(input_path, "r") as store:
            # Load the dataframes from the HDF5 file
            info = store.get("info") if "info" in store else None
            metadata = store.get("metadata") if "metadata" in store else None
            features = store.get("features") if "features" in store else None
        return cls(info=info, metadata=metadata, features=features)
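
    # Usage sketch (illustrative; pandas HDF5 support requires PyTables):
    #
    #     array.save_hdf5("events.h5")
    #     assert EventArray.load_hdf5("events.h5") == array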

    def save_ocular(self, output_path: str, event_type: str = "cells"):
        """
        Save the events to OCULAR files. Relies on the dataframe originating
        from an OCULAR file (same columns; duplicate metadata/info).
        :param output_path: the directory to save the OCULAR files in.
        :param event_type: "cells" or "others"; determines the output file names.
        """
        if pyreadr is None:
            raise ModuleNotFoundError(
                "pyreadr not installed. Install pyreadr directly "
                "or install csi-images with [rds] option to resolve."
            )
        if event_type == "cells":
            file_stub = "rc-final"
        elif event_type == "others":
            file_stub = "others-final"
        else:
            raise ValueError("Invalid event type. Must be cells or others.")

        # Ensure good metadata
        metadata = pd.DataFrame(
            {
                "slide_id": self.info["slide_id"],
                "frame_id": self.info["tile"],
                "cell_id": (
                    self.metadata["cell_id"]
                    if self.metadata is not None and "cell_id" in self.metadata.columns
                    else range(len(self.info))
                ),
                "cellx": self.info["x"],
                "celly": self.info["y"],
            }
        )
        if self.metadata is not None:
            metadata[self.metadata.columns] = self.metadata.copy()

        # Check for the "ocular_interesting" column
        if event_type == "cells":
            if "ocular_interesting" in metadata.columns:
                interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool)
            elif "hcpc" in metadata.columns:
                # Interesting cells don't get an hcpc designation, leaving them as -1
                interesting_rows = metadata["hcpc"].to_numpy() == -1
            else:
                interesting_rows = []
            if sum(interesting_rows) > 0:
                # Split the metadata into interesting and regular
                interesting_events = self.rows(interesting_rows)
                interesting_df = pd.concat(
                    [interesting_events.features, interesting_events.metadata], axis=1
                )
                data_events = self.rows(~interesting_rows)
                data_df = pd.concat(
                    [data_events.features, data_events.metadata], axis=1
                )
                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")

                # Drop particular columns for "interesting"
                interesting_df = interesting_df.drop(
                    [
                        "clust",
                        "hcpc",
                        "frame_id",
                        "cell_id",
                        "unique_id",
                        "ocular_interesting",
                    ],
                    axis=1,
                    errors="ignore",
                )
                # Save both .csv and .rds
                interesting_stub = os.path.join(output_path, "ocular_interesting")
                interesting_df.to_csv(f"{interesting_stub}.csv")
                # Suppress pandas FutureWarning
                with warnings.catch_warnings():
                    warnings.simplefilter(action="ignore", category=FutureWarning)
                    pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df)
            else:
                data_df = pd.concat([self.features, metadata], axis=1)
        else:
            # Get all data and reset_index (will copy it)
            data_df = pd.concat([self.features, metadata], axis=1)

        # Split based on cluster number to conform to *-final[1-4].rds
        n_clusters = max(data_df["clust"]) + 1
        split_idx = [round(i * n_clusters / 4) for i in range(5)]
        for i in range(4):
            subset = (split_idx[i] <= data_df["clust"]) & (
                data_df["clust"] < split_idx[i + 1]
            )
            data_df.loc[subset, "hcpc"] = i + 1
            subset = data_df[subset].reset_index(drop=True)
            # Suppress pandas FutureWarning
            with warnings.catch_warnings():
                warnings.simplefilter(action="ignore", category=FutureWarning)
                pyreadr.write_rds(
                    os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
                )

        # Create new example cell strings
        data_df["example_cell_id"] = (
            data_df["slide_id"]
            + " "
            + data_df["frame_id"].astype(str)
            + " "
            + data_df["cell_id"].astype(str)
            + " "
            + data_df["cellx"].astype(int).astype(str)
            + " "
            + data_df["celly"].astype(int).astype(str)
        )
        # Find averageable data columns
        if "cellcluster_id" in data_df.columns:
            end_idx = data_df.columns.get_loc("cellcluster_id")
        else:
            end_idx = data_df.columns.get_loc("slide_id")
        avg_cols = data_df.columns[:end_idx].tolist()
        # Group by cluster and average
        data_df = data_df.groupby("clust").agg(
            **{col: (col, "mean") for col in avg_cols},
            count=("clust", "size"),  # count rows in each cluster
            example_cells=("example_cell_id", lambda x: ",".join(x)),
            hcpc=("hcpc", lambda x: x.iloc[0]),
        )
        data_df = data_df.reset_index()  # Do NOT drop, index is "clust"
        # Create new columns
        metadata = pd.DataFrame(
            {
                "count": data_df["count"],
                "example_cells": data_df["example_cells"],
                "clust": data_df["clust"].astype(int),
                "hcpc": data_df["hcpc"].astype(int),
                "id": data_df["clust"].astype(int).astype(str),
                "cccluster": "0",  # Dummy value
                "ccdistance": 0.0,  # Dummy value
                "rownum": list(range(len(data_df))),
                "framegroup": 0,  # Dummy value
            }
        )
        # Pad the features to 761 columns, as required by OCULAR reports
        additional_columns = range(len(avg_cols), 761)
        if len(additional_columns) > 0:
            padding = pd.DataFrame(
                np.zeros((len(data_df), len(additional_columns))),
                columns=[f"pad{i}" for i in additional_columns],
            )
            data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1)
        else:
            data_df = pd.concat([data_df[avg_cols], metadata], axis=1)

        # Save the cluster data
        data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"))
        # Suppress pandas FutureWarning
        with warnings.catch_warnings():
            warnings.simplefilter(action="ignore", category=FutureWarning)
            pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)

    @classmethod
    def load_ocular(
        cls,
        input_path: str,
        event_type="cells",
        cell_data_files=(
            "rc-final1.rds",
            "rc-final2.rds",
            "rc-final3.rds",
            "rc-final4.rds",
            "ocular_interesting.rds",
        ),
        others_data_files=(
            "others-final1.rds",
            "others-final2.rds",
            "others-final3.rds",
            "others-final4.rds",
        ),
        atlas_data_files=(
            "ocular_interesting.rds",
            "ocular_not_interesting.rds",
        ),
        drop_common_events=True,
        log=None,
    ) -> Self:
        """
        Load events from OCULAR .rds files, including metadata and features.
        :param input_path: the OCULAR directory, or a single .rds file, to load from.
        :param event_type: "cells" or "others"; determines which files are loaded.
        :param cell_data_files: the file names to load for "cells".
        :param others_data_files: the file names to load for "others".
        :param atlas_data_files: the file names that may contain common (atlas) events.
        :param drop_common_events: whether to drop events classified as common cells.
        :param log: an optional logger; used for progress and warnings if provided.
        :return: an EventArray with the loaded events.
        """
        if pyreadr is None:
            raise ModuleNotFoundError(
                "pyreadr not installed. Install pyreadr directly "
                "or install csi-images with [rds] option to resolve."
            )
        # Check if the input path is a directory or a single file
        if os.path.isfile(input_path):
            data_files = [os.path.basename(input_path)]
            input_path = os.path.dirname(input_path)
        elif event_type == "cells":
            data_files = cell_data_files
        elif event_type == "others":
            data_files = others_data_files
        else:
            raise ValueError("Invalid event type.")

        # Load the data from the OCULAR files
        file_data = {}
        for file in data_files:
            file_path = os.path.join(input_path, file)
            if not os.path.isfile(file_path):
                if log is not None:
                    log.warning(f"{file} not found in {input_path}")
                continue
            file_data[file] = pyreadr.read_r(file_path)
            # Get the DataFrame associated with None (pyreadr dict quirk)
            file_data[file] = file_data[file][None]
            if len(file_data[file]) == 0:
                # File gets dropped from the dict
                file_data.pop(file)
                if log is not None:
                    log.warning(f"{file} has no cells")
                continue

            if log is not None:
                log.debug(f"{file} has {len(file_data[file])} cells")

            # Drop common cells if requested and in this file
            if (
                file in atlas_data_files
                and drop_common_events
                and "catalogue_classification" in file_data[file]
            ):
                common_cell_indices = (
                    file_data[file]["catalogue_classification"] == "common_cell"
                )
                if log is not None:
                    log.debug(
                        f"Dropping {int(common_cell_indices.sum())} "
                        f"common cells from {file}"
                    )
                file_data[file] = file_data[file][~common_cell_indices]

            if len(file_data[file]) == 0:
                # File gets dropped from the dict
                file_data.pop(file)
                if log is not None:
                    log.warning(f"{file} has no cells after dropping common cells")
                continue

            # Extract frame_id and cell_id
            # DAPI- events already have frame_id and cell_id outside the rowname
            if event_type == "cells" and "frame_id" not in file_data[file].columns:
                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
                # Get frame_id and cell_id from the rowname column, split in two
                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
                if len(split_res.columns) != 2 and log is not None:
                    log.warning(
                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
                    )
                # Then assign it back to the dataframe
                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
            # Reset indexes since they can cause NaN values in concat
            file_data[file] = file_data[file].reset_index(drop=True)

        # Merge the data from all files
        if len(file_data) == 0:
            return EventArray()
        elif len(file_data) == 1:
            data = next(iter(file_data.values()))
        else:
            data = pd.concat(file_data.values())

        if log is not None:
            log.debug(f"Gathered a total of {len(data)} events")

        # Others is missing the "slide_id". Insert it right before "frame_id" column
        if event_type == "others" and "slide_id" not in data.columns:
            if os.path.basename(input_path) == "ocular":
                slide_id = os.path.basename(os.path.dirname(input_path))
            else:
                slide_id = "UNKNOWN"
            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)

        # Sort by ascending cell_id so that drop_duplicates keeps the original event
        data = data.sort_values(by=["cell_id"], ascending=True)
        # Filter out duplicates by x & y
        data = data.assign(
            unique_id=data["slide_id"]
            + "_"
            + data["frame_id"].astype(str)
            + "_"
            + data["cellx"].astype(int).astype(str)
            + "_"
            + data["celly"].astype(int).astype(str)
        )
        data = data.drop_duplicates(subset=["unique_id"], keep="first")
        # The normal unique_id is with cell_id
        data = data.assign(
            unique_id=data["slide_id"]
            + "_"
            + data["frame_id"].astype(str)
            + "_"
            + data["cell_id"].astype(str)
        )
        data = data.reset_index(drop=True)
        # All columns up to "slide_id" are features; drop the "slide_id"
        features = data.loc[:, :"slide_id"].iloc[:, :-1]
        data = data.loc[:, "slide_id":]
        # Grab the info columns
        info = data[["slide_id", "frame_id", "cellx", "celly"]]
        info.columns = ["slide_id", "tile", "x", "y"]
        info = info.assign(roi=0)  # OCULAR only works on 1 ROI, as far as we know
        info = info[["slide_id", "tile", "roi", "x", "y"]]
        # Metadata has duplicate columns for later convenience
        metadata = data
        # Certain columns tend to be problematic with mixed data formats...
        for col in ["TRITC", "CY5", "FITC"]:
            if col in metadata:
                labels = {
                    "False": False,
                    "True": True,
                    "FALSE": False,
                    "TRUE": True,
                }
                metadata[col] = metadata[col].map(labels).astype(bool)
        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
            if col in metadata:
                metadata[col] = metadata[col].fillna(-1).astype(int)
        return EventArray(info, metadata, features)
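
A minimal OCULAR round-trip sketch (illustrative only; it assumes an existing OCULAR results directory and that pyreadr is installed via the [rds] extra):

    from csi_images.csi_events import EventArray

    events = EventArray.load_ocular("/path/to/scan/ocular")
    events.save_ocular("/path/to/output", event_type="cells")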
class Event:
 35class Event:
 36    """
 37    A class that represents a single event in a scan, making it easy to evaluate
 38    singular events. Required metadata is exposed as attributes, and optional
 39    metadata and features are stored as DataFrames.
 40    """
 41
 42    SCAN_TO_SLIDE_TRANSFORM = {
 43        # Axioscan zero is in the top-right corner instead of top-left
 44        Scan.Type.AXIOSCAN7: np.array(
 45            [
 46                [1, 0, 75000],
 47                [0, 1, 0],
 48                [0, 0, 1],
 49            ]
 50        ),
 51        # BZScanner coordinates are a special kind of messed up:
 52        # - The slide is upside-down.
 53        # - The slide is oriented vertically, with the barcode at the bottom.
 54        # - Tiles are numbered from the top-right
 55        Scan.Type.BZSCANNER: np.array(
 56            [
 57                [0, -1, 75000],
 58                [-1, 0, 25000],
 59                [0, 0, 1],
 60            ]
 61        ),
 62    }
 63    """
 64    Homogeneous transformation matrices for converting between scanner and slide
 65    coordinates. The matrices are 3x3, with the final column representing the
 66    translation in micrometers (um). For more information, see 
 67    [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations).
 68    
 69    Transformations are nominal, and accuracy is not guaranteed; this is due to 
 70    imperfections in slides and alignment in the scanners. Units are in micrometers.
 71    """
 72
 73    def __init__(
 74        self,
 75        scan: Scan,
 76        tile: Tile,
 77        x: int,
 78        y: int,
 79        metadata: pd.Series = None,
 80        features: pd.Series = None,
 81    ):
 82        self.scan = scan
 83        self.tile = tile
 84        self.x = int(x)
 85        self.y = int(y)
 86        self.metadata = metadata
 87        self.features = features
 88
 89    def __repr__(self) -> str:
 90        return f"{self.scan.slide_id}-{self.tile.n}-{self.x}-{self.y}"
 91
 92    def __eq__(self, other) -> bool:
 93        return self.__repr__() == other.__repr__()
 94
 95    def __lt__(self, other):
 96        return self.__repr__() < other.__repr__()
 97
 98    def get_scan_position(self) -> tuple[float, float]:
 99        """
100        Get the position of the event in the scanner's coordinate frame.
101        :return: the scan position of the event in micrometers (um).
102        """
103        # Get overall pixel position
104        pixel_x = self.x + (self.scan.tile_width_px * self.tile.x)
105        pixel_y = self.y + (self.scan.tile_height_px * self.tile.y)
106        # Convert to micrometers
107        x_um = pixel_x * self.scan.pixel_size_um
108        y_um = pixel_y * self.scan.pixel_size_um
109        # Add the scan's origin in the scanner frame
110        x_um += self.scan.roi[self.tile.n_roi].origin_x_um
111        y_um += self.scan.roi[self.tile.n_roi].origin_y_um
112        return x_um, y_um
113
114    def get_slide_position(self) -> tuple[float, float]:
115        """
116        Get the slide position of the event in micrometers (um).
117        :return: the slide position of the event.
118        """
119        # Turn scan_position into a 3x1 vector
120        scan_position = self.get_scan_position()
121        scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]])
122
123        # Multiply by the appropriate homogeneous matrix
124        if self.scan.scanner_id.startswith(self.scan.Type.AXIOSCAN7.value):
125            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.AXIOSCAN7]
126        elif self.scan.scanner_id.startswith(self.scan.Type.BZSCANNER.value):
127            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.BZSCANNER]
128        else:
129            raise ValueError(f"Scanner type {self.scan.scanner_id} not supported.")
130        slide_position = np.matmul(transform, scan_position)
131        return float(slide_position[0][0]), float(slide_position[1][0])
132
133    def crop_images(
134        self, images: Iterable[np.ndarray], crop_size: int = 100, in_pixels: bool = True
135    ) -> list[np.ndarray]:
136        """
137        Get the event crops from the frame images. Called "get" because it does not
138        need to extract anything; it is very quick for extracting multiple events from
139        the same tile.
140        Use this if you're interested in many events.
141        :param images: the frame images.
142        :param crop_size: the square size of the image crop to get for this event.
143        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
144        :return: image_size x image_size crops of the event in the provided frames. If
145        the event is too close to the edge, the crop will be smaller and not centered.
146        """
147        # Convert a crop size in micrometers to pixels
148        if not in_pixels:
149            crop_size = round(crop_size / self.scan.pixel_size_um)
150        # Find the crop bounds
151        bounds = [
152            self.x - (crop_size // 2) + 1,
153            self.y - (crop_size // 2) + 1,
154            self.x + math.ceil(crop_size / 2) + 1,
155            self.y + math.ceil(crop_size / 2) + 1,
156        ]
157        # Determine how much the bounds violate the image size
158        displacements = [
159            max(0, -bounds[0]),
160            max(0, -bounds[1]),
161            max(0, bounds[2] - images[0].shape[1]),
162            max(0, bounds[3] - images[0].shape[0]),
163        ]
164        # Cap off the bounds
165        bounds = [
166            max(0, bounds[0]),
167            max(0, bounds[1]),
168            min(images[0].shape[1], bounds[2]),
169            min(images[0].shape[0], bounds[3]),
170        ]
171
172        # Crop the images
173        crops = []
174        for image in images:
175            # Create a blank image of the right size
176            crop = np.zeros((crop_size, crop_size), dtype=image.dtype)
177
178            # Insert the cropped image into the blank image, leaving a black buffer
179            # around the edges if the crop would go beyond the original image bounds
180            crop[
181                displacements[1] : crop_size - displacements[3],
182                displacements[0] : crop_size - displacements[2],
183            ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]]
184            crops.append(crop)
185        return crops
186
187    def extract_images(
188        self,
189        crop_size: int = 100,
190        in_pixels: bool = True,
191        input_path: str = None,
192        channels: Iterable[int | str] = None,
193        apply_gain: bool | Iterable[bool] = True,
194    ) -> list[np.ndarray]:
195        """
196        Extract the images from the scan and tile, reading from the file. Called
197        "extract" because it must read and extract the images from file, which is slow.
198        Use this if you're interested in only a few events, as it is inefficient when
199        reading multiple events from the same tile.
200        :param crop_size: the square size of the image crop to get for this event.
201        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
202        :param input_path: the path to the input images. Defaults to None (uses the scan's path).
203        :param channels: the channels to extract images for. Defaults to all channels.
204        :param apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. Defaults to True.
205                           Can be supplied as a list to apply gain to individual channels.
206        :return: a list of cropped images from the scan in the order of the channels.
207        """
208        frames = Frame.get_frames(self.tile, channels)
209        if isinstance(apply_gain, bool):
210            apply_gain = [apply_gain] * len(frames)
211        images = [f.get_image(input_path, a) for f, a in zip(frames, apply_gain)]
212        return self.crop_images(images, crop_size, in_pixels)
213
214    @classmethod
215    def extract_images_for_list(
216        cls,
217        events: list[Self],
218        crop_size: int | list[int] = 75,
219        in_pixels: bool = True,
220        input_path: str = None,
221        channels: Iterable[int | str] = None,
222        apply_gain: bool | Iterable[bool] = True,
223    ) -> list[list[np.ndarray]]:
224        """
225        Get the images for a list of events, ensuring that there is no wasteful reading
226        of the same tile multiple times. This function is more efficient than calling
227        extract_images() for each event.
228        :param events: the events to extract images for.
229        :param crop_size: the square size of the image crop to get for each event.
230                          Can be a list with one size per event. Defaults to 75.
231        :param in_pixels: whether the crop size is in pixels or micrometers.
232                          Defaults to pixels.
233        :param input_path: the path to the input images. Will only work for lists of events
234                           from the same scan. Defaults to None (uses the scan's path).
235        :param channels: the channels to extract images for. Defaults to all channels.
236        :param apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. Defaults to True.
237                           Can be supplied as a list to apply gain to individual channels.
238        :return: a list of lists of cropped images for each event.
239        """
240        # Validation
241        if len(events) == 0:
242            return []
243        if isinstance(crop_size, int):
244            crop_size = [crop_size] * len(events)
245
246        # Get the order of the events when sorted by slide/tile
247        order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
248
249        # Allocate the list to size
250        crops = [[]] * len(events)
251        last_tile = None
252        images = None  # Holds large numpy arrays, so expensive to compare
253        # Iterate through in slide/tile sorted order
254        for i in order:
255            if last_tile != events[i].tile:
256                # Gather the frame images, preserving them for the next event
257                frames = Frame.get_frames(events[i].tile, channels)
258                if isinstance(apply_gain, bool):
259                    gain_list = [apply_gain] * len(frames)
260                else:
261                    gain_list = apply_gain
262                images = [f.get_image(input_path, a) for f, a in zip(frames, gain_list)]
263                last_tile = events[i].tile
264            # Use the frame images to crop the event images
265            crops[i] = events[i].crop_images(images, crop_size[i], in_pixels)
266        return crops

A class that represents a single event in a scan, making it easy to evaluate singular events. Required metadata is exposed as attributes, and optional metadata and features are stored as DataFrames.

Event( scan: csi_images.csi_scans.Scan, tile: csi_images.csi_tiles.Tile, x: int, y: int, metadata: pandas.core.series.Series = None, features: pandas.core.series.Series = None)
SCAN_TO_SLIDE_TRANSFORM = {Type.AXIOSCAN7: array([[1, 0, 75000], [0, 1, 0], [0, 0, 1]]), Type.BZSCANNER: array([[0, -1, 75000], [-1, 0, 25000], [0, 0, 1]])}

Homogeneous transformation matrices for converting between scanner and slide coordinates. The matrices are 3x3, with the final column representing the translation in micrometers (um). For more information, see affine transformations: https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations

Transformations are nominal, and accuracy is not guaranteed; this is due to imperfections in slides and alignment in the scanners. Units are in micrometers.
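
For example, applying the BZScanner matrix above to a scanner-frame position is a single matrix-vector product in homogeneous coordinates. A minimal sketch with a made-up position, mirroring what get_slide_position() does internally:

    import numpy as np

    # BZScanner scan-to-slide transform, copied from SCAN_TO_SLIDE_TRANSFORM above
    transform = np.array([[0, -1, 75000], [-1, 0, 25000], [0, 0, 1]])

    # A scanner-frame position in micrometers, as a homogeneous column vector
    scan_position = np.array([[30000.0], [10000.0], [1.0]])

    slide_position = transform @ scan_position
    print(float(slide_position[0][0]), float(slide_position[1][0]))  # 65000.0 -5000.0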

scan
tile
x
y
metadata
features
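
As a minimal sketch, assuming scan and tile are a Scan and a Tile you have already loaded (see the csi_scans and csi_tiles pages), constructing and printing an Event looks like this; the pixel position and metadata values are hypothetical:

    import pandas as pd
    from csi_images.csi_events import Event

    event = Event(
        scan,  # an existing Scan (assumed)
        tile,  # an existing Tile (assumed)
        x=100,
        y=200,
        metadata=pd.Series({"cell_id": 42}),
    )
    print(event)  # "<slide_id>-<tile number>-100-200", per __repr__
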
def get_scan_position(self) -> tuple[float, float]:

Get the position of the event in the scanner's coordinate frame.

Returns

the scan position of the event in micrometers (um).

def get_slide_position(self) -> tuple[float, float]:

Get the slide position of the event in micrometers (um).

Returns

the slide position of the event.
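
A short usage sketch, assuming an existing event as above:

    # Scanner-frame position in micrometers (um)
    x_um, y_um = event.get_scan_position()

    # Slide-frame position in micrometers, via SCAN_TO_SLIDE_TRANSFORM
    slide_x_um, slide_y_um = event.get_slide_position()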

def crop_images( self, images: Iterable[numpy.ndarray], crop_size: int = 100, in_pixels: bool = True) -> list[numpy.ndarray]:

Get the event crops from the frame images. Unlike extract_images(), it does not need to read anything from file, so it is very quick when cropping multiple events from the same tile. Use this if you're interested in many events.

Parameters
  • images: the frame images.
  • crop_size: the square size of the image crop to get for this event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns

crop_size x crop_size crops of the event in the provided frames. If the event is too close to the edge, the crop is zero-padded and off-center.
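
The intended pattern is to read a tile's frame images once, then crop many events from the in-memory arrays. A sketch assuming events_on_tile is a list of events sharing one tile, with the get_image() arguments mirroring the calls in extract_images below:

    from csi_images.csi_frames import Frame

    # Read the tile's frame images once (all channels, gain applied)
    frames = Frame.get_frames(tile, None)
    images = [frame.get_image(None, True) for frame in frames]

    # Crop every event on this tile from the already-loaded images
    all_crops = [e.crop_images(images, crop_size=100) for e in events_on_tile]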

def extract_images( self, crop_size: int = 100, in_pixels: bool = True, input_path: str = None, channels: Iterable[int | str] = None, apply_gain: Union[bool, Iterable[bool]] = True) -> list[numpy.ndarray]:

Extract the images from the scan and tile, reading from the file. Called "extract" because it must read and extract the images from file, which is slow. Use this if you're interested in only a few events, as it is inefficient when reading multiple events from the same tile.

Parameters
  • crop_size: the square size of the image crop to get for this event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
  • input_path: the path to the input images. Defaults to None (uses the scan's path).
  • channels: the channels to extract images for. Defaults to all channels.
  • apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. Defaults to True. Can be supplied as a list to apply gain to individual channels.
Returns

a list of cropped images from the scan in the order of the channels.
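
For a one-off event, a single call reads the tile from file and returns the crops; a sketch with a hypothetical 50 um crop:

    # 50 um square crops, one per channel, read directly from the scan files
    crops = event.extract_images(crop_size=50, in_pixels=False)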

@classmethod
def extract_images_for_list( cls, events: list[typing.Self], crop_size: int | list[int] = 75, in_pixels: bool = True, input_path: str = None, channels: Iterable[int | str] = None, apply_gain: Union[bool, Iterable[bool]] = True) -> list[list[numpy.ndarray]]:

Get the images for a list of events, ensuring that there is no wasteful reading of the same tile multiple times. This function is more efficient than calling extract_images() for each event.

Parameters
  • events: the events to extract images for.
  • crop_size: the square size of the image crop to get for each event. Can be a list with one size per event. Defaults to 75.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
  • input_path: the path to the input images. Will only work for lists of events from the same scan. Defaults to None (uses the scan's path).
  • channels: the channels to extract images for. Defaults to all channels.
  • apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. Defaults to True. Can be supplied as a list to apply gain to individual channels.
Returns

a list of lists of cropped images for each event.
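
A sketch for a list of events spanning several tiles; each distinct tile is read from file only once:

    from csi_images.csi_events import Event

    crops_per_event = Event.extract_images_for_list(events, crop_size=75)
    first_event_first_channel = crops_per_event[0][0]  # a 75x75 numpy array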

class EventArray:
 269class EventArray:
 270    """
 271    A class that holds a large number of events' data, making it easy to analyze and
 272    manipulate many events at once. A more separated version of the Event class.
 273    """
 274
 275    INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y"]
 276
 277    def __init__(
 278        self,
 279        info: pd.DataFrame = None,
 280        metadata: pd.DataFrame = None,
 281        features: pd.DataFrame = None,
 282    ):
 283        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y"
 284        if info is not None:
 285            if list(info.columns) != self.INFO_COLUMNS:
 286                raise ValueError(
 287                    'EventArray.info must have columns "slide_id", "tile", "roi", "x", "y"'
 288                )
 289            # Copy first to avoid modifying the original
 290            info = info.copy()
 291            # Ensure that the columns are the right types
 292            info["slide_id"] = info["slide_id"].astype(str)
 293            info["tile"] = info["tile"].astype(np.uint16)
 294            info["roi"] = info["roi"].astype(np.uint8)
 295            info["x"] = info["x"].round().astype(np.uint16)
 296            info["y"] = info["y"].round().astype(np.uint16)
 297        # All DataFrames must all have the same number of rows
 298        if metadata is not None and (info is None or len(info) != len(metadata)):
 299            raise ValueError(
 300                "If EventArray.metadata is not None, it should match rows with .info"
 301            )
 302        if features is not None and (info is None or len(info) != len(features)):
 303            raise ValueError(
 304                "If EventArray.features is not None, it should match rows with .info"
 305            )
 306        # No column names may start with "metadata_" or "features_", or equal "none"
 307        column_names = []
 308        if metadata is not None:
 309            column_names += metadata.columns.tolist()
 310        if features is not None:
 311            column_names += features.columns.tolist()
 312        if any([col.lower().startswith("metadata_") for col in column_names]):
 313            raise ValueError("EventArray column names cannot start with 'metadata_'")
 314        if any([col.lower().startswith("features_") for col in column_names]):
 315            raise ValueError("EventArray column names cannot start with 'features_'")
 316        if any([col.lower() == "none" for col in column_names]):
 317            raise ValueError("EventArray column names cannot be 'none'")
 318
 319        self.info = info
 320        self.metadata = metadata
 321        self.features = features
 322
 323    def __len__(self) -> int:
 324        # Convenience method to get the number of events
 325        if self.info is None:
 326            return 0
 327        else:
 328            return len(self.info)
 329
 330    def __eq__(self, other):
 331        is_equal = True
 332        # Parse all possibilities for info
 333        if isinstance(self.info, pd.DataFrame):
 334            if isinstance(other.info, pd.DataFrame):
 335                is_equal = self.info.equals(other.info)
 336                if not is_equal:
 337                    return False
 338            else:
 339                return False
 340        elif self.info is None:
 341            if other.info is not None:
 342                return False
 343
 344        # Parse all possibilities for metadata
 345        if isinstance(self.metadata, pd.DataFrame):
 346            if isinstance(other.metadata, pd.DataFrame):
 347                is_equal = self.metadata.equals(other.metadata)
 348                if not is_equal:
 349                    return False
 350            else:
 351                return False
 352        elif self.metadata is None:
 353            if other.metadata is not None:
 354                return False
 355
 356        # Parse all possibilities for features
 357        if isinstance(self.features, pd.DataFrame):
 358            if isinstance(other.features, pd.DataFrame):
 359                is_equal = self.features.equals(other.features)
 360                if not is_equal:
 361                    return False
 362            else:
 363                return False
 364        elif self.features is None:
 365            if other.features is not None:
 366                return False
 367
 368        return is_equal
 369
 370    def get_sort_order(
 371        self, by: Hashable | Sequence[Hashable], ascending: bool | Sequence[bool] = True
 372    ):
 373        """
 374        Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.
 375        :param by: name of the column(s) to sort by.
 376        :param ascending: whether to sort in ascending order; can be one bool per column in by
 377        :return: the order of the indices to sort by.
 378        """
 379        columns = self.get(by)
 380        return columns.sort_values(by=by, ascending=ascending).index
 381
 382    def sort(
 383        self,
 384        by: Hashable | Sequence[Hashable],
 385        ascending: bool | Sequence[bool] = True,
 386    ) -> Self:
 387        """
 388        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
 389        :param by: name of the column(s) to sort by.
 390        :param ascending: whether to sort in ascending order; can be one bool per column in by
 391        :return: a new, sorted EventArray.
 392        """
 393        order = self.get_sort_order(by, ascending)
 394        info = self.info.loc[order].reset_index(drop=True)
 395        if self.metadata is not None:
 396            metadata = self.metadata.loc[order].reset_index(drop=True)
 397        else:
 398            metadata = None
 399        if self.features is not None:
 400            features = self.features.loc[order].reset_index(drop=True)
 401        else:
 402            features = None
 403        return EventArray(info, metadata, features)
 404
 405    def get(self, column_names: Hashable | Sequence[Hashable]) -> pd.DataFrame:
 406        """
 407        Get a DataFrame with the specified columns from the EventArray, by value.
 408        :param column_names: the names of the columns to get.
 409        :return: a DataFrame with the specified columns.
 410        """
 411        if isinstance(column_names, Hashable):
 412            column_names = [column_names]  # Drop into a list for the loop
 413        columns = []
 414        for column_name in column_names:
 415            if column_name in self.info.columns:
 416                columns.append(self.info[column_name])
 417            elif self.metadata is not None and column_name in self.metadata.columns:
 418                columns.append(self.metadata[column_name])
 419            elif self.features is not None and column_name in self.features.columns:
 420                columns.append(self.features[column_name])
 421            else:
 422                raise ValueError(f"Column {column_name} not found in EventArray")
 423        return pd.concat(columns, axis=1)
 424
 425    def rows(self, rows: Sequence[Hashable]) -> Self:
 426        """
 427        Get a subset of the EventArray rows based on a boolean or integer index, by value.
 428        :param rows: row labels, indices, or boolean mask; anything for .loc[]
 429        :return: a new EventArray with the subset of events.
 430        """
 431        info = self.info.loc[rows].reset_index(drop=True)
 432        if self.metadata is not None:
 433            metadata = self.metadata.loc[rows].reset_index(drop=True)
 434        else:
 435            metadata = None
 436        if self.features is not None:
 437            features = self.features.loc[rows].reset_index(drop=True)
 438        else:
 439            features = None
 440        return EventArray(info, metadata, features)
 441
 442    def copy(self) -> Self:
 443        """
 444        Create a deep copy of the EventArray.
 445        :return: a deep copy of the EventArray.
 446        """
 447        return EventArray(
 448            info=self.info.copy(),
 449            metadata=None if self.metadata is None else self.metadata.copy(),
 450            features=None if self.features is None else self.features.copy(),
 451        )
 452
 453    # TODO: add a "filter" convenience function that takes a column name and values to filter by
 454
 455    def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None:
 456        """
 457        Add metadata to the EventArray. Removes the need to check if metadata is None.
 458        Overwrites any existing metadata with the same column names as the new metadata.
 459        :param new_metadata: the metadata to add.
 460        """
 461        if len(self) != len(new_metadata):
 462            raise ValueError("New metadata must match length of existing info")
 463
 464        if self.metadata is None:
 465            self.metadata = new_metadata
 466        else:
 467            if isinstance(new_metadata, pd.Series):
 468                self.metadata[new_metadata.name] = new_metadata
 469            else:
 470                # It's a DataFrame
 471                self.metadata[new_metadata.columns] = new_metadata
 472
 473    def add_features(self, new_features: pd.Series | pd.DataFrame) -> None:
 474        """
 475        Add features to the EventArray. Removes the need to check if features is None.
 476        Overwrites any existing features with the same column names as the new features.
 477        :param new_features: the features to add.
 478        """
 479        if len(self) != len(new_features):
 480            raise ValueError("New features must match length of existing info")
 481
 482        if self.features is None:
 483            self.features = new_features
 484        else:
 485            if isinstance(new_features, pd.Series):
 486                self.features[new_features.name] = new_features
 487            else:
 488                # It's a DataFrame
 489                self.features[new_features.columns] = new_features
 490
 491    @classmethod
 492    def merge(cls, events: Iterable[Self]) -> Self:
 493        """
 494        Combine EventArrays in a list into a single EventArray.
 495        :param events: the EventArrays to merge together.
 496        """
 497        all_info = []
 498        all_metadata = []
 499        all_features = []
 500        for event_array in events:
 501            # Skip empty EventArrays
 502            if event_array.info is not None:
 503                all_info.append(event_array.info)
 504            if event_array.metadata is not None:
 505                all_metadata.append(event_array.metadata)
 506            if event_array.features is not None:
 507                all_features.append(event_array.features)
 508        if len(all_info) == 0:
 509            return EventArray()
 510        else:
 511            all_info = pd.concat(all_info, ignore_index=True)
 512        if len(all_metadata) == 0:
 513            all_metadata = None
 514        else:
 515            all_metadata = pd.concat(all_metadata, ignore_index=True)
 516        if len(all_features) == 0:
 517            all_features = None
 518        else:
 519            all_features = pd.concat(all_features, ignore_index=True)
 520
 521        return EventArray(all_info, all_metadata, all_features)
 522
 523    def to_events(
 524        self,
 525        scans: Scan | Iterable[Scan],
 526        ignore_missing_scans=True,
 527        ignore_metadata=False,
 528        ignore_features=False,
 529    ) -> list[Event]:
 530        """
 531        Get the events in the EventArray as a list of events.
 532        :param scans: the scans that the events belong to, auto-matched by slide_id.
 533        Pass None if you don't care about scan metadata (requires ignore_missing_scans).
 534        :param ignore_missing_scans: whether to create placeholder scans for events without scans.
 535        :param ignore_metadata: whether to ignore metadata or not.
 536        :param ignore_features: whether to ignore features or not.
 537        :return: a list of Event objects.
 538        """
 539        if isinstance(scans, Scan):
 540            scans = [scans]
 541        scans = {scan.slide_id: scan for scan in (scans or [])}
 542        events = []
 543        for i in range(len(self.info)):
 544            # Determine the associated scan
 545            slide_id = self.info["slide_id"][i]
 546            if slide_id not in scans:
 547                if ignore_missing_scans:
 548                    # Create a placeholder scan if the scan is missing
 549                    scan = Scan.make_placeholder(
 550                        slide_id,
 551                        self.info["tile"][i],
 552                        self.info["roi"][i],
 553                    )
 554                else:
 555                    raise ValueError(
 556                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
 557                    )
 558            else:
 559                scan = scans[slide_id]
 560
 561            # Prepare the metadata and features
 562            if ignore_metadata or self.metadata is None:
 563                metadata = None
 564            else:
 565                # This Series creation method is less efficient,
 566                # but required for preserving dtypes
 567                metadata = pd.Series(
 568                    {col: self.metadata.loc[i, col] for col in self.metadata.columns},
 569                    dtype=object,
 570                )
 571            if ignore_features or self.features is None:
 572                features = None
 573            else:
 574                features = pd.Series(
 575                    {col: self.features.loc[i, col] for col in self.features.columns},
 576                    dtype=object,
 577                )
 578            # Create the event and append it to the list
 579            events.append(
 580                Event(
 581                    scan,
 582                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
 583                    self.info["x"][i],
 584                    self.info["y"][i],
 585                    metadata=metadata,
 586                    features=features,
 587                )
 588            )
 589        return events
 590
 591    @classmethod
 592    def from_events(cls, events: Iterable[Event]) -> Self:
 593        """
 594        Create an EventArray from a list of Event objects.
 595        :param events: the new list of events.
 596        """
 597        info = pd.DataFrame(
 598            {
 599                "slide_id": [event.scan.slide_id for event in events],
 600                "tile": [event.tile.n for event in events],
 601                "roi": [event.tile.n_roi for event in events],
 602                "x": [event.x for event in events],
 603                "y": [event.y for event in events],
 604            }
 605        )
 606        metadata_list = [event.metadata for event in events]
 607        # Iterate through and ensure that all metadata is the same shape
 608        for metadata in metadata_list:
 609            if type(metadata) != type(metadata_list[0]):
 610                raise ValueError("All metadata must be the same type.")
 611            if metadata is not None and metadata.shape != metadata_list[0].shape:
 612                raise ValueError("All metadata must be the same shape.")
 613        if metadata_list[0] is None:
 614            metadata = None
 615        else:
 616            metadata = pd.DataFrame(metadata_list)
 617        features_list = [event.features for event in events]
 618        # Iterate through and ensure that all features are the same shape
 619        for features in features_list:
 620            if type(features) != type(features_list[0]):
 621                raise ValueError("All features must be the same type.")
 622            if features is not None and features.shape != features_list[0].shape:
 623                raise ValueError("All features must be the same shape.")
 624        if features_list[0] is None:
 625            features = None
 626        else:
 627            features = pd.DataFrame(features_list)
 628        return EventArray(info=info, metadata=metadata, features=features)
 629
 630    def to_dataframe(self) -> pd.DataFrame:
 631        """
 632        Convert all the data in the EventArray to a single DataFrame.
 633        :return: a DataFrame with all the data in the EventArray.
 634        """
 635        # Start with a copy of the info DataFrame; its column names are kept as-is
 636        output = self.info.copy()
 637        # Combine with the metadata and prepend "metadata_" to the column names
 638        if self.metadata is not None:
 639            metadata = self.metadata.copy()
 640            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
 641            output = pd.concat([output, metadata], axis=1)
 642        # Combine with the features and prepend "features_" to the column names
 643        if self.features is not None:
 644            features = self.features.copy()
 645            features.columns = [f"features_{col}" for col in features.columns]
 646            output = pd.concat([output, features], axis=1)
 647        return output
 648
 649    @classmethod
 650    def from_dataframe(cls, df) -> Self:
 651        """
 652        Create an EventArray from a single DataFrame, as produced by to_dataframe().
 653        :return: an EventArray holding the DataFrame's info, metadata, and features.
 654        """
 655        # Split the columns into info, metadata, and features and strip prefix
 656        info = df[[col for col in df.columns if col in cls.INFO_COLUMNS]].copy()
 657        if info.size == 0:
 658            info = None
 659        metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy()
 660        metadata.columns = [col.replace("metadata_", "") for col in metadata.columns]
 661        if metadata.size == 0:
 662            metadata = None
 663        features = df[[col for col in df.columns if col.startswith("features_")]].copy()
 664        features.columns = [col.replace("features_", "") for col in features.columns]
 665        if features.size == 0:
 666            features = None
 667        return cls(info=info, metadata=metadata, features=features)
 668
 669    @classmethod
 670    def from_mask(
 671        cls,
 672        mask: np.ndarray,
 673        slide_id: str,
 674        tile_n: int,
 675        n_roi: int = 0,
 676        include_cell_id: bool = True,
 677        images: list[np.ndarray] = None,
 678        image_labels: list[str] = None,
 679        properties: list[str] = None,
 680    ) -> Self:
 681        """
 682        Extract events from a mask DataFrame, including metadata and features.
 683        :param mask: the mask to extract events from.
 684        :param slide_id: the slide ID the mask is from.
 685        :param tile_n: the tile number the mask is from.
 686        :param n_roi: the ROI number the mask is from.
 687        :param include_cell_id: whether to include the cell_id, or numerical
 688        mask label, as metadata in the EventArray.
 689        :param images: the intensity images to extract features from.
 690        :param image_labels: the labels for the intensity images.
 691        :param properties: list of properties to extract in addition to the defaults.
 692        :return: EventArray corresponding to the mask labels.
 693        """
 694        if extract_mask_info is None:
 695            raise ModuleNotFoundError(
 696                "csi_images.csi_images dependencies not installed. Install csi-images "
 697                "with [imageio] option to resolve."
 698            )
 699        # Gather mask_info
 700        if images is not None and image_labels is not None:
 701            if len(images) != len(image_labels):
 702                raise ValueError("Intensity images and labels must match lengths.")
 703
 704        mask_info = extract_mask_info(mask, images, image_labels, properties)
 705
 706        if len(mask_info) == 0:
 707            return EventArray()
 708
 709        # Combine provided info and mask info
 710        info = pd.DataFrame(
 711            {
 712                "slide_id": slide_id,
 713                "tile": tile_n,
 714                "roi": n_roi,
 715                "x": mask_info["x"],
 716                "y": mask_info["y"],
 717            },
 718        )
 719        # Extract a metadata column if desired
 720        if include_cell_id:
 721            metadata = pd.DataFrame({"cell_id": mask_info["id"]})
 722        else:
 723            metadata = None
 724        # If any additional properties were extracted, add them as features
 725        mask_info = mask_info.drop(columns=["id", "x", "y"], errors="ignore")
 726        if len(mask_info.columns) > 0:
 727            features = mask_info
 728        else:
 729            features = None
 730        return EventArray(info, metadata, features)
 731
 732    def save_csv(self, output_path: str) -> bool:
 733        """
 734        Save the events to a CSV file, including metadata and features.
 735        :param output_path: the file path to save the CSV file to.
 736        :return: whether the file exists after saving.
 737        """
 738        self.to_dataframe().to_csv(output_path, index=False)
 739        return os.path.exists(output_path)
 740
 741    @classmethod
 742    def load_csv(cls, input_path: str) -> Self:
 743        """
 744        Load the events from a CSV file, including metadata and features.
 745        :param input_path: the file path of the CSV file to load.
 746        :return: an EventArray with the loaded events.
 747        """
 748        # Load the CSV file
 749        df = pd.read_csv(input_path)
 750        return cls.from_dataframe(df)
 751
 752    def save_hdf5(self, output_path: str) -> bool:
 753        """
 754        Save the events to an HDF5 file, including metadata and features.
 755        Uses the pandas-provided HDF5 functions for ease, and external compatibility,
 756        though these files are slightly harder to view in HDFView or similar.
 757        :param output_path: the file path to save the HDF5 file to.
 758        :return: whether the file exists after saving.
 759        """
 760        # Open the output_path as an HDF5 file
 761        with pd.HDFStore(output_path) as store:
 762            # Store the dataframes in the HDF5 file
 763            if self.info is not None:
 764                store.put("info", self.info, index=False)
 765            if self.metadata is not None:
 766                store.put("metadata", self.metadata, index=False)
 767            if self.features is not None:
 768                store.put("features", self.features, index=False)
 769        return os.path.exists(output_path)
 770
 771    @classmethod
 772    def load_hdf5(cls, input_path: str) -> Self:
 773        """
 774        Load the events from an HDF5 file, including metadata and features.
 775        :param input_path: the file path of the HDF5 file to load.
 776        :return: an EventArray with the loaded events.
 777        """
 778        # Open the input_path as an HDF5 file
 779        with pd.HDFStore(input_path, "r") as store:
 780            # Load the dataframes from the HDF5 file
 781            info = store.get("info") if "info" in store else None
 782            metadata = store.get("metadata") if "metadata" in store else None
 783            features = store.get("features") if "features" in store else None
 784        return cls(info=info, metadata=metadata, features=features)
 785
 786    def save_ocular(self, output_path: str, event_type: str = "cells"):
 787        """
 788        Save the events to an OCULAR file. Relies on the dataframe originating
 789        from an OCULAR file (same columns; duplicate metadata/info).
 790        :param output_path: the directory to save the OCULAR files into.
 791        :param event_type: "cells" or "others", determining the output file names.
 792        :return:
 793        """
 794        if pyreadr is None:
 795            raise ModuleNotFoundError(
 796                "pyreadr not installed. Install pyreadr directly "
 797                "or install csi-images with [rds] option to resolve."
 798            )
 799        if event_type == "cells":
 800            file_stub = "rc-final"
 801        elif event_type == "others":
 802            file_stub = "others-final"
 803        else:
 804            raise ValueError("Invalid event type. Must be cells or others.")
 805
 806        # Ensure good metadata
 807        metadata = pd.DataFrame(
 808            {
 809                "slide_id": self.info["slide_id"],
 810                "frame_id": self.info["tile"],
 811                "cell_id": (
 812                    self.metadata["cell_id"]
 813                    if self.metadata is not None and "cell_id" in self.metadata.columns
 814                    else range(len(self.info))
 815                ),
 816                "cellx": self.info["x"],
 817                "celly": self.info["y"],
 818            }
 819        )
 820        if self.metadata is not None:
 821            metadata[self.metadata.columns] = self.metadata.copy()
 822
 823        # Check for the "ocular_interesting" column
 824        if event_type == "cells":
 825            if "ocular_interesting" in metadata.columns:
 826                interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool)
 827            elif "hcpc" in metadata.columns:
 828                # Interesting cells don't get an hcpc designation, leaving them as -1
 829                interesting_rows = (
 830                    metadata["hcpc"].to_numpy() == -1
 831                )  # interesting cells
 832            else:
 833                interesting_rows = []
 834            if sum(interesting_rows) > 0:
 835                # Split the metadata into interesting and regular
 836                interesting_events = self.rows(interesting_rows)
 837                interesting_df = pd.concat(
 838                    [interesting_events.features, interesting_events.metadata], axis=1
 839                )
 840                data_events = self.rows(~interesting_rows)
 841                data_df = pd.concat(
 842                    [data_events.features, data_events.metadata], axis=1
 843                )
 844                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")
 845
 846                # Drop particular columns for "interesting"
 847                interesting_df = interesting_df.drop(
 848                    [
 849                        "clust",
 850                        "hcpc",
 851                        "frame_id",
 852                        "cell_id",
 853                        "unique_id",
 854                        "ocular_interesting",
 855                    ],
 856                    axis=1,
 857                    errors="ignore",
 858                )
 859                # Save both .csv and .rds
 860                interesting_stub = os.path.join(output_path, "ocular_interesting")
 861                interesting_df.to_csv(f"{interesting_stub}.csv")
 862                # Suppress pandas FutureWarning
 863                with warnings.catch_warnings():
 864                    warnings.simplefilter(action="ignore", category=FutureWarning)
 865                    pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df)
 866            else:
 867                data_df = pd.concat([self.features, metadata], axis=1)
 868        else:
 869            # Get all data and reset_index (will copy it)
 870            data_df = pd.concat([self.features, metadata], axis=1)
 871
 872        # Split based on cluster number to conform to *-final[1-4].rds
 873        n_clusters = max(data_df["clust"]) + 1
 874        split_idx = [round(i * n_clusters / 4) for i in range(5)]
 875        for i in range(4):
 876            subset = (split_idx[i] <= data_df["clust"]) & (
 877                data_df["clust"] < split_idx[i + 1]
 878            )
 879            data_df.loc[subset, "hcpc"] = i + 1
 880            subset = data_df[subset].reset_index(drop=True)
 881            # Suppress pandas FutureWarning
 882            with warnings.catch_warnings():
 883                warnings.simplefilter(action="ignore", category=FutureWarning)
 884                pyreadr.write_rds(
 885                    os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
 886                )
 887
 888        # Create new example cell strings
 889        data_df["example_cell_id"] = (
 890            data_df["slide_id"]
 891            + " "
 892            + data_df["frame_id"].astype(str)
 893            + " "
 894            + data_df["cell_id"].astype(str)
 895            + " "
 896            + data_df["cellx"].astype(int).astype(str)
 897            + " "
 898            + data_df["celly"].astype(int).astype(str)
 899        )
 900        # Find averagable data columns
 901        if "cellcluster_id" in data_df.columns:
 902            end_idx = data_df.columns.get_loc("cellcluster_id")
 903        else:
 904            end_idx = data_df.columns.get_loc("slide_id")
 905        avg_cols = data_df.columns[:end_idx].tolist()
 906        # Group by cluster and average
 907        data_df = data_df.groupby("clust").agg(
 908            **{col: (col, "mean") for col in avg_cols},
 909            count=("clust", "size"),  # count rows in each cluster
 910            example_cells=("example_cell_id", lambda x: ",".join(x)),
 911            hcpc=("hcpc", lambda x: x.iloc[0]),
 912        )
 913        data_df = data_df.reset_index()  # Do NOT drop, index is "clust"
 914        # Create new columns
 915        metadata = pd.DataFrame(
 916            {
 917                "count": data_df["count"],
 918                "example_cells": data_df["example_cells"],
 919                "clust": data_df["clust"].astype(int),
 920                "hcpc": data_df["hcpc"].astype(int),
 921                "id": data_df["clust"].astype(int).astype(str),
 922                "cccluster": "0",  # Dummy value
 923                "ccdistance": 0.0,  # Dummy value
 924                "rownum": list(range(len(data_df))),
 925                "framegroup": 0,  # Dummy value
 926            }
 927        )
 928        # Pad the features to 761 columns, as required by OCULAR reports
 929        additional_columns = range(len(avg_cols), 761)
 930        if len(additional_columns) > 0:
 931            padding = pd.DataFrame(
 932                np.zeros((len(data_df), len(additional_columns))),
 933                columns=[f"pad{i}" for i in additional_columns],
 934            )
 935            data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1)
 936        else:
 937            data_df = pd.concat([data_df[avg_cols], metadata], axis=1)
 938
 939        # Save the cluster data
 940        data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"))
 941        # Suppress pandas FutureWarning
 942        with warnings.catch_warnings():
 943            warnings.simplefilter(action="ignore", category=FutureWarning)
 944            pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)
 945
 946    @classmethod
 947    def load_ocular(
 948        cls,
 949        input_path: str,
 950        event_type="cells",
 951        cell_data_files=(
 952            "rc-final1.rds",
 953            "rc-final2.rds",
 954            "rc-final3.rds",
 955            "rc-final4.rds",
 956            "ocular_interesting.rds",
 957        ),
 958        others_data_files=(
 959            "others-final1.rds",
 960            "others-final2.rds",
 961            "others-final3.rds",
 962            "others-final4.rds",
 963        ),
 964        atlas_data_files=(
 965            "ocular_interesting.rds",
 966            "ocular_not_interesting.rds",
 967        ),
 968        drop_common_events=True,
 969        log=None,
 970    ) -> Self:
 971        """
 972        Load the events from OCULAR .rds files, including metadata and features.
 973        :param input_path: the OCULAR directory, or a single .rds file, to load from.
 974        :param event_type: "cells" or "others", determining which files to load.
 975        :param cell_data_files: the file names to load for "cells".
 976        :param others_data_files: the file names to load for "others".
 977        :param atlas_data_files: the files from which common events may be dropped.
 978        :param drop_common_events: whether to drop events classified as common cells.
 979        :param log: optional logger for progress and warnings.
 980        :return: an EventArray with the loaded events.
 981        """
 982        if pyreadr is None:
 983            raise ModuleNotFoundError(
 984                "pyreadr not installed. Install pyreadr directly "
 985                "or install csi-images with [rds] option to resolve."
 986            )
 987        # Check if the input path is a directory or a file
 988        if os.path.isfile(input_path):
 989            data_files = [os.path.basename(input_path)]
 990            input_path = os.path.dirname(input_path)
 991        elif event_type == "cells":
 992            data_files = cell_data_files
 993        elif event_type == "others":
 994            data_files = others_data_files
 995        else:
 996            raise ValueError("Invalid event type.")
 997
 998        # Load the data from the OCULAR files
 999        file_data = {}
1000        for file in data_files:
1001            file_path = os.path.join(input_path, file)
1002            if not os.path.isfile(file_path):
1003                if log is not None:
1004                    log.warning(f"{file} not found in {input_path}")
1005                continue
1006            file_data[file] = pyreadr.read_r(file_path)
1007            # Get the DataFrame associated with None (pyreadr dict quirk)
1008            file_data[file] = file_data[file][None]
1009            if len(file_data[file]) == 0:
1010                # File gets dropped from the dict
1011                file_data.pop(file)
1012                if log is not None:
1013                    log.warning(f"{file} has no cells")
1014                continue
1015
1016            if log is not None:
1017                log.debug(f"{file} has {len(file_data[file])} cells")
1018
1019            # Drop common cells if requested and in this file
1020            if (
1021                file in atlas_data_files
1022                and drop_common_events
1023                and "catalogue_classification" in file_data[file]
1024            ):
1025                common_cell_indices = (
1026                    file_data[file]["catalogue_classification"] == "common_cell"
1027                )
1028                if log is not None:
1029                    log.debug(
1030                        f"Dropping {int(pd.Series.sum(common_cell_indices))} "
1031                        f"common cells from {file}"
1032                    )
1033                file_data[file] = file_data[file][~common_cell_indices]
1034
1035            if len(file_data[file]) == 0:
1036                # File gets dropped from the dict
1037                file_data.pop(file)
1038                if log is not None:
1039                    log.warning(f"{file} has no cells after dropping common cells")
1040                continue
1041
1042            # Extract frame_id and cell_id
1043            # DAPI- events already have frame_id cell_id outside rowname
1044            if event_type == "cells" and "frame_id" not in file_data[file].columns:
1045                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
1046                # get frame_id cell_id from rownames column and split into two columns
1047                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
1048                if len(split_res.columns) != 2 and log is not None:
1049                    log.warning(
1050                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
1051                    )
1052                # then assign it back to the dataframe
1053                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
1054            # reset indexes since they can cause NaN values in concat
1055            file_data[file] = file_data[file].reset_index(drop=True)
1056
1057        # Merge the data from all files
1058        if len(file_data) == 0:
1059            return EventArray()
1060        elif len(file_data) == 1:
1061            data = [file_data[file] for file in file_data.keys()][0]
1062        else:
1063            data = pd.concat(file_data.values())
1064
1065        if log is not None:
1066            log.debug(f"Gathered a total of {len(data)} events")
1067
1068        # Others is missing the "slide_id". Insert it right before "frame_id" column
1069        if event_type == "others" and "slide_id" not in data.columns:
1070            if os.path.basename(input_path) == "ocular":
1071                slide_id = os.path.basename(os.path.dirname(input_path))
1072            else:
1073                slide_id = "UNKNOWN"
1074            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)
1075
1076        # Sort by ascending cell_id so the original entries are kept when deduplicating
1077        data = data.sort_values(by=["cell_id"], ascending=True)
1078        # Filter out duplicates by x & y
1079        data = data.assign(
1080            unique_id=data["slide_id"]
1081            + "_"
1082            + data["frame_id"].astype(str)
1083            + "_"
1084            + data["cellx"].astype(int).astype(str)
1085            + "_"
1086            + data["celly"].astype(int).astype(str)
1087        )
1088        data = data.drop_duplicates(subset=["unique_id"], keep="first")
1089        # Normal unique_id is with cell_id
1090        data = data.assign(
1091            unique_id=data["slide_id"]
1092            + "_"
1093            + data["frame_id"].astype(str)
1094            + "_"
1095            + data["cell_id"].astype(str)
1096        )
1097        data = data.reset_index(drop=True)
1098        # All columns up to "slide_id" are features; drop the "slide_id"
1099        features = data.loc[:, :"slide_id"].iloc[:, :-1]
1100        data = data.loc[:, "slide_id":]
1101        # Grab the info columns
1102        info = data[["slide_id", "frame_id", "cellx", "celly"]]
1103        info.columns = ["slide_id", "tile", "x", "y"]
1104        info = info.assign(roi=0)  # OCULAR only works on 1 ROI, as far as we know
1105        info = info[["slide_id", "tile", "roi", "x", "y"]]
1106        # Metadata has duplicate columns for later convenience
1107        metadata = data
1108        # Certain columns tend to be problematic with mixed data formats...
1109        for col in ["TRITC", "CY5", "FITC"]:
1110            if col in metadata:
1111                labels = {
1112                    "False": False,
1113                    "True": True,
1114                    "FALSE": False,
1115                    "TRUE": True,
1116                }
1117                metadata[col] = metadata[col].map(labels).astype(bool)
1118        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
1119            if col in metadata:
1120                metadata[col] = metadata[col].fillna(-1).astype(int)
1121        return EventArray(info, metadata, features)

A class that holds data for many events at once, making them easy to analyze and manipulate in bulk. It stores the same content as a list of Event objects, but split into aligned info, metadata, and features DataFrames.

EventArray( info: pandas.core.frame.DataFrame = None, metadata: pandas.core.frame.DataFrame = None, features: pandas.core.frame.DataFrame = None)
277    def __init__(
278        self,
279        info: pd.DataFrame = None,
280        metadata: pd.DataFrame = None,
281        features: pd.DataFrame = None,
282    ):
283        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y"
284        if info is not None:
285            if list(info.columns) != self.INFO_COLUMNS:
286                raise ValueError(
287                    'EventArray.info must have columns "slide_id", "tile", "roi", "x", "y"'
288                )
289            # Copy first to avoid modifying the original
290            info = info.copy()
291            # Ensure that the columns are the right types
292            info["slide_id"] = info["slide_id"].astype(str)
293            info["tile"] = info["tile"].astype(np.uint16)
294            info["roi"] = info["roi"].astype(np.uint8)
295            info["x"] = info["x"].round().astype(np.uint16)
296            info["y"] = info["y"].round().astype(np.uint16)
297        # All DataFrames must all have the same number of rows
298        if metadata is not None and (info is None or len(info) != len(metadata)):
299            raise ValueError(
300                "If EventArray.metadata is not None, it should match rows with .info"
301            )
302        if features is not None and (info is None or len(info) != len(features)):
303            raise ValueError(
304                "If EventArray.features is not None, it should match rows with .info"
305            )
306        # No columns named "metadata_", "features_", or "None"
307        column_names = []
308        if metadata is not None:
309            column_names += metadata.columns.tolist()
310        if features is not None:
311            column_names += features.columns.tolist()
312        if any([col.lower().startswith("metadata_") for col in column_names]):
313            raise ValueError("EventArray column names cannot start with 'metadata_'")
314        if any([col.lower().startswith("features_") for col in column_names]):
315            raise ValueError("EventArray column names cannot start with 'features_'")
316        if any([col.lower() == "none" for col in column_names]):
317            raise ValueError("EventArray column names cannot be 'none'")
318
319        self.info = info
320        self.metadata = metadata
321        self.features = features
INFO_COLUMNS = ['slide_id', 'tile', 'roi', 'x', 'y']
info
metadata
features
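
As a quick orientation, here is a minimal construction sketch; the slide ID and coordinates are made up for the example, and EventArray casts the info columns to the dtypes noted above:

    import pandas as pd

    from csi_images.csi_events import EventArray

    # Three hypothetical events on one slide
    info = pd.DataFrame(
        {
            "slide_id": ["SLIDE001", "SLIDE001", "SLIDE001"],
            "tile": [4, 4, 9],
            "roi": [0, 0, 0],
            "x": [120, 385, 77],
            "y": [64, 210, 330],
        }
    )
    events = EventArray(info)  # metadata and features can be attached later
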
def get_sort_order( self, by: Union[Hashable, Sequence[Hashable]], ascending: Union[bool, Sequence[bool]] = True):
370    def get_sort_order(
371        self, by: Hashable | Sequence[Hashable], ascending: bool | Sequence[bool] = True
372    ):
373        """
374        Get the sort order for the EventArray by column(s) in the info, metadata, or features DataFrames.
375        :param by: name of the column(s) to sort by.
376        :param ascending: whether to sort in ascending order; may be a list, one per column in by
377        :return: the order of the indices to sort by.
378        """
379        columns = self.get(by)
380        return columns.sort_values(by=by, ascending=ascending).index

Get the sort order for the EventArray by column(s) in the info, metadata, or features DataFrames.

Parameters
  • by: name of the column(s) to sort by.
  • ascending: whether to sort in ascending order; may be a list, one per column in by
Returns

the order of the indices to sort by.

def sort( self, by: Union[Hashable, Sequence[Hashable]], ascending: Union[bool, Sequence[bool]] = True) -> Self:
382    def sort(
383        self,
384        by: Hashable | Sequence[Hashable],
385        ascending: bool | Sequence[bool] = True,
386    ) -> Self:
387        """
388        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
389        :param by: name of the column(s) to sort by.
390        :param ascending: whether to sort in ascending order; may be a list, one per column in by
391        :return: a new, sorted EventArray.
392        """
393        order = self.get_sort_order(by, ascending)
394        info = self.info.loc[order].reset_index(drop=True)
395        if self.metadata is not None:
396            metadata = self.metadata.loc[order].reset_index(drop=True)
397        else:
398            metadata = None
399        if self.features is not None:
400            features = self.features.loc[order].reset_index(drop=True)
401        else:
402            features = None
403        return EventArray(info, metadata, features)

Sort the EventArray by column(s) in the info, metadata, or features DataFrames.

Parameters
  • by: name of the column(s) to sort by.
  • ascending: whether to sort in ascending order; may be a list, one per column in by
Returns

a new, sorted EventArray.
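
A short usage sketch, assuming an EventArray named events whose features include a hypothetical dapi_intensity column:

    # Sort by tile, then by descending DAPI intensity within each tile
    events = events.sort(["tile", "dapi_intensity"], ascending=[True, False])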

def get( self, column_names: Union[Hashable, Sequence[Hashable]]) -> pandas.core.frame.DataFrame:
405    def get(self, column_names: Hashable | Sequence[Hashable]) -> pd.DataFrame:
406        """
407        Get a DataFrame with the specified columns from the EventArray, by value.
408        :param column_names: the names of the columns to get.
409        :return: a DataFrame with the specified columns.
410        """
411        if isinstance(column_names, Hashable):
412            column_names = [column_names]  # Drop into a list for the loop
413        columns = []
414        for column_name in column_names:
415            if column_name in self.info.columns:
416                columns.append(self.info[column_name])
417            elif self.metadata is not None and column_name in self.metadata.columns:
418                columns.append(self.metadata[column_name])
419            elif self.features is not None and column_name in self.features.columns:
420                columns.append(self.features[column_name])
421            else:
422                raise ValueError(f"Column {column_name} not found in EventArray")
423        return pd.concat(columns, axis=1)

Get a DataFrame with the specified columns from the EventArray, by value.

Parameters
  • column_names: the names of the columns to get.
Returns

a DataFrame with the specified columns.
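
For example, columns can be pulled from info and features side by side (dapi_intensity is again a hypothetical feature column):

    coords = events.get(["slide_id", "tile", "x", "y", "dapi_intensity"])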

def rows(self, rows: Sequence[Hashable]) -> Self:
425    def rows(self, rows: Sequence[Hashable]) -> Self:
426        """
427        Get a subset of the EventArray rows based on a boolean or integer index, by value.
428        :param rows: row labels, indices, or boolean mask; anything for .loc[]
429        :return: a new EventArray with the subset of events.
430        """
431        info = self.info.loc[rows].reset_index(drop=True)
432        if self.metadata is not None:
433            metadata = self.metadata.loc[rows].reset_index(drop=True)
434        else:
435            metadata = None
436        if self.features is not None:
437            features = self.features.loc[rows].reset_index(drop=True)
438        else:
439            features = None
440        return EventArray(info, metadata, features)

Get a subset of the EventArray rows based on a boolean or integer index, by value.

Parameters
  • rows: row labels, indices, or boolean mask; anything for .loc[]
Returns

a new EventArray with the subset of events.
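
A boolean-mask sketch, using get() to build the mask:

    # Keep only the events on tile 4
    mask = events.get("tile")["tile"] == 4
    tile_4_events = events.rows(mask)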

def copy(self) -> Self:
442    def copy(self) -> Self:
443        """
444        Create a deep copy of the EventArray.
445        :return: a deep copy of the EventArray.
446        """
447        return EventArray(
448            info=self.info.copy(),
449            metadata=None if self.metadata is None else self.metadata.copy(),
450            features=None if self.features is None else self.features.copy(),
451        )

Create a deep copy of the EventArray.

Returns

a deep copy of the EventArray.

def add_metadata( self, new_metadata: pandas.core.series.Series | pandas.core.frame.DataFrame) -> None:
455    def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None:
456        """
457        Add metadata to the EventArray. Removes the need to check if metadata is None.
458        Overwrites any existing metadata with the same column names as the new metadata.
459        :param new_metadata: the metadata to add.
460        """
461        if len(self) != len(new_metadata):
462            raise ValueError("New metadata must match length of existing info")
463
464        if self.metadata is None:
465            self.metadata = pd.DataFrame(new_metadata)  # ensure a DataFrame, even from a Series
466        else:
467            if isinstance(new_metadata, pd.Series):
468                self.metadata[new_metadata.name] = new_metadata
469            else:
470                # It's a DataFrame
471                self.metadata[new_metadata.columns] = new_metadata

Add metadata to the EventArray. Removes the need to check if metadata is None. Overwrites any existing metadata with the same column names as the new metadata.

Parameters
  • new_metadata: the metadata to add.
def add_features( self, new_features: pandas.core.series.Series | pandas.core.frame.DataFrame) -> None:
473    def add_features(self, new_features: pd.Series | pd.DataFrame) -> None:
474        """
475        Add features to the EventArray. Removes the need to check if features is None.
476        Overwrites any existing features with the same column names as the new features.
477        :param new_features: the features to add.
478        """
479        if len(self) != len(new_features):
480            raise ValueError("New features must match length of existing info")
481
482        if self.features is None:
483            self.features = pd.DataFrame(new_features)  # ensure a DataFrame, even from a Series
484        else:
485            if isinstance(new_features, pd.Series):
486                self.features[new_features.name] = new_features
487            else:
488                # It's a DataFrame
489                self.features[new_features.columns] = new_features

Add features to the EventArray. Removes the need to check if features is None. Overwrites any existing features with the same column names as the new features.

Parameters
  • new_features: the features to add.
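
A sketch of attaching per-event columns to a three-event array; the run_id and model_score names are invented for the example:

    import pandas as pd

    # Both methods accept a named Series or a DataFrame of matching length
    events.add_metadata(pd.Series(["run_A"] * len(events), name="run_id"))
    events.add_features(pd.Series([0.12, 0.94, 0.38], name="model_score"))
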
@classmethod
def merge(cls, events: Iterable[Self]) -> Self:
491    @classmethod
492    def merge(cls, events: Iterable[Self]) -> Self:
493        """
494        Combine EventArrays in a list into a single EventArray.
495        :param events: the EventArrays to combine into one.
496        """
497        all_info = []
498        all_metadata = []
499        all_features = []
500        for event_array in events:
501            # Skip empty EventArrays
502            if event_array.info is not None:
503                all_info.append(event_array.info)
504            if event_array.metadata is not None:
505                all_metadata.append(event_array.metadata)
506            if event_array.features is not None:
507                all_features.append(event_array.features)
508        if len(all_info) == 0:
509            return EventArray()
510        else:
511            all_info = pd.concat(all_info, ignore_index=True)
512        if len(all_metadata) == 0:
513            all_metadata = None
514        else:
515            all_metadata = pd.concat(all_metadata, ignore_index=True)
516        if len(all_features) == 0:
517            all_features = None
518        else:
519            all_features = pd.concat(all_features, ignore_index=True)
520
521        return EventArray(all_info, all_metadata, all_features)

Combine EventArrays in a list into a single EventArray.

Parameters
  • events: the EventArrays to combine into one.
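
A merging sketch, where tile_events_a and tile_events_b stand in for EventArrays built elsewhere (e.g., one per tile):

    combined = EventArray.merge([tile_events_a, tile_events_b])
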
def to_events( self, scans: Union[csi_images.csi_scans.Scan, Iterable[csi_images.csi_scans.Scan]], ignore_missing_scans=True, ignore_metadata=False, ignore_features=False) -> list[Event]:
523    def to_events(
524        self,
525        scans: Scan | Iterable[Scan],
526        ignore_missing_scans=True,
527        ignore_metadata=False,
528        ignore_features=False,
529    ) -> list[Event]:
530        """
531        Get the events in the EventArray as a list of events.
532        :param scans: the scans that the events belong to, auto-matched by slide_id.
533        Pass None to use placeholder scans (leave ignore_missing_scans True).
534        :param ignore_missing_scans: whether to create blank scans for events without scans.
535        :param ignore_missing_scans: whether to create blank scans for events without scans.
536        :param ignore_metadata: whether to skip attaching metadata to each Event
537        :return: a list of Event objects.
538        """
539        if isinstance(scans, Scan):
540            scans = [scans]
541        scans = {scan.slide_id: scan for scan in (scans or [])}
542        events = []
543        for i in range(len(self.info)):
544            # Determine the associated scan
545            slide_id = self.info["slide_id"][i]
546            if slide_id not in scans:
547                if ignore_missing_scans:
548                    # Create a placeholder scan if the scan is missing
549                    scan = Scan.make_placeholder(
550                        slide_id,
551                        self.info["tile"][i],
552                        self.info["roi"][i],
553                    )
554                else:
555                    raise ValueError(
556                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
557                    )
558            else:
559                scan = scans[slide_id]
560
561            # Prepare the metadata and features
562            if ignore_metadata or self.metadata is None:
563                metadata = None
564            else:
565                # This Series creation method is less efficient,
566                # but required for preserving dtypes
567                metadata = pd.Series(
568                    {col: self.metadata.loc[i, col] for col in self.metadata.columns},
569                    dtype=object,
570                )
571            if ignore_features or self.features is None:
572                features = None
573            else:
574                features = pd.Series(
575                    {col: self.features.loc[i, col] for col in self.features.columns},
576                    dtype=object,
577                )
578            # Create the event and append it to the list
579            events.append(
580                Event(
581                    scan,
582                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
583                    self.info["x"][i],
584                    self.info["y"][i],
585                    metadata=metadata,
586                    features=features,
587                )
588            )
589        return events

Get the events in the EventArray as a list of events.

Parameters
  • scans: the scans that the events belong to, auto-matched by slide_id. Pass None to use placeholder scans (leave ignore_missing_scans True).
  • ignore_missing_scans: whether to create blank scans for events without scans.
  • ignore_metadata: whether to skip attaching metadata to each Event
  • ignore_features: whether to skip attaching features to each Event
Returns

a list of Event objects.
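
A sketch of converting without real scan objects; with no matching scans and ignore_missing_scans left True, placeholder scans are created:

    event_list = events.to_events(scans=[])
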
@classmethod
def from_events(cls, events: Iterable[Event]) -> Self:
591    @classmethod
592    def from_events(cls, events: Iterable[Event]) -> Self:
593        """
594        Create an EventArray from a list of events.
595        :param events: the events to convert into an EventArray.
596        """
597        info = pd.DataFrame(
598            {
599                "slide_id": [event.scan.slide_id for event in events],
600                "tile": [event.tile.n for event in events],
601                "roi": [event.tile.n_roi for event in events],
602                "x": [event.x for event in events],
603                "y": [event.y for event in events],
604            }
605        )
606        metadata_list = [event.metadata for event in events]
607        # Iterate through and ensure that all metadata is the same shape
608        for metadata in metadata_list:
609            if type(metadata) != type(metadata_list[0]):
610                raise ValueError("All metadata must be the same type.")
611            if metadata is not None and metadata.shape != metadata_list[0].shape:
612                raise ValueError("All metadata must be the same shape.")
613        if len(metadata_list) == 0 or metadata_list[0] is None:
614            metadata = None
615        else:
616            metadata = pd.DataFrame(metadata_list)
617        features_list = [event.features for event in events]
618        # Iterate through and ensure that all features are the same shape
619        for features in features_list:
620            if type(features) != type(features_list[0]):
621                raise ValueError("All features must be the same type.")
622            if features is not None and features.shape != features_list[0].shape:
623                raise ValueError("All features must be the same shape.")
624        if len(features_list) == 0 or features_list[0] is None:
625            features = None
626        else:
627            features = pd.DataFrame(features_list)
628        return EventArray(info=info, metadata=metadata, features=features)

Create an EventArray from a list of events.

Parameters
  • events: the events to convert into an EventArray.
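
The inverse of to_events(); a one-line sketch rebuilding an array from the list above:

    rebuilt = EventArray.from_events(event_list)
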
def to_dataframe(self) -> pandas.core.frame.DataFrame:
630    def to_dataframe(self) -> pd.DataFrame:
631        """
632        Convert all the data in the EventArray to a single DataFrame.
633        :return: a DataFrame with all the data in the EventArray.
634        """
635        # Make a copy of the info DataFrame; its columns keep their names as-is
636        output = self.info.copy()
637        # Combine with the metadata and prepend "metadata_" to the column names
638        if self.metadata is not None:
639            metadata = self.metadata.copy()
640            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
641            output = pd.concat([output, metadata], axis=1)
642        # Combine with the features and prepend "features_" to the column names
643        if self.features is not None:
644            features = self.features.copy()
645            features.columns = [f"features_{col}" for col in features.columns]
646            output = pd.concat([output, features], axis=1)
647        return output

Convert all the data in the EventArray to a single DataFrame.

Returns

a DataFrame with all the data in the EventArray.

@classmethod
def from_dataframe(cls, df) -> Self:
649    @classmethod
650    def from_dataframe(cls, df) -> Self:
651        """
652        Create an EventArray from a DataFrame in the to_dataframe() format.
653        :return: an EventArray holding the DataFrame's info, metadata, and features.
654        """
655        # Split the columns into info, metadata, and features and strip prefix
656        info = df[[col for col in df.columns if col in cls.INFO_COLUMNS]].copy()
657        if info.size == 0:
658            info = None
659        metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy()
660        metadata.columns = [col.replace("metadata_", "") for col in metadata.columns]
661        if metadata.size == 0:
662            metadata = None
663        features = df[[col for col in df.columns if col.startswith("features_")]].copy()
664        features.columns = [col.replace("features_", "") for col in features.columns]
665        if features.size == 0:
666            features = None
667        return cls(info=info, metadata=metadata, features=features)

Create an EventArray from a DataFrame in the to_dataframe() format.

Returns

an EventArray holding the DataFrame's info, metadata, and features.
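
A round-trip sketch through the flat representation:

    df = events.to_dataframe()  # columns: slide_id, ..., metadata_*, features_*
    restored = EventArray.from_dataframe(df)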

@classmethod
def from_mask( cls, mask: numpy.ndarray, slide_id: str, tile_n: int, n_roi: int = 0, include_cell_id: bool = True, images: list[numpy.ndarray] = None, image_labels: list[str] = None, properties: list[str] = None) -> Self:
669    @classmethod
670    def from_mask(
671        cls,
672        mask: np.ndarray,
673        slide_id: str,
674        tile_n: int,
675        n_roi: int = 0,
676        include_cell_id: bool = True,
677        images: list[np.ndarray] = None,
678        image_labels: list[str] = None,
679        properties: list[str] = None,
680    ) -> Self:
681        """
682        Extract events from a labeled mask image, including metadata and features.
683        :param mask: the mask to extract events from.
684        :param slide_id: the slide ID the mask is from.
685        :param tile_n: the tile number the mask is from.
686        :param n_roi: the ROI number the mask is from.
687        :param include_cell_id: whether to include the cell_id, or numerical
688        mask label, as metadata in the EventArray.
689        :param images: the intensity images to extract features from.
690        :param image_labels: the labels for the intensity images.
691        :param properties: list of properties to extract in addition to the defaults.
692        :return: EventArray corresponding to the mask labels.
693        """
694        if extract_mask_info is None:
695            raise ModuleNotFoundError(
696                "csi_images.csi_images dependencies not installed. Install csi-images "
697                "with [imageio] option to resolve."
698            )
699        # Gather mask_info
700        if images is not None and image_labels is not None:
701            if len(images) != len(image_labels):
702                raise ValueError("Intensity images and labels must match lengths.")
703
704        mask_info = extract_mask_info(mask, images, image_labels, properties)
705
706        if len(mask_info) == 0:
707            return EventArray()
708
709        # Combine provided info and mask info
710        info = pd.DataFrame(
711            {
712                "slide_id": slide_id,
713                "tile": tile_n,
714                "roi": n_roi,
715                "x": mask_info["x"],
716                "y": mask_info["y"],
717            },
718        )
719        # Extract a metadata column if desired
720        if include_cell_id:
721            metadata = pd.DataFrame({"cell_id": mask_info["id"]})
722        else:
723            metadata = None
724        # If any additional properties were extracted, add them as features
725        mask_info = mask_info.drop(columns=["id", "x", "y"], errors="ignore")
726        if len(mask_info.columns) > 0:
727            features = mask_info
728        else:
729            features = None
730        return EventArray(info, metadata, features)

Extract events from a labeled mask image, including metadata and features.

Parameters
  • mask: the mask to extract events from.
  • slide_id: the slide ID the mask is from.
  • tile_n: the tile number the mask is from.
  • n_roi: the ROI number the mask is from.
  • include_cell_id: whether to include the cell_id, or numerical mask label, as metadata in the EventArray.
  • images: the intensity images to extract features from.
  • image_labels: the labels for the intensity images.
  • properties: list of properties to extract in addition to the defaults.
Returns

EventArray corresponding to the mask labels.
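
A minimal sketch, assuming csi-images was installed with the [imageio] option; the mask labels are fabricated:

    import numpy as np

    from csi_images.csi_events import EventArray

    # 0 is background; labels 1 and 2 mark two hypothetical events
    mask = np.zeros((20, 20), dtype=np.uint16)
    mask[2:5, 3:6] = 1
    mask[12:16, 10:14] = 2
    events = EventArray.from_mask(mask, slide_id="SLIDE001", tile_n=0)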

def save_csv(self, output_path: str) -> bool:
732    def save_csv(self, output_path: str) -> bool:
733        """
734        Save the events to a CSV file, including metadata and features.
735        :param output_path: the file path to save the CSV file to.
736        :return: whether the file exists after attempting the save.
737        """
738        self.to_dataframe().to_csv(output_path, index=False)
739        return os.path.exists(output_path)

Save the events to a CSV file, including metadata and features.

Parameters
  • output_path: the file path to save the CSV file to.
Returns

whether the file exists after attempting the save.
@classmethod
def load_csv(cls, input_path: str) -> Self:
741    @classmethod
742    def load_csv(cls, input_path: str) -> Self:
743        """
744        Load the events from a CSV file, including metadata and features.
745        :param input_path: the file path of the CSV file to load.
746        :return: an EventArray with the loaded events.
747        """
748        # Load the CSV file
749        df = pd.read_csv(input_path)
750        return cls.from_dataframe(df)

Load the events from a CSV file, including metadata and features.

Parameters
  • input_path: the file path of the CSV file to load.
Returns

an EventArray with the loaded events.
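
A save/load sketch with a hypothetical file path:

    if events.save_csv("events.csv"):
        restored = EventArray.load_csv("events.csv")
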
def save_hdf5(self, output_path: str) -> bool:
752    def save_hdf5(self, output_path: str) -> bool:
753        """
754        Save the events to an HDF5 file, including metadata and features.
755        Uses the pandas-provided HDF5 functions for ease, and external compatibility,
756        though these files are slightly harder to view in HDFView or similar.
757        :param output_path: the file path to save the HDF5 file to.
758        :return: whether the file exists after attempting the save.
759        """
760        # Open the output_path as an HDF5 file
761        with pd.HDFStore(output_path) as store:
762            # Store the dataframes in the HDF5 file
763            if self.info is not None:
764                store.put("info", self.info, index=False)
765            if self.metadata is not None:
766                store.put("metadata", self.metadata, index=False)
767            if self.features is not None:
768                store.put("features", self.features, index=False)
769        return os.path.exists(output_path)

Save the events to an HDF5 file, including metadata and features. Uses the pandas-provided HDF5 functions for ease, and external compatibility, though these files are slightly harder to view in HDFView or similar.

Parameters
  • output_path: the file path to save the HDF5 file to.
Returns

whether the file exists after attempting the save.
@classmethod
def load_hdf5(cls, input_path: str) -> Self:
771    @classmethod
772    def load_hdf5(cls, input_path: str) -> Self:
773        """
774        Load the events from an HDF5 file, including metadata and features.
775        :param input_path: the file path of the HDF5 file to load.
776        :return: an EventArray with the loaded events.
777        """
778        # Open the input_path as an HDF5 file
779        with pd.HDFStore(input_path, "r") as store:
780            # Load the dataframes from the HDF5 file
781            info = store.get("info") if "info" in store else None
782            metadata = store.get("metadata") if "metadata" in store else None
783            features = store.get("features") if "features" in store else None
784        return cls(info=info, metadata=metadata, features=features)

Load the events from an HDF5 file, including metadata and features.

Parameters
  • input_path: the file path of the HDF5 file to load.
Returns

an EventArray with the loaded events.
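
The same round trip for HDF5 (pandas' HDFStore needs the PyTables package; the path is hypothetical):

    if events.save_hdf5("events.h5"):
        restored = EventArray.load_hdf5("events.h5")
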
def save_ocular(self, output_path: str, event_type: str = 'cells'):
786    def save_ocular(self, output_path: str, event_type: str = "cells"):
787        """
788        Save the events to an OCULAR file. Relies on the dataframe originating
789        from an OCULAR file (same columns; duplicate metadata/info).
790        :param output_path: the directory to save the OCULAR files into.
791        :param event_type: "cells" or "others".
792        :return: None; writes .rds and .csv files into output_path.
793        """
794        if pyreadr is None:
795            raise ModuleNotFoundError(
796                "pyreadr not installed. Install pyreadr directly "
797                "or install csi-images with [rds] option to resolve."
798            )
799        if event_type == "cells":
800            file_stub = "rc-final"
801        elif event_type == "others":
802            file_stub = "others-final"
803        else:
804            raise ValueError("Invalid event type. Must be cells or others.")
805
806        # Ensure good metadata
807        metadata = pd.DataFrame(
808            {
809                "slide_id": self.info["slide_id"],
810                "frame_id": self.info["tile"],
811                "cell_id": (
812                    self.metadata["cell_id"]
813                    if "cell_id" in self.metadata.columns
814                    else range(len(self.info))
815                ),
816                "cellx": self.info["x"],
817                "celly": self.info["y"],
818            }
819        )
820        if self.metadata is not None:
821            metadata[self.metadata.columns] = self.metadata.copy()
822
823        # Check for the "ocular_interesting" column
824        if event_type == "cells":
825            if "ocular_interesting" in metadata.columns:
826                interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool)
827            elif "hcpc" in metadata.columns:
828                # Interesting cells don't get an hcpc designation, leaving them as -1
829                interesting_rows = (
830                    metadata["hcpc"].to_numpy() == -1
831                )  # interesting cells
832            else:
833                interesting_rows = []
834            if sum(interesting_rows) > 0:
835                # Split the metadata into interesting and regular
836                interesting_events = self.rows(interesting_rows)
837                interesting_df = pd.concat(
838                    [interesting_events.features, interesting_events.metadata], axis=1
839                )
840                data_events = self.rows(~interesting_rows)
841                data_df = pd.concat(
842                    [data_events.features, data_events.metadata], axis=1
843                )
844                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")
845
846                # Drop particular columns for "interesting"
847                interesting_df = interesting_df.drop(
848                    [
849                        "clust",
850                        "hcpc",
851                        "frame_id",
852                        "cell_id",
853                        "unique_id",
854                        "ocular_interesting",
855                    ],
856                    axis=1,
857                    errors="ignore",
858                )
859                # Save both .csv and .rds
860                interesting_stub = os.path.join(output_path, "ocular_interesting")
861                interesting_df.to_csv(f"{interesting_stub}.csv")
862                # Suppress pandas FutureWarning
863                with warnings.catch_warnings():
864                    warnings.simplefilter(action="ignore", category=FutureWarning)
865                    pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df)
866            else:
867                data_df = pd.concat([self.features, metadata], axis=1)
868        else:
869            # Get all data and reset_index (will copy it)
870            data_df = pd.concat([self.features, metadata], axis=1)
871
872        # Split based on cluster number to conform to *-final[1-4].rds
873        n_clusters = max(data_df["clust"]) + 1
874        split_idx = [round(i * n_clusters / 4) for i in range(5)]
875        for i in range(4):
876            subset = (split_idx[i] <= data_df["clust"]) & (
877                data_df["clust"] < split_idx[i + 1]
878            )
879            data_df.loc[subset, "hcpc"] = i + 1
880            subset = data_df[subset].reset_index(drop=True)
881            # Suppress pandas FutureWarning
882            with warnings.catch_warnings():
883                warnings.simplefilter(action="ignore", category=FutureWarning)
884                pyreadr.write_rds(
885                    os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
886                )
887
888        # Create new example cell strings
889        data_df["example_cell_id"] = (
890            data_df["slide_id"]
891            + " "
892            + data_df["frame_id"].astype(str)
893            + " "
894            + data_df["cell_id"].astype(str)
895            + " "
896            + data_df["cellx"].astype(int).astype(str)
897            + " "
898            + data_df["celly"].astype(int).astype(str)
899        )
900        # Find averagable data columns
901        if "cellcluster_id" in data_df.columns:
902            end_idx = data_df.columns.get_loc("cellcluster_id")
903        else:
904            end_idx = data_df.columns.get_loc("slide_id")
905        avg_cols = data_df.columns[:end_idx].tolist()
906        # Group by cluster and average
907        data_df = data_df.groupby("clust").agg(
908            **{col: (col, "mean") for col in avg_cols},
909            count=("clust", "size"),  # count rows in each cluster
910            example_cells=("example_cell_id", lambda x: ",".join(x)),
911            hcpc=("hcpc", lambda x: x.iloc[0]),
912        )
913        data_df = data_df.reset_index()  # Do NOT drop, index is "clust"
914        # Create new columns
915        metadata = pd.DataFrame(
916            {
917                "count": data_df["count"],
918                "example_cells": data_df["example_cells"],
919                "clust": data_df["clust"].astype(int),
920                "hcpc": data_df["hcpc"].astype(int),
921                "id": data_df["clust"].astype(int).astype(str),
922                "cccluster": "0",  # Dummy value
923                "ccdistance": 0.0,  # Dummy value
924                "rownum": list(range(len(data_df))),
925                "framegroup": 0,  # Dummy value
926            }
927        )
928        # Need to pad the features to 761 columns, as per OCULAR report needs
929        additional_columns = range(len(avg_cols), 761)
930        if len(additional_columns) > 0:
931            padding = pd.DataFrame(
932                np.zeros((len(data_df), len(additional_columns))),
933                columns=[f"pad{i}" for i in additional_columns],
934            )
935            data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1)
936        else:
937            data_df = pd.concat([data_df[avg_cols], metadata], axis=1)
938
939        # Save the cluster data
940        data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"))
941        # Suppress pandas FutureWarning
942        with warnings.catch_warnings():
943            warnings.simplefilter(action="ignore", category=FutureWarning)
944            pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)

Save the events to an OCULAR file. Relies on the dataframe originating from an OCULAR file (same columns; duplicate metadata/info).

Parameters
  • output_path: the directory to save the OCULAR files into.
  • event_type: "cells" or "others".
Returns

None; writes .rds and .csv files into output_path.
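
A usage sketch, assuming events originated from OCULAR data (so the expected columns exist) and that the output directory already exists; the path is hypothetical:

    events.save_ocular("/path/to/slide/ocular", event_type="cells")
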
@classmethod
def load_ocular( cls, input_path: str, event_type='cells', cell_data_files=('rc-final1.rds', 'rc-final2.rds', 'rc-final3.rds', 'rc-final4.rds', 'ocular_interesting.rds'), others_data_files=('others-final1.rds', 'others-final2.rds', 'others-final3.rds', 'others-final4.rds'), atlas_data_files=('ocular_interesting.rds', 'ocular_not_interesting.rds'), drop_common_events=True, log=None) -> Self:
 946    @classmethod
 947    def load_ocular(
 948        cls,
 949        input_path: str,
 950        event_type="cells",
 951        cell_data_files=(
 952            "rc-final1.rds",
 953            "rc-final2.rds",
 954            "rc-final3.rds",
 955            "rc-final4.rds",
 956            "ocular_interesting.rds",
 957        ),
 958        others_data_files=(
 959            "others-final1.rds",
 960            "others-final2.rds",
 961            "others-final3.rds",
 962            "others-final4.rds",
 963        ),
 964        atlas_data_files=(
 965            "ocular_interesting.rds",
 966            "ocular_not_interesting.rds",
 967        ),
 968        drop_common_events=True,
 969        log=None,
 970    ) -> Self:
 971        """
 972        Load the events from OCULAR .rds files, including metadata and features.
 973        :param input_path: the OCULAR directory, or a single .rds file, to load from.
 974        :param event_type: "cells" or "others".
 975        :param cell_data_files: file names to load when event_type is "cells".
 976        :param others_data_files: file names to load when event_type is "others".
 977        :param atlas_data_files: file names that may contain atlas-classified common cells.
 978        :param drop_common_events: whether to drop events classified as common cells.
 979        :param log: optional logger for progress and warning messages.
 980        :return: an EventArray with the loaded events.
 981        """
 982        if pyreadr is None:
 983            raise ModuleNotFoundError(
 984                "pyreadr not installed. Install pyreadr directly "
 985                "or install csi-images with [rds] option to resolve."
 986            )
 987        # Check if the input path is a directory or a file
 988        if os.path.isfile(input_path):
 989            data_files = [os.path.basename(input_path)]
 990            input_path = os.path.dirname(input_path)
 991        elif event_type == "cells":
 992            data_files = cell_data_files
 993        elif event_type == "others":
 994            data_files = others_data_files
 995        else:
 996            raise ValueError("Invalid event type. Must be cells or others.")
 997
 998        # Load the data from the OCULAR files
 999        file_data = {}
1000        for file in data_files:
1001            file_path = os.path.join(input_path, file)
1002            if not os.path.isfile(file_path):
1003                if log is not None:
1004                    log.warning(f"{file} not found in {input_path}")
1005                continue
1006            file_data[file] = pyreadr.read_r(file_path)
1007            # Get the DataFrame associated with None (pyreadr dict quirk)
1008            file_data[file] = file_data[file][None]
1009            if len(file_data[file]) == 0:
1010                # File gets dropped from the dict
1011                file_data.pop(file)
1012                if log is not None:
1013                    log.warning(f"{file} has no cells")
1014                continue
1015
1016            if log is not None:
1017                log.debug(f"{file} has {len(file_data[file])} cells")
1018
1019            # Drop common cells if requested and in this file
1020            if (
1021                file in atlas_data_files
1022                and drop_common_events
1023                and "catalogue_classification" in file_data[file]
1024            ):
1025                common_cell_indices = (
1026                    file_data[file]["catalogue_classification"] == "common_cell"
1027                )
1028                if log is not None:
1029                    log.debug(
1030                        f"Dropping {int(common_cell_indices.sum())} "
1031                        f"common cells from {file}"
1032                    )
1033                file_data[file] = file_data[file][~common_cell_indices]
1034
1035            if len(file_data[file]) == 0:
1036                # File gets dropped from the dict
1037                file_data.pop(file)
1038                if log is not None:
1039                    log.warning(f"{file} has no cells after dropping common cells")
1040                continue
1041
1042            # Extract frame_id and cell_id
1043            # DAPI- events already have frame_id cell_id outside rowname
1044            if event_type == "cells" and "frame_id" not in file_data[file].columns:
1045                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
1046                # get frame_id cell_id from rownames column and split into two columns
1047                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
1048                if len(split_res.columns) != 2 and log is not None:
1049                    log.warning(
1050                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
1051                    )
1052                # then assign it back to the dataframe
1053                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
1054            # reset indexes since they can cause NaN values in concat
1055            file_data[file] = file_data[file].reset_index(drop=True)
1056
1057        # Merge the data from all files
1058        if len(file_data) == 0:
1059            return EventArray()
1060        elif len(file_data) == 1:
1061            data = next(iter(file_data.values()))
1062        else:
1063            data = pd.concat(file_data.values())
1064
1065        if log is not None:
1066            log.debug(f"Gathered a total of {len(data)} events")
1067
1068        # Others is missing the "slide_id". Insert it right before "frame_id" column
1069        if event_type == "others" and "slide_id" not in data.columns:
1070            if os.path.basename(input_path) == "ocular":
1071                slide_id = os.path.basename(os.path.dirname(input_path))
1072            else:
1073                slide_id = "UNKNOWN"
1074            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)
1075
1076        # Sort ascending by cell_id so the original entries are kept when deduplicating
1077        data = data.sort_values(by=["cell_id"], ascending=True)
1078        # Filter out duplicates by x & y
1079        data = data.assign(
1080            unique_id=data["slide_id"]
1081            + "_"
1082            + data["frame_id"].astype(str)
1083            + "_"
1084            + data["cellx"].astype(int).astype(str)
1085            + "_"
1086            + data["celly"].astype(int).astype(str)
1087        )
1088        data = data.drop_duplicates(subset=["unique_id"], keep="first")
1089        # Normal unique_id is with cell_id
1090        data = data.assign(
1091            unique_id=data["slide_id"]
1092            + "_"
1093            + data["frame_id"].astype(str)
1094            + "_"
1095            + data["cell_id"].astype(str)
1096        )
1097        data = data.reset_index(drop=True)
1098        # All columns up to "slide_id" are features; drop the "slide_id"
1099        features = data.loc[:, :"slide_id"].iloc[:, :-1]
1100        data = data.loc[:, "slide_id":]
1101        # Grab the info columns
1102        info = data[["slide_id", "frame_id", "cellx", "celly"]]
1103        info.columns = ["slide_id", "tile", "x", "y"]
1104        info = info.assign(roi=0)  # OCULAR only works on 1 ROI, as far as we know
1105        info = info[["slide_id", "tile", "roi", "x", "y"]]
1106        # Metadata has duplicate columns for later convenience
1107        metadata = data
1108        # Certain columns tend to be problematic with mixed data formats...
1109        for col in ["TRITC", "CY5", "FITC"]:
1110            if col in metadata:
1111                labels = {
1112                    "False": False,
1113                    "True": True,
1114                    "FALSE": False,
1115                    "TRUE": True,
1116                }
1117                metadata[col] = metadata[col].map(labels).astype(bool)
1118        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
1119            if col in metadata:
1120                metadata[col] = metadata[col].fillna(-1).astype(int)
1121        return EventArray(info, metadata, features)
Load the events from OCULAR .rds files, including metadata and features.

Parameters
  • input_path: the OCULAR directory, or a single .rds file, to load from.
  • event_type: "cells" or "others".
  • cell_data_files: file names to load when event_type is "cells".
  • others_data_files: file names to load when event_type is "others".
  • atlas_data_files: file names that may contain atlas-classified common cells.
  • drop_common_events: whether to drop events classified as common cells.
  • log: optional logger for progress and warning messages.
Returns

an EventArray with the loaded events.
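
A loading sketch with a hypothetical path; requires pyreadr (e.g., the [rds] install option):

    from csi_images.csi_events import EventArray

    events = EventArray.load_ocular("/path/to/slide/ocular", event_type="cells")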