csi_images.csi_events

Contains the Event class, which represents a single event in a scan. The Event class optionally holds metadata and features. Lists of events with similar metadata or features can be combined into DataFrames for analysis.

The Event class holds the position of the event in the frame, which can be converted to positions in the scanner or slide coordinate systems. See the csi_images.csi_scans documentation page for more information on the coordinate systems.
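
A minimal sketch of the coordinate conversion, assuming Scan.make_placeholder fills in nominal tile geometry; a real Scan loaded from scan metadata yields accurate positions, and get_slide_position additionally requires a recognized scanner type (Axioscan 7 or BZScanner):

    from csi_images.csi_scans import Scan
    from csi_images.csi_tiles import Tile
    from csi_images.csi_events import Event

    # Placeholder scan; arguments are the slide ID, tile number, and ROI number
    scan = Scan.make_placeholder("EXAMPLE_SLIDE", 0, 0)
    tile = Tile(scan, 0, 0)
    event = Event(scan, tile, x=100, y=200)  # position within the frame, in pixels

    # Frame position -> scanner coordinate frame, in micrometers
    print(event.get_scan_position())
    # Scanner frame -> slide frame via SCAN_TO_SLIDE_TRANSFORM; raises ValueError
    # if the scan's scanner_id is not a recognized scanner type
    # print(event.get_slide_position())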

   1"""
   2Contains the Event class, which represents a single event in a scan.
   3The Event class optionally holds metadata and features. Lists of events with
   4similar metadata or features can be combined into DataFrames for analysis.
   5
   6The Event class holds the position of the event in the frame, which can be
   7converted to positions in the scanner or slide coordinate systems. See the
   8csi_images.csi_scans documentation page for more information on the coordinate systems.
   9"""
  10
  11import os
  12import math
  13import warnings
  14from typing import Self
  15
  16import numpy as np
  17import pandas as pd
  18
  19from .csi_scans import Scan
  20from .csi_tiles import Tile
  21from .csi_frames import Frame
  22from .csi_images import extract_mask_info
  23
  24# Optional dependencies; will raise errors in particular functions if not installed
  25try:
  26    import pyreadr
  27except ImportError:
  28    pyreadr = None
  29
  30
  31class Event:
  32    """
  33    A class that represents a single event in a scan, making it easy to evaluate
  34    singular events. Required metadata is exposed as attributes, and optional
  35    metadata and features are stored as DataFrames.
  36    """
  37
  38    SCAN_TO_SLIDE_TRANSFORM = {
  39        # Axioscan zero is in the top-right corner instead of top-left
  40        Scan.Type.AXIOSCAN7: np.array(
  41            [
  42                [1, 0, 75000],
  43                [0, 1, 0],
  44                [0, 0, 1],
  45            ]
  46        ),
  47        # BZScanner coordinates are a special kind of messed up:
  48        # - The slide is upside-down.
  49        # - The slide is oriented vertically, with the barcode at the bottom.
  50        # - Tiles are numbered from the top-right
  51        Scan.Type.BZSCANNER: np.array(
  52            [
  53                [0, -1, 75000],
  54                [-1, 0, 25000],
  55                [0, 0, 1],
  56            ]
  57        ),
  58    }
  59    """
  60    Homogeneous transformation matrices for converting between scanner and slide
  61    coordinates. The matrices are 3x3, with the final column representing the
  62    translation in micrometers (um). For more information, see 
  63    [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations).
  64    
  65    Transformations are nominal, and accuracy is not guaranteed; this is due to 
  66    imperfections in slides and alignment in the scanners. Units are in micrometers.
  67    """
  68
  69    def __init__(
  70        self,
  71        scan: Scan,
  72        tile: Tile,
  73        x: int,
  74        y: int,
  75        size: int = 12,  # End-to-end size in pixels
  76        metadata: pd.Series = None,
  77        features: pd.Series = None,
  78    ):
  79        self.scan = scan
  80        self.tile = tile
  81        self.x = x
  82        self.y = y
  83        self.size = size
  84        self.metadata = metadata
  85        self.features = features
  86
  87    def __repr__(self) -> str:
  88        return f"{self.scan.slide_id}-{self.tile.n}-{self.x}-{self.y}"
  89
  90    def __eq__(self, other) -> bool:
  91        return self.__repr__() == other.__repr__()
  92
  93    def __lt__(self, other):
  94        return self.__repr__() < other.__repr__()
  95
  96    def get_scan_position(self) -> tuple[float, float]:
  97        """
  98        Get the position of the event in the scanner's coordinate frame.
  99        :return: the scan position of the event in micrometers (um).
 100        """
 101        # Get overall pixel position
 102        pixel_x = self.x + (self.scan.tile_width_px * self.tile.x)
 103        pixel_y = self.y + (self.scan.tile_height_px * self.tile.y)
 104        # Convert to micrometers
 105        x_um = pixel_x * self.scan.pixel_size_um
 106        y_um = pixel_y * self.scan.pixel_size_um
 107        # Add the scan's origin in the scanner frame
 108        x_um += self.scan.roi[self.tile.n_roi].origin_x_um
 109        y_um += self.scan.roi[self.tile.n_roi].origin_y_um
 110        return x_um, y_um
 111
 112    def get_slide_position(self) -> tuple[float, float]:
 113        """
 114        Get the slide position of the event in micrometers (um).
 115        :return: the slide position of the event.
 116        """
 117        # Turn scan_position into a 3x1 vector
 118        scan_position = self.get_scan_position()
 119        scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]])
 120
 121        # Multiply by the appropriate homogeneous matrix
 122        if self.scan.scanner_id.startswith(self.scan.Type.AXIOSCAN7.value):
 123            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.AXIOSCAN7]
 124        elif self.scan.scanner_id.startswith(self.scan.Type.BZSCANNER.value):
 125            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.BZSCANNER]
 126        else:
 127            raise ValueError(f"Scanner type {self.scan.scanner_id} not supported.")
 128        slide_position = np.matmul(transform, scan_position)
 129        return float(slide_position[0][0]), float(slide_position[1][0])
 130
 131    def crop_images(
 132        self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True
 133    ) -> list[np.ndarray]:
 134        """
 135        Crop the event from already-loaded frame images. Does not read from file,
 136        so it is very quick when cropping multiple events from the same tile.
 137        Use this if you're interested in many events.
 138        :param images: the frame images.
 139        :param crop_size: the square size of the image crop to get for this event.
 140        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
 141        :return: crop_size x crop_size crops of the event in the provided frames. If
 142        the event is too close to the edge, the crop is zero-padded (black) and the
 143        event will not be centered.
 144        """
 145        # Convert a crop size in micrometers to pixels
 146        if not in_pixels:
 147            crop_size = round(crop_size / self.scan.pixel_size_um)
 148        # Find the crop bounds
 149        bounds = [
 150            self.x - crop_size // 2,
 151            self.y - crop_size // 2,
 152            self.x + math.ceil(crop_size / 2),
 153            self.y + math.ceil(crop_size / 2),
 154        ]
 155        # Determine how much the bounds violate the image size
 156        displacements = [
 157            max(0, -bounds[0]),
 158            max(0, -bounds[1]),
 159            max(0, bounds[2] - images[0].shape[1]),
 160            max(0, bounds[3] - images[0].shape[0]),
 161        ]
 162        # Cap off the bounds
 163        bounds = [
 164            max(0, bounds[0]),
 165            max(0, bounds[1]),
 166            min(images[0].shape[1], bounds[2]),
 167            min(images[0].shape[0], bounds[3]),
 168        ]
 169
 170        # Crop the images
 171        cropped_images = []
 172        for image in images:
 173            # Create a blank image of the right size
 174            cropped_image = np.zeros((crop_size, crop_size), dtype=image.dtype)
 175
 176            # Insert the cropped image into the blank image, leaving a black buffer
 177            # around the edges if the crop would go beyond the original image bounds
 178            cropped_image[
 179                displacements[1] : crop_size - displacements[3],
 180                displacements[0] : crop_size - displacements[2],
 181            ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]]
 182            cropped_images.append(cropped_image)
 183        return cropped_images
 184
 185    def extract_images(
 186        self, crop_size: int = 100, in_pixels: bool = True
 187    ) -> list[np.ndarray]:
 188        """
 189        Extract the images from the scan and tile, reading from the file. Called
 190        "extract" because it must read and extract the images from file, which is slow.
 191        Use this if you're interested in only a few events, as it is inefficient when
 192        reading multiple events from the same tile.
 193        :param crop_size: the square size of the image crop to get for this event.
 194        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
 195        :return: a list of cropped images from the scan in the order of the channels.
 196        """
 197        frames = Frame.get_frames(self.tile)
 198        images = [frame.get_image() for frame in frames]
 199        return self.crop_images(images, crop_size, in_pixels)
 200
 201    @classmethod
 202    def extract_images_for_list(
 203        cls,
 204        events: list[Self],
 205        crop_size: int | list[int] = None,
 206        in_pixels: bool = True,
 207    ) -> list[list[np.ndarray]]:
 208        """
 209        Get the images for a list of events, ensuring that there is no wasteful reading
 210        of the same tile multiple times. This function is more efficient than calling
 211        extract_images for each event.
 212        TODO: test this function
 213        :param events: the events to extract images for.
 214        :param crop_size: the square size of the image crop to get for this event.
 215                          Defaults to four times the size of the event.
 216        :param in_pixels: whether the crop size is in pixels or micrometers.
 217                          Defaults to pixels, and is ignored if crop_size is None.
 218        :return: a list of lists of cropped images for each event.
 219        """
 220        if len(events) == 0:
 221            return []
 222
 223        # Populate a crop size if none provided
 224        if crop_size is None:
 225            crop_size = [4 * event.size for event in events]
 226            in_pixels = True
 227        # Propagate a constant crop size
 228        elif isinstance(crop_size, int):
 229            crop_size = [crop_size] * len(events)
 230
 232        # Sort event indices by tile; sorted() leaves the original list unmodified
 232        order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
 233
 234        # Allocate the list to size
 235        images = [None] * len(events)
 236        last_tile = None
 237        frame_images = None  # Holds large numpy arrays, so expensive to compare
 238        # Iterate through in sorted order
 239        for i in order:
 240            if last_tile != events[i].tile:
 241                # Gather the frame images, preserving them for the next event
 242                frames = Frame.get_frames(events[i].tile)
 243                frame_images = [frame.get_image() for frame in frames]
 244
 245                last_tile = events[i].tile
 246            # Use the frame images to crop the event images
 247            # Assigning to images[i] preserves the original event order
 248            images[i] = events[i].crop_images(frame_images, crop_size[i], in_pixels)
 249        return images
 250
 251
 252class EventArray:
 253    """
 254    A class that holds a large number of events' data, making it easy to analyze and
 255    manipulate many events at once. A columnar alternative to a list of Event objects.
 256    """
 257
 258    INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y", "size"]
 259
 260    def __init__(
 261        self,
 262        info: pd.DataFrame = None,
 263        metadata: pd.DataFrame = None,
 264        features: pd.DataFrame = None,
 265    ):
 266        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y", "size"
 267        if info is not None:
 268            if list(info.columns) != self.INFO_COLUMNS:
 269                raise ValueError(
 270                    "EventArray.info must have columns 'slide_id', 'tile', 'roi', 'x', 'y', 'size'"
 271                )
 272            # Copy first to avoid modifying the original
 273            info = info.copy()
 274            # Ensure that the columns are the right types
 275            info["slide_id"] = info["slide_id"].astype(str)
 276            info["tile"] = info["tile"].astype(np.uint16)
 277            info["roi"] = info["roi"].astype(np.uint8)
 278            info["x"] = info["x"].round().astype(np.uint16)
 279            info["y"] = info["y"].round().astype(np.uint16)
 280            info["size"] = info["size"].round().astype(np.uint16)
 281        # All DataFrames must all have the same number of rows
 282        if metadata is not None and (info is None or len(info) != len(metadata)):
 283            raise ValueError(
 284                "If EventArray.metadata is not None, it should match rows with .info"
 285            )
 286        if features is not None and (info is None or len(info) != len(features)):
 287            raise ValueError(
 288                "If EventArray.features is not None, it should match rows with .info"
 289            )
 290        self.info = info
 291        self.metadata = metadata
 292        self.features = features
 293
 294    def __len__(self) -> int:
 295        # Convenience method to get the number of events
 296        if self.info is None:
 297            return 0
 298        else:
 299            return len(self.info)
 300
 301    def __eq__(self, other):
 302        is_equal = True
 303        # Parse all possibilities for info
 304        if isinstance(self.info, pd.DataFrame):
 305            if isinstance(other.info, pd.DataFrame):
 306                is_equal = self.info.equals(other.info)
 307                if not is_equal:
 308                    return False
 309            else:
 310                return False
 311        elif self.info is None:
 312            if other.info is not None:
 313                return False
 314
 315        # Parse all possibilities for metadata
 316        if isinstance(self.metadata, pd.DataFrame):
 317            if isinstance(other.metadata, pd.DataFrame):
 318                is_equal = self.metadata.equals(other.metadata)
 319                if not is_equal:
 320                    return False
 321            else:
 322                return False
 323        elif self.metadata is None:
 324            if other.metadata is not None:
 325                return False
 326
 327        # Parse all possibilities for features
 328        if isinstance(self.features, pd.DataFrame):
 329            if isinstance(other.features, pd.DataFrame):
 330                is_equal = self.features.equals(other.features)
 331                if not is_equal:
 332                    return False
 333            else:
 334                return False
 335        elif self.features is None:
 336            if other.features is not None:
 337                return False
 338
 339        return is_equal
 340
 341    def get_sort_order(self, by: str | list[str], ascending: bool | list[bool] = True):
 342        """
 343        Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.
 344        :param by: name of the column(s) to sort by.
 345        :param ascending: whether to sort in ascending order; can be a list to match by
 346        :return: the order of the indices to sort by.
 347        """
 348        columns = self.get(by)
 349        return columns.sort_values(by=by, ascending=ascending).index
 350
 351    def sort(self, by: str | list[str], ascending: bool | list[bool] = True) -> Self:
 352        """
 353        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
 354        :param by: name of the column(s) to sort by.
 355        :param ascending: whether to sort in ascending order; can be a list to match by
 356        :return: a new, sorted EventArray.
 357        """
 358        order = self.get_sort_order(by, ascending)
 359        info = self.info.loc[order].reset_index(drop=True)
 360        if self.metadata is not None:
 361            metadata = self.metadata.loc[order].reset_index(drop=True)
 362        else:
 363            metadata = None
 364        if self.features is not None:
 365            features = self.features.loc[order].reset_index(drop=True)
 366        else:
 367            features = None
 368        return EventArray(info, metadata, features)
 369
 370    def get(self, column_names: int | str | list[int] | list[str]) -> pd.DataFrame:
 371        """
 372        Get a DataFrame with the specified columns from the EventArray, by value.
 373        :param column_names: the names of the columns to get.
 374        :return: a DataFrame with the specified columns.
 375        """
 376        if isinstance(column_names, int) or isinstance(column_names, str):
 377            column_names = [column_names]
 378        columns = []
 379        for column_name in column_names:
 380            if column_name in self.info.columns:
 381                columns.append(self.info[column_name])
 382            elif self.metadata is not None and column_name in self.metadata.columns:
 383                columns.append(self.metadata[column_name])
 384            elif self.features is not None and column_name in self.features.columns:
 385                columns.append(self.features[column_name])
 386            else:
 387                raise ValueError(f"Column {column_name} not found in EventArray")
 388        return pd.concat(columns, axis=1)
 389
 390    def rows(self, rows) -> Self:
 391        """
 392        Get a subset of the EventArray rows based on a boolean or integer index, by value.
 393        :param rows: the indices to get as a 1D boolean/integer list/array/series
 394        :return: a new EventArray with the subset of events.
 395        """
 396        info = self.info.loc[rows].reset_index(drop=True)
 397        if self.metadata is not None:
 398            metadata = self.metadata.loc[rows].reset_index(drop=True)
 399        else:
 400            metadata = None
 401        if self.features is not None:
 402            features = self.features.loc[rows].reset_index(drop=True)
 403        else:
 404            features = None
 405        return EventArray(info, metadata, features)
 406
 407    def copy(self) -> Self:
 408        """
 409        Create a deep copy of the EventArray.
 410        :return: a deep copy of the EventArray.
 411        """
 412        return EventArray(
 413            info=self.info.copy(),
 414            metadata=None if self.metadata is None else self.metadata.copy(),
 415            features=None if self.features is None else self.features.copy(),
 416        )
 417
 418    def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None:
 419        """
 420        Add metadata to the EventArray. Removes the need to check if metadata is None.
 421        Overwrites any existing metadata with the same column names as the new metadata.
 422        :param new_metadata: the metadata to add.
 423        """
 424        if len(self) != len(new_metadata):
 425            raise ValueError("New metadata must match length of existing info")
 426
 427        if self.metadata is None:
 428            self.metadata = new_metadata
 429        else:
 430            if isinstance(new_metadata, pd.Series):
 431                self.metadata[new_metadata.name] = new_metadata
 432            else:
 433                # It's a DataFrame
 434                self.metadata[new_metadata.columns] = new_metadata
 435
 436    def add_features(self, new_features: pd.Series | pd.DataFrame) -> None:
 437        """
 438        Add features to the EventArray. Removes the need to check if features is None.
 439        Overwrites any existing features with the same column names as the new features.
 440        :param new_features: the features to add.
 441        """
 442        if len(self) != len(new_features):
 443            raise ValueError("New features must match length of existing info")
 444
 445        if self.features is None:
 446            self.features = new_features
 447        else:
 448            if isinstance(new_features, pd.Series):
 449                self.features[new_features.name] = new_features
 450            else:
 451                # It's a DataFrame
 452                self.features[new_features.columns] = new_features
 453
 454    @classmethod
 455    def merge(cls, events: list[Self]) -> Self:
 456        """
 457        Combine EventArrays in a list into a single EventArray.
 458        :param events: the list of EventArrays to merge.
 459        """
 460        all_info = []
 461        all_metadata = []
 462        all_features = []
 463        for event_array in events:
 464            # Skip empty EventArrays
 465            if event_array.info is not None:
 466                all_info.append(event_array.info)
 467            if event_array.metadata is not None:
 468                all_metadata.append(event_array.metadata)
 469            if event_array.features is not None:
 470                all_features.append(event_array.features)
 471        if len(all_info) == 0:
 472            return EventArray()
 473        else:
 474            all_info = pd.concat(all_info, ignore_index=True)
 475        if len(all_metadata) == 0:
 476            all_metadata = None
 477        else:
 478            all_metadata = pd.concat(all_metadata, ignore_index=True)
 479        if len(all_features) == 0:
 480            all_features = None
 481        else:
 482            all_features = pd.concat(all_features, ignore_index=True)
 483
 484        return EventArray(all_info, all_metadata, all_features)
 485
 486    def to_events(
 487        self,
 488        scans: Scan | list[Scan],
 489        ignore_missing_scans=True,
 490        ignore_metadata=False,
 491        ignore_features=False,
 492    ) -> list[Event]:
 493        """
 494        Get the events in the EventArray as a list of events.
 495        :param scans: the scans that the events belong to, auto-matched by slide_id.
 496        Pass an empty list if you don't need scan metadata (set ignore_missing_scans=True).
 497        :param ignore_missing_scans: whether to create placeholder scans for events without scans.
 498        :param ignore_metadata: whether to exclude metadata from the events.
 499        :param ignore_features: whether to exclude features from the events.
 500        :return: a list of Event objects.
 501        """
 502        if isinstance(scans, Scan):
 503            scans = [scans] * len(self.info)
 504        events = []
 505        for i in range(len(self.info)):
 506            # Determine the associated scan
 507            scan = None
 508            for s in scans:
 509                if s.slide_id == self.info["slide_id"][i]:
 510                    scan = s
 511                    break
 512            if scan is None:
 513                if ignore_missing_scans:
 514                    # Create a placeholder scan if the scan is missing
 515                    scan = Scan.make_placeholder(
 516                        self.info["slide_id"][i],
 517                        self.info["tile"][i],
 518                        self.info["roi"][i],
 519                    )
 520                else:
 521                    raise ValueError(
 522                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
 523                    )
 524            # Prepare the metadata and features
 525            if ignore_metadata or self.metadata is None:
 526                metadata = None
 527            else:
 528                # This Series creation method is less efficient,
 529                # but required for preserving dtypes
 530                metadata = pd.Series(
 531                    {col: self.metadata.loc[i, col] for col in self.metadata.columns},
 532                    dtype=object,
 533                )
 534            if ignore_features or self.features is None:
 535                features = None
 536            else:
 537                features = pd.Series(
 538                    {col: self.features.loc[i, col] for col in self.features.columns},
 539                    dtype=object,
 540                )
 541            # Create the event and append it to the list
 542            events.append(
 543                Event(
 544                    scan,
 545                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
 546                    self.info["x"][i],
 547                    self.info["y"][i],
 548                    size=self.info["size"][i],
 549                    metadata=metadata,
 550                    features=features,
 551                )
 552            )
 553        return events
 554
 555    @classmethod
 556    def from_events(cls, events: list[Event]) -> Self:
 557        """
 558        Create an EventArray from a list of events.
 559        :param events: the list of events to convert.
 560        """
 561        # Return an empty array if we were passed nothing
 562        if events is None or len(events) == 0:
 563            return EventArray()
 564        # Otherwise, grab the info
 565        info = pd.DataFrame(
 566            {
 567                "slide_id": [event.scan.slide_id for event in events],
 568                "tile": [event.tile.n for event in events],
 569                "roi": [event.tile.n_roi for event in events],
 570                "x": [event.x for event in events],
 571                "y": [event.y for event in events],
 572                "size": [event.size for event in events],
 573            }
 574        )
 575        metadata_list = [event.metadata for event in events]
 576        # Iterate through and ensure that all metadata is the same shape
 577        for metadata in metadata_list:
 578            if type(metadata) != type(metadata_list[0]):
 579                raise ValueError("All metadata must be the same type.")
 580            if metadata is not None and metadata.shape != metadata_list[0].shape:
 581                raise ValueError("All metadata must be the same shape.")
 582        if metadata_list[0] is None:
 583            metadata = None
 584        else:
 585            metadata = pd.DataFrame(metadata_list)
 586        features_list = [event.features for event in events]
 587        # Iterate through and ensure that all features are the same shape
 588        for features in features_list:
 589            if type(features) != type(features_list[0]):
 590                raise ValueError("All features must be the same type.")
 591            if features is not None and features.shape != features_list[0].shape:
 592                raise ValueError("All features must be the same shape.")
 593        if features_list[0] is None:
 594            features = None
 595        else:
 596            features = pd.DataFrame(features_list)
 597        return EventArray(info=info, metadata=metadata, features=features)
 598
 599    def to_dataframe(self) -> pd.DataFrame:
 600        """
 601        Convert all the data in the EventArray to a single DataFrame.
 602        :return: a DataFrame with all the data in the EventArray.
 603        """
 604        # Make a copy of the info DataFrame and prepend "info_" to the column names
 605        output = self.info.copy()
 606        output.columns = [f"info_{col}" for col in output.columns]
 607        # Combine with the metadata and prepend "metadata_" to the column names
 608        if self.metadata is not None:
 609            metadata = self.metadata.copy()
 610            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
 611            output = pd.concat([output, metadata], axis=1)
 612        # Combine with the features and prepend "features_" to the column names
 613        if self.features is not None:
 614            features = self.features.copy()
 615            features.columns = [f"features_{col}" for col in features.columns]
 616            output = pd.concat([output, features], axis=1)
 617        return output
 618
 619    @classmethod
 620    def from_dataframe(cls, df) -> Self:
 621        """
 622        Create an EventArray from a single DataFrame, as produced by to_dataframe().
 623        :return: an EventArray with all the data from the DataFrame.
 624        """
 625        # Split the columns into info, metadata, and features and strip prefix
 626        info = df[[col for col in df.columns if col.startswith("info_")]].copy()
 627        info.columns = [col.replace("info_", "") for col in info.columns]
 628        if info.size == 0:
 629            info = None
 630        metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy()
 631        metadata.columns = [col.replace("metadata_", "") for col in metadata.columns]
 632        if metadata.size == 0:
 633            metadata = None
 634        features = df[[col for col in df.columns if col.startswith("features_")]].copy()
 635        features.columns = [col.replace("features_", "") for col in features.columns]
 636        if features.size == 0:
 637            features = None
 638        return cls(info=info, metadata=metadata, features=features)
 639
 640    @classmethod
 641    def from_mask(
 642        cls,
 643        mask: np.ndarray,
 644        slide_id: str,
 645        tile_n: int,
 646        n_roi: int = 0,
 647        include_cell_id: bool = True,
 648        images: list[np.ndarray] = None,
 649        image_labels: list[str] = None,
 650        properties: list[str] = None,
 651    ) -> Self:
 652        """
 653        Extract events from a labeled mask image, including metadata and features.
 654        :param mask: the mask to extract events from.
 655        :param slide_id: the slide ID the mask is from.
 656        :param tile_n: the tile number the mask is from.
 657        :param n_roi: the ROI number the mask is from.
 658        :param include_cell_id: whether to include the cell_id, or numerical
 659        mask label, as metadata in the EventArray.
 660        :param images: the intensity images to extract features from.
 661        :param image_labels: the labels for the intensity images.
 662        :param properties: list of properties to extract in addition to the defaults.
 663        :return: EventArray corresponding to the mask labels.
 664        """
 665        # Gather mask_info
 666        if images is not None and image_labels is not None:
 667            if len(images) != len(image_labels):
 668                raise ValueError("Intensity images and labels must match lengths.")
 669
 670        mask_info = extract_mask_info(mask, images, image_labels, properties)
 671
 672        if len(mask_info) == 0:
 673            return EventArray()
 674
 675        # Combine provided info and mask info
 676        info = pd.DataFrame(
 677            {
 678                "slide_id": slide_id,
 679                "tile": tile_n,
 680                "roi": n_roi,
 681                "x": mask_info["x"],
 682                "y": mask_info["y"],
 683                "size": mask_info["size"],
 684            },
 685        )
 686        # Extract a metadata column if desired
 687        if include_cell_id:
 688            metadata = pd.DataFrame({"cell_id": mask_info["id"]})
 689        else:
 690            metadata = None
 691        # If any additional properties were extracted, add them as features
 692        mask_info = mask_info.drop(columns=["id", "x", "y", "size"], errors="ignore")
 693        if len(mask_info.columns) > 0:
 694            features = mask_info
 695        else:
 696            features = None
 697        return EventArray(info, metadata, features)
 698
 699    def save_csv(self, output_path: str) -> bool:
 700        """
 701        Save the events to a CSV file, including metadata and features.
 702        :param output_path: the path of the CSV file to save.
 703        :return: True if the file was saved successfully.
 704        """
 705        self.to_dataframe().to_csv(output_path, index=False)
 706        return os.path.exists(output_path)
 707
 708    @classmethod
 709    def load_csv(cls, input_path: str) -> Self:
 710        """
 711        Load the events from a CSV file, including metadata and features.
 712        :param input_path: the path of the CSV file to load.
 713        :return: an EventArray with the loaded events.
 714        """
 715        # Load the CSV file
 716        df = pd.read_csv(input_path)
 717        return cls.from_dataframe(df)
 718
 719    def save_hdf5(self, output_path: str) -> bool:
 720        """
 721        Save the events to an HDF5 file, including metadata and features.
 722        Uses the pandas-provided HDF5 functions for ease and external compatibility,
 723        though these files are slightly harder to view in HDFView or similar.
 724        :param output_path: the path of the HDF5 file to save.
 725        :return: True if the file was saved successfully.
 726        """
 727        # Open the output_path as an HDF5 file
 728        with pd.HDFStore(output_path) as store:
 729            # Store the dataframes in the HDF5 file
 730            if self.info is not None:
 731                store.put("info", self.info, index=False)
 732            if self.metadata is not None:
 733                store.put("metadata", self.metadata, index=False)
 734            if self.features is not None:
 735                store.put("features", self.features, index=False)
 736        return os.path.exists(output_path)
 737
 738    @classmethod
 739    def load_hdf5(cls, input_path: str) -> Self:
 740        """
 741        Load the events from an HDF5 file, including metadata and features.
 742        :param input_path: the path of the HDF5 file to load.
 743        :return: an EventArray with the loaded events.
 744        """
 745        # Open the input_path as an HDF5 file
 746        with pd.HDFStore(input_path) as store:
 747            # Load the dataframes from the HDF5 file
 748            info = store.get("info") if "info" in store else None
 749            metadata = store.get("metadata") if "metadata" in store else None
 750            features = store.get("features") if "features" in store else None
 751        return cls(info=info, metadata=metadata, features=features)
 752
 753    def save_ocular(self, output_path: str, event_type: str = "cells"):
 754        """
 755        Save the events to OCULAR files. Relies on the data originating
 756        from an OCULAR file (same columns; duplicate metadata/info).
 757        :param output_path: the directory to save the OCULAR files into.
 758        :param event_type: "cells" or "others", determining the output file names.
 759        :return:
 760        """
 761        if pyreadr is None:
 762            raise ModuleNotFoundError(
 763                "pyreadr not installed. Install pyreadr directly "
 764                "or install csi-images with [rds] option to resolve."
 765            )
 766        if event_type == "cells":
 767            file_stub = "rc-final"
 768        elif event_type == "others":
 769            file_stub = "others-final"
 770        else:
 771            raise ValueError("Invalid event type. Must be cells or others.")
 772
 773        # Ensure good metadata
 774        metadata = pd.DataFrame(
 775            {
 776                "slide_id": self.info["slide_id"],
 777                "frame_id": self.info["tile"],
 778                "cellx": self.info["x"],
 779                "celly": self.info["y"],
 780                "cell_id": (
 781                    self.metadata["cell_id"]
 782                    if self.metadata is not None and "cell_id" in self.metadata.columns
 783                    else range(len(self.info))
 784                ),
 785            }
 786        )
 787        if self.metadata is not None:
 788            metadata[self.metadata.columns] = self.metadata.copy()
 789
 790        # Check for the "ocular_interesting" column
 791        if event_type == "cells":
 792            if "ocular_interesting" in metadata.columns:
 793                interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool)
 794            elif "hcpc" in metadata.columns:
 795                # Interesting cells don't get an hcpc designation, leaving them as -1
 796                interesting_rows = (
 797                    metadata["hcpc"].to_numpy() == -1
 798                )  # interesting cells
 799            else:
 800                interesting_rows = []
 801            if sum(interesting_rows) > 0:
 802                # Split the metadata into interesting and regular
 803                interesting_events = self.rows(interesting_rows)
 804                interesting_df = pd.concat(
 805                    [interesting_events.features, interesting_events.metadata], axis=1
 806                )
 807                data_events = self.rows(~interesting_rows)
 808                data_df = pd.concat(
 809                    [data_events.features, data_events.metadata], axis=1
 810                )
 811                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")
 812
 813                # Drop particular columns for "interesting"
 814                interesting_df = interesting_df.drop(
 815                    [
 816                        "clust",
 817                        "hcpc",
 818                        "frame_id",
 819                        "cell_id",
 820                        "unique_id",
 821                        "ocular_interesting",
 822                    ],
 823                    axis=1,
 824                    errors="ignore",
 825                )
 826                # Save both .csv and .rds
 827                interesting_stub = os.path.join(output_path, "ocular_interesting")
 828                interesting_df.to_csv(f"{interesting_stub}.csv")
 829                # Suppress pandas FutureWarning
 830                with warnings.catch_warnings():
 831                    warnings.simplefilter(action="ignore", category=FutureWarning)
 832                    pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df)
 833            else:
 834                data_df = pd.concat([self.features, metadata], axis=1)
 835        else:
 836            # Get all data and reset_index (will copy it)
 837            data_df = pd.concat([self.features, metadata], axis=1)
 838
 839        # Split based on cluster number to conform to *-final[1-4].rds
 840        n_clusters = max(data_df["clust"]) + 1
 841        split_idx = [round(i * n_clusters / 4) for i in range(5)]
 842        for i in range(4):
 843            subset = (split_idx[i] <= data_df["clust"]) & (
 844                data_df["clust"] < split_idx[i + 1]
 845            )
 846            data_df.loc[subset, "hcpc"] = i + 1
 847            subset = data_df[subset].reset_index(drop=True)
 848            # Suppress pandas FutureWarning
 849            with warnings.catch_warnings():
 850                warnings.simplefilter(action="ignore", category=FutureWarning)
 851                pyreadr.write_rds(
 852                    os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
 853                )
 854
 855        # Create new example cell strings
 856        data_df["example_cell_id"] = (
 857            data_df["slide_id"]
 858            + " "
 859            + data_df["frame_id"].astype(str)
 860            + " "
 861            + data_df["cell_id"].astype(str)
 862            + " "
 863            + data_df["cellx"].astype(int).astype(str)
 864            + " "
 865            + data_df["celly"].astype(int).astype(str)
 866        )
 867        # Find averagable data columns
 868        if "cellcluster_id" in data_df.columns:
 869            end_idx = data_df.columns.get_loc("cellcluster_id")
 870        else:
 871            end_idx = data_df.columns.get_loc("slide_id")
 872        avg_cols = data_df.columns[:end_idx].tolist()
 873        # Group by cluster and average
 874        data_df = data_df.groupby("clust").agg(
 875            **{col: (col, "mean") for col in avg_cols},
 876            count=("clust", "size"),  # count rows in each cluster
 877            example_cells=("example_cell_id", lambda x: ",".join(x)),
 878            hcpc=("hcpc", lambda x: x.iloc[0]),
 879        )
 880        data_df = data_df.reset_index()  # Do NOT drop, index is "clust"
 881        # Create new columns
 882        metadata = pd.DataFrame(
 883            {
 884                "count": data_df["count"],
 885                "example_cells": data_df["example_cells"],
 886                "clust": data_df["clust"].astype(int),
 887                "hcpc": data_df["hcpc"].astype(int),
 888                "id": data_df["clust"].astype(int).astype(str),
 889                "cccluster": "0",  # Dummy value
 890                "ccdistance": 0.0,  # Dummy value
 891                "rownum": list(range(len(data_df))),
 892                "framegroup": 0,  # Dummy value
 893            }
 894        )
 895        # Need to pad the features to 761 columns, as per OCULAR report needs
 896        additional_columns = range(len(avg_cols), 761)
 897        if len(additional_columns) > 0:
 898            padding = pd.DataFrame(
 899                np.zeros((len(data_df), len(additional_columns))),
 900                columns=[f"pad{i}" for i in additional_columns],
 901            )
 902            data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1)
 903        else:
 904            data_df = pd.concat([data_df[avg_cols], metadata], axis=1)
 905
 906        # Save the cluster data
 907        data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"))
 908        # Suppress pandas FutureWarning
 909        with warnings.catch_warnings():
 910            warnings.simplefilter(action="ignore", category=FutureWarning)
 911            pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)
 912
 913    @classmethod
 914    def load_ocular(
 915        cls,
 916        input_path: str,
 917        event_type="cells",
 918        cell_data_files=(
 919            "rc-final1.rds",
 920            "rc-final2.rds",
 921            "rc-final3.rds",
 922            "rc-final4.rds",
 923            "ocular_interesting.rds",
 924        ),
 925        others_data_files=(
 926            "others-final1.rds",
 927            "others-final2.rds",
 928            "others-final3.rds",
 929            "others-final4.rds",
 930        ),
 931        atlas_data_files=(
 932            "ocular_interesting.rds",
 933            "ocular_not_interesting.rds",
 934        ),
 935        drop_common_events=True,
 936        log=None,
 937    ) -> Self:
 938        """
 939        Load events from OCULAR .rds files, including metadata and features.
 940        :param input_path: the OCULAR directory, or a single .rds file, to load.
 941        :param event_type: "cells" or "others", determining which files to load.
 942        :param cell_data_files: file names to load for event_type="cells".
 943        :param others_data_files: file names to load for event_type="others".
 944        :param atlas_data_files: files that may contain common (atlas) events.
 945        :param drop_common_events: whether to drop events classified as common cells.
 946        :param log: optional logger for progress and warning messages.
 947        :return: an EventArray with the loaded events.
 948        """
 949        if pyreadr is None:
 950            raise ModuleNotFoundError(
 951                "pyreadr not installed. Install pyreadr directly "
 952                "or install csi-images with [rds] option to resolve."
 953            )
 954        # Check if the input path is a directory or a file
 955        if os.path.isfile(input_path):
 956            data_files = [os.path.basename(input_path)]
 957            input_path = os.path.dirname(input_path)
 958        elif event_type == "cells":
 959            data_files = cell_data_files
 960        elif event_type == "others":
 961            data_files = others_data_files
 962        else:
 963            raise ValueError("Invalid event type.")
 964
 965        # Load the data from the OCULAR files
 966        file_data = {}
 967        for file in data_files:
 968            file_path = os.path.join(input_path, file)
 969            if not os.path.isfile(file_path):
 970                if log is not None:
 971                    log.warning(f"{file} not found in {input_path}")
 972                continue
 973            file_data[file] = pyreadr.read_r(file_path)
 974            # Get the DataFrame associated with None (pyreadr dict quirk)
 975            file_data[file] = file_data[file][None]
 976            if len(file_data[file]) == 0:
 977                # File gets dropped from the dict
 978                file_data.pop(file)
 979                if log is not None:
 980                    log.warning(f"{file} has no cells")
 981                continue
 982
 983            if log is not None:
 984                log.debug(f"{file} has {len(file_data[file])} cells")
 985
 986            # Drop common cells if requested and in this file
 987            if file in atlas_data_files and drop_common_events:
 988                common_cell_indices = (
 989                    file_data[file]["catalogue_classification"] == "common_cell"
 990                )
 991                if log is not None:
 992                    log.debug(
 993                        f"Dropping {int(pd.Series.sum(common_cell_indices))} "
 994                        f"common cells from {file}"
 995                    )
 996                file_data[file] = file_data[file][~common_cell_indices]
 997
 998            if len(file_data[file]) == 0:
 999                # File gets dropped from the dict
1000                file_data.pop(file)
1001                if log is not None:
1002                    log.warning(f"{file} has no cells after dropping common cells")
1003                continue
1004
1005            # Extract frame_id and cell_id
1006            # DAPI- events already have frame_id cell_id outside rowname
1007            if event_type == "cells" and "frame_id" not in file_data[file].columns:
1008                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
1009                # get frame_id cell_id from rownames column and split into two columns
1010                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
1011                if len(split_res.columns) != 2 and log is not None:
1012                    log.warning(
1013                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
1014                    )
1015                # then assign it back to the dataframe
1016                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
1017            # reset indexes since they can cause NaN values in concat
1018            file_data[file] = file_data[file].reset_index(drop=True)
1019
1020        # Merge the data from all files
1021        if len(file_data) == 0:
1022            return EventArray()
1023        elif len(file_data) == 1:
1024            data = [file_data[file] for file in file_data.keys()][0]
1025        else:
1026            data = pd.concat(file_data.values())
1027
1028        if log is not None:
1029            log.debug(f"Gathered a total of {len(data)} events")
1030
1031        # Others is missing the "slide_id". Insert it right before "frame_id" column
1032        if event_type == "others" and "slide_id" not in data.columns:
1033            if os.path.basename(input_path) == "ocular":
1034                slide_id = os.path.basename(os.path.dirname(input_path))
1035            else:
1036                slide_id = "UNKNOWN"
1037            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)
1038
1039        # Sort according to ascending cell_id to keep the original, which is in manual_df
1040        data = data.sort_values(by=["cell_id"], ascending=True)
1041        # Filter out duplicates by x & y
1042        data = data.assign(
1043            unique_id=data["slide_id"]
1044            + "_"
1045            + data["frame_id"].astype(str)
1046            + "_"
1047            + data["cellx"].astype(int).astype(str)
1048            + "_"
1049            + data["celly"].astype(int).astype(str)
1050        )
1051        data = data.drop_duplicates(subset=["unique_id"], keep="first")
1052        # Normal unique_id is with cell_id
1053        data = data.assign(
1054            unique_id=data["slide_id"]
1055            + "_"
1056            + data["frame_id"].astype(str)
1057            + "_"
1058            + data["cell_id"].astype(str)
1059        )
1060        data = data.reset_index(drop=True)
1061        # All columns up to "slide_id" are features; drop the "slide_id"
1062        features = data.loc[:, :"slide_id"].iloc[:, :-1]
1063        data = data.loc[:, "slide_id":]
1064        # Grab the info columns
1065        info = data[["slide_id", "frame_id", "cellx", "celly"]]
1066        info.columns = ["slide_id", "tile", "x", "y"]
1067        info = info.assign(
1068            roi=0,  # OCULAR only works on 1 ROI, as far as known
1069            size=25,  # Static, for later montaging
1070        )
1071        info = info[["slide_id", "tile", "roi", "x", "y", "size"]]
1072        # Metadata has duplicate columns for later convenience
1073        metadata = data
1074        # Certain columns tend to be problematic with mixed data formats...
1075        for col in ["TRITC", "CY5", "FITC"]:
1076            if col in metadata:
1077                labels = {
1078                    "False": False,
1079                    "True": True,
1080                    "FALSE": False,
1081                    "TRUE": True,
1082                }
1083                metadata[col] = metadata[col].map(labels).astype(bool)
1084        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
1085            if col in metadata:
1086                metadata[col] = metadata[col].fillna(-1).astype(int)
1087        return EventArray(info, metadata, features)
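
Cropping events out of images that are already in memory is cheap and needs no file access. A short sketch, reusing the placeholder scan, tile, and event from the example at the top of this page; the synthetic frame stands in for images that would normally come from Frame.get_image() or Event.extract_images():

    import numpy as np

    frame = np.zeros((1000, 1000), dtype=np.uint16)  # synthetic single-channel frame
    crops = event.crop_images([frame], crop_size=50)
    print(crops[0].shape)  # (50, 50); edge events are zero-padded, not shrunk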
 38
 39    SCAN_TO_SLIDE_TRANSFORM = {
 40        # Axioscan zero is in the top-right corner instead of top-left
 41        Scan.Type.AXIOSCAN7: np.array(
 42            [
 43                [1, 0, 75000],
 44                [0, 1, 0],
 45                [0, 0, 1],
 46            ]
 47        ),
 48        # BZScanner coordinates are a special kind of messed up:
 49        # - The slide is upside-down.
 50        # - The slide is oriented vertically, with the barcode at the bottom.
 51        # - Tiles are numbered from the top-right
 52        Scan.Type.BZSCANNER: np.array(
 53            [
 54                [0, -1, 75000],
 55                [-1, 0, 25000],
 56                [0, 0, 1],
 57            ]
 58        ),
 59    }
 60    """
 61    Homogeneous transformation matrices for converting between scanner and slide
 62    coordinates. The matrices are 3x3, with the final column representing the
 63    translation in micrometers (um). For more information, see 
 64    [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations).
 65    
 66    Transformations are nominal, and accuracy is not guaranteed; this is due to 
 67    imperfections in slides and alignment in the scanners. Units are in micrometers.
 68    """
 69
 70    def __init__(
 71        self,
 72        scan: Scan,
 73        tile: Tile,
 74        x: int,
 75        y: int,
 76        size: int = 12,  # End-to-end size in pixels
 77        metadata: pd.Series = None,
 78        features: pd.Series = None,
 79    ):
 80        self.scan = scan
 81        self.tile = tile
 82        self.x = x
 83        self.y = y
 84        self.size = size
 85        self.metadata = metadata
 86        self.features = features
 87
 88    def __repr__(self) -> str:
 89        return f"{self.scan.slide_id}-{self.tile.n}-{self.x}-{self.y}"
 90
 91    def __eq__(self, other) -> bool:
 92        return self.__repr__() == other.__repr__()
 93
 94    def __lt__(self, other):
 95        return self.__repr__() < other.__repr__()
 96
 97    def get_scan_position(self) -> tuple[float, float]:
 98        """
 99        Get the position of the event in the scanner's coordinate frame.
100        :return: the scan position of the event in micrometers (um).
101        """
102        # Get overall pixel position
103        pixel_x = self.x + (self.scan.tile_width_px * self.tile.x)
104        pixel_y = self.y + (self.scan.tile_height_px * self.tile.y)
105        # Convert to micrometers
106        x_um = pixel_x * self.scan.pixel_size_um
107        y_um = pixel_y * self.scan.pixel_size_um
108        # Add the scan's origin in the scanner frame
109        x_um += self.scan.roi[self.tile.n_roi].origin_x_um
110        y_um += self.scan.roi[self.tile.n_roi].origin_y_um
111        return x_um, y_um
112
113    def get_slide_position(self) -> tuple[float, float]:
114        """
115        Get the slide position of the event in micrometers (um).
116        :return: the slide position of the event.
117        """
118        # Turn scan_position into a 3x1 vector
119        scan_position = self.get_scan_position()
120        scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]])
121
122        # Multiply by the appropriate homogeneous matrix
123        if self.scan.scanner_id.startswith(self.scan.Type.AXIOSCAN7.value):
124            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.AXIOSCAN7]
125        elif self.scan.scanner_id.startswith(self.scan.Type.BZSCANNER.value):
126            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.BZSCANNER]
127        else:
128            raise ValueError(f"Scanner type {self.scan.scanner_id} not supported.")
129        slide_position = np.matmul(transform, scan_position)
130        return float(slide_position[0][0]), float(slide_position[1][0])
131
132    def crop_images(
133        self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True
134    ) -> list[np.ndarray]:
135        """
136        Get the event crops from the frame images. Called "get" because it does not
137        need to extract anything; it is very quick for extracting multiple events from
138        the same tile.
139        Use this if you're interested in many events.
140        :param images: the frame images.
141        :param crop_size: the square size of the image crop to get for this event.
142        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
143        :return: image_size x image_size crops of the event in the provided frames. If
144        the event is too close to the edge, the crop will be smaller and not centered.
145        """
146        # Convert a crop size in micrometers to pixels
147        if not in_pixels:
148            crop_size = round(crop_size / self.scan.pixel_size_um)
149        # Find the crop bounds
150        bounds = [
151            self.x - crop_size // 2,
152            self.y - crop_size // 2,
153            self.x + math.ceil(crop_size / 2),
154            self.y + math.ceil(crop_size / 2),
155        ]
156        # Determine how much the bounds violate the image size
157        displacements = [
158            max(0, -bounds[0]),
159            max(0, -bounds[1]),
160            max(0, bounds[2] - images[0].shape[1]),
161            max(0, bounds[3] - images[0].shape[0]),
162        ]
163        # Cap off the bounds
164        bounds = [
165            max(0, bounds[0]),
166            max(0, bounds[1]),
167            min(images[0].shape[1], bounds[2]),
168            min(images[0].shape[0], bounds[3]),
169        ]
170
171        # Crop the images
172        cropped_images = []
173        for image in images:
174            # Create a blank image of the right size
175            cropped_image = np.zeros((crop_size, crop_size), dtype=image.dtype)
176
177            # Insert the cropped image into the blank image, leaving a black buffer
178            # around the edges if the crop would go beyond the original image bounds
179            cropped_image[
180                displacements[1] : crop_size - displacements[3],
181                displacements[0] : crop_size - displacements[2],
182            ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]]
183            cropped_images.append(cropped_image)
184        return cropped_images
185
186    def extract_images(
187        self, crop_size: int = 100, in_pixels: bool = True
188    ) -> list[np.ndarray]:
189        """
190        Extract the images from the scan and tile, reading from the file. Called
191        "extract" because it must read and extract the images from file, which is slow.
192        Use this if you're interested in only a few events, as it is inefficient when
193        reading multiple events from the same tile.
194        :param crop_size: the square size of the image crop to get for this event.
195        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
196        :return: a list of cropped images from the scan in the order of the channels.
197        """
198        frames = Frame.get_frames(self.tile)
199        images = [frame.get_image() for frame in frames]
200        return self.crop_images(images, crop_size, in_pixels)
201
202    @classmethod
203    def extract_images_for_list(
204        cls,
205        events: list[Self],
206        crop_size: int | list[int] = None,
207        in_pixels: bool = True,
208    ) -> list[list[np.ndarray]]:
209        """
210        Get the images for a list of events, ensuring that there is no wasteful reading
211        of the same tile multiple times. This function is more efficient than calling
212        extract_images for each event.
213        TODO: test this function
214        :param events: the events to extract images for.
215        :param crop_size: the square size of the image crop to get for each event.
216                          Defaults to four times the size of each event.
217        :param in_pixels: whether the crop size is in pixels or micrometers.
218                          Defaults to pixels, and is ignored if crop_size is None.
219        :return: a list of lists of cropped images for each event.
220        """
221        if len(events) == 0:
222            return []
223
224        # Populate a crop size if none provided
225        if crop_size is None:
226            crop_size = [4 * event.size for event in events]
227            in_pixels = True
228        # Propagate a constant crop size
229        elif isinstance(crop_size, int):
230            crop_size = [crop_size] * len(events)
231
232        # Sort the events by tile; use a shallow copy to avoid modifying the original
233        order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
234
235        # Allocate the list to size
236        images = [None] * len(events)
237        last_tile = None
238        frame_images = None  # Holds large numpy arrays, so expensive to compare
239        # Iterate through in sorted order
240        for i in order:
241            if last_tile != events[i].tile:
242                # Gather the frame images, preserving them for the next event
243                frames = Frame.get_frames(events[i].tile)
244                frame_images = [frame.get_image() for frame in frames]
245
246                last_tile = events[i].tile
247            # Use the frame images to crop the event images;
248            # assigning to images[i] preserves the original event order
249            images[i] = events[i].crop_images(frame_images, crop_size[i], in_pixels)
250        return images

A class that represents a single event in a scan, making it easy to evaluate singular events. Required metadata is exposed as attributes, and optional metadata and features are stored as DataFrames.

Event( scan: csi_images.csi_scans.Scan, tile: csi_images.csi_tiles.Tile, x: int, y: int, size: int = 12, metadata: pandas.core.series.Series = None, features: pandas.core.series.Series = None)
70    def __init__(
71        self,
72        scan: Scan,
73        tile: Tile,
74        x: int,
75        y: int,
76        size: int = 12,  # End-to-end size in pixels
77        metadata: pd.Series = None,
78        features: pd.Series = None,
79    ):
80        self.scan = scan
81        self.tile = tile
82        self.x = x
83        self.y = y
84        self.size = size
85        self.metadata = metadata
86        self.features = features
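
A minimal construction sketch (illustrative, not from the library docs); it uses Scan.make_placeholder, which the library itself uses when a real scan is unavailable:

    from csi_images.csi_scans import Scan
    from csi_images.csi_tiles import Tile
    from csi_images.csi_events import Event

    # Placeholder scan for illustration; real code would load a scan from disk
    scan = Scan.make_placeholder("EXAMPLE_SLIDE", 0, 0)
    tile = Tile(scan, 0, 0)  # tile 0 of ROI 0
    event = Event(scan, tile, x=512, y=384, size=16)
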
SCAN_TO_SLIDE_TRANSFORM = {
    <Type.AXIOSCAN7: 'axioscan7'>: array([[    1,     0, 75000],
                                          [    0,     1,     0],
                                          [    0,     0,     1]]),
    <Type.BZSCANNER: 'bzscanner'>: array([[    0,    -1, 75000],
                                          [   -1,     0, 25000],
                                          [    0,     0,     1]]),
}

Homogeneous transformation matrices for converting between scanner and slide coordinates. The matrices are 3x3, with the final column representing the translation in micrometers (um). For more information, see affine transformations.

Transformations are nominal, and accuracy is not guaranteed; this is due to imperfections in slides and alignment in the scanners. Units are in micrometers.
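
As a worked example (mirroring get_slide_position below), a BZScanner scan position in micrometers maps into slide coordinates via the homogeneous matrix:

    import numpy as np

    transform = np.array([[0, -1, 75000],
                          [-1, 0, 25000],
                          [0, 0, 1]])
    scan_xy = np.array([[10000], [5000], [1]])  # (x, y) in um, homogeneous
    slide_xy = transform @ scan_xy
    print(float(slide_xy[0][0]), float(slide_xy[1][0]))  # 70000.0 15000.0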

scan
tile
x
y
size
metadata
features
def get_scan_position(self) -> tuple[float, float]:
 97    def get_scan_position(self) -> tuple[float, float]:
 98        """
 99        Get the position of the event in the scanner's coordinate frame.
100        :return: the scan position of the event in micrometers (um).
101        """
102        # Get overall pixel position
103        pixel_x = self.x + (self.scan.tile_width_px * self.tile.x)
104        pixel_y = self.y + (self.scan.tile_height_px * self.tile.y)
105        # Convert to micrometers
106        x_um = pixel_x * self.scan.pixel_size_um
107        y_um = pixel_y * self.scan.pixel_size_um
108        # Add the scan's origin in the scanner frame
109        x_um += self.scan.roi[self.tile.n_roi].origin_x_um
110        y_um += self.scan.roi[self.tile.n_roi].origin_y_um
111        return x_um, y_um

Get the position of the event in the scanner's coordinate frame.

Returns

the scan position of the event in micrometers (um).

def get_slide_position(self) -> tuple[float, float]:
113    def get_slide_position(self) -> tuple[float, float]:
114        """
115        Get the slide position of the event in micrometers (um).
116        :return: the slide position of the event.
117        """
118        # Turn scan_position into a 3x1 vector
119        scan_position = self.get_scan_position()
120        scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]])
121
122        # Multiply by the appropriate homogeneous matrix
123        if self.scan.scanner_id.startswith(self.scan.Type.AXIOSCAN7.value):
124            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.AXIOSCAN7]
125        elif self.scan.scanner_id.startswith(self.scan.Type.BZSCANNER.value):
126            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.BZSCANNER]
127        else:
128            raise ValueError(f"Scanner type {self.scan.scanner_id} not supported.")
129        slide_position = np.matmul(transform, scan_position)
130        return float(slide_position[0][0]), float(slide_position[1][0])

Get the slide position of the event in micrometers (um).

Returns

the slide position of the event.
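
A short usage sketch, assuming an Event built as in the construction example above; note that get_slide_position() raises ValueError for unrecognized scanner IDs (such as placeholder scans):

    x_um, y_um = event.get_scan_position()         # scanner frame, in um
    slide_x, slide_y = event.get_slide_position()  # slide frame, in um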

def crop_images( self, images: list[numpy.ndarray], crop_size: int = 100, in_pixels: bool = True) -> list[numpy.ndarray]:
132    def crop_images(
133        self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True
134    ) -> list[np.ndarray]:
135        """
136        Get the event crops from already-loaded frame images. Does not read from
137        file, so it is very quick when cropping many events from the same tile.
138        Use this if you're interested in many events.
139        :param images: the frame images.
140        :param crop_size: the square size of the image crop to get for this event.
141        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
142        :return: crop_size x crop_size crops of the event in the provided frames. If
143        the event is too close to the edge, the crop is zero-padded (black) so that
144        the event stays centered.
145        """
146        # Convert a crop size in micrometers to pixels
147        if not in_pixels:
148            crop_size = round(crop_size / self.scan.pixel_size_um)
149        # Find the crop bounds
150        bounds = [
151            self.x - crop_size // 2,
152            self.y - crop_size // 2,
153            self.x + math.ceil(crop_size / 2),
154            self.y + math.ceil(crop_size / 2),
155        ]
156        # Determine how much the bounds violate the image size
157        displacements = [
158            max(0, -bounds[0]),
159            max(0, -bounds[1]),
160            max(0, bounds[2] - images[0].shape[1]),
161            max(0, bounds[3] - images[0].shape[0]),
162        ]
163        # Cap off the bounds
164        bounds = [
165            max(0, bounds[0]),
166            max(0, bounds[1]),
167            min(images[0].shape[1], bounds[2]),
168            min(images[0].shape[0], bounds[3]),
169        ]
170
171        # Crop the images
172        cropped_images = []
173        for image in images:
174            # Create a blank image of the right size
175            cropped_image = np.zeros((crop_size, crop_size), dtype=image.dtype)
176
177            # Insert the cropped image into the blank image, leaving a black buffer
178            # around the edges if the crop would go beyond the original image bounds
179            cropped_image[
180                displacements[1] : crop_size - displacements[3],
181                displacements[0] : crop_size - displacements[2],
182            ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]]
183            cropped_images.append(cropped_image)
184        return cropped_images

Get the event crops from already-loaded frame images. Does not read from file, so it is very quick when cropping many events from the same tile. Use this if you're interested in many events.

Parameters
  • images: the frame images.
  • crop_size: the square size of the image crop to get for this event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns

crop_size x crop_size crops of the event in the provided frames. If the event is too close to the edge, the crop is zero-padded (black) so that the event stays centered.
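
A usage sketch, assuming `events` is a list of Event objects that share one tile, so the frame images are read once and reused:

    from csi_images.csi_frames import Frame

    frames = Frame.get_frames(events[0].tile)
    images = [frame.get_image() for frame in frames]
    crops_per_event = [event.crop_images(images, crop_size=50) for event in events]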

def extract_images( self, crop_size: int = 100, in_pixels: bool = True) -> list[numpy.ndarray]:
186    def extract_images(
187        self, crop_size: int = 100, in_pixels: bool = True
188    ) -> list[np.ndarray]:
189        """
190        Extract the images from the scan and tile, reading from the file. Called
191        "extract" because it must read and extract the images from file, which is slow.
192        Use this if you're interested in only a few events, as it is inefficient when
193        reading multiple events from the same tile.
194        :param crop_size: the square size of the image crop to get for this event.
195        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
196        :return: a list of cropped images from the scan in the order of the channels.
197        """
198        frames = Frame.get_frames(self.tile)
199        images = [frame.get_image() for frame in frames]
200        return self.crop_images(images, crop_size, in_pixels)

Extract the images from the scan and tile, reading from the file. Called "extract" because it must read and extract the images from file, which is slow. Use this if you're interested in only a few events, as it is inefficient when reading multiple events from the same tile.

Parameters
  • crop_size: the square size of the image crop to get for this event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns

a list of cropped images from the scan in the order of the channels.
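
A usage sketch for one-off events, assuming `event` is an Event whose scan images exist on disk:

    crops = event.extract_images(crop_size=100)                     # 100 px square
    crops_um = event.extract_images(crop_size=25, in_pixels=False)  # 25 um square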

@classmethod
def extract_images_for_list( cls, events: list[typing.Self], crop_size: int | list[int] = None, in_pixels: bool = True) -> list[list[numpy.ndarray]]:
202    @classmethod
203    def extract_images_for_list(
204        cls,
205        events: list[Self],
206        crop_size: int | list[int] = None,
207        in_pixels: bool = True,
208    ) -> list[list[np.ndarray]]:
209        """
210        Get the images for a list of events, ensuring that there is no wasteful reading
211        of the same tile multiple times. This function is more efficient than calling
212        extract_images for each event.
213        TODO: test this function
214        :param events: the events to extract images for.
215        :param crop_size: the square size of the image crop to get for each event.
216                          Defaults to four times the size of each event.
217        :param in_pixels: whether the crop size is in pixels or micrometers.
218                          Defaults to pixels, and is ignored if crop_size is None.
219        :return: a list of lists of cropped images for each event.
220        """
221        if len(events) == 0:
222            return []
223
224        # Populate a crop size if none provided
225        if crop_size is None:
226            crop_size = [4 * event.size for event in events]
227            in_pixels = True
228        # Propagate a constant crop size
229        elif isinstance(crop_size, int):
230            crop_size = [crop_size] * len(events)
231
232        # Sort the events by tile; use a shallow copy to avoid modifying the original
233        order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
234
235        # Allocate the list to size
236        images = [None] * len(events)
237        last_tile = None
238        frame_images = None  # Holds large numpy arrays, so expensive to compare
239        # Iterate through in sorted order
240        for i in order:
241            if last_tile != events[i].tile:
242                # Gather the frame images, preserving them for the next event
243                frames = Frame.get_frames(events[i].tile)
244                frame_images = [frame.get_image() for frame in frames]
245
246                last_tile = events[i].tile
247            # Use the frame images to crop the event images;
248            # assigning to images[i] preserves the original event order
249            images[i] = events[i].crop_images(frame_images, crop_size[i], in_pixels)
250        return images

Get the images for a list of events, ensuring that there is no wasteful reading of the same tile multiple times. This function is more efficient than calling extract_images for each event. TODO: test this function

Parameters
  • events: the events to extract images for.
  • crop_size: the square size of the image crop to get for each event. Defaults to four times the size of each event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels, and is ignored if crop_size is None.
Returns

a list of lists of cropped images for each event.
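
A batch-usage sketch, assuming `events` is a list of Event objects that may span multiple tiles; each tile's frames are read from disk only once:

    from csi_images.csi_events import Event

    all_crops = Event.extract_images_for_list(events, crop_size=75)
    # all_crops[i] holds the per-channel crops for events[i], in original order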

class EventArray:
 253class EventArray:
 254    """
 255    A class that holds a large number of events' data, making it easy to analyze and
 256    manipulate many events at once. A more separated version of the Event class.
 257    """
 258
 259    INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y", "size"]
 260
 261    def __init__(
 262        self,
 263        info: pd.DataFrame = None,
 264        metadata: pd.DataFrame = None,
 265        features: pd.DataFrame = None,
 266    ):
 267        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y", "size"
 268        if info is not None:
 269            if list(info.columns) != self.INFO_COLUMNS:
 270                raise ValueError(
 271                    "EventArray.info must have columns 'slide_id', 'tile', 'roi', 'x', 'y', 'size'"
 272                )
 273            # Copy first to avoid modifying the original
 274            info = info.copy()
 275            # Ensure that the columns are the right types
 276            info["slide_id"] = info["slide_id"].astype(str)
 277            info["tile"] = info["tile"].astype(np.uint16)
 278            info["roi"] = info["roi"].astype(np.uint8)
 279            info["x"] = info["x"].round().astype(np.uint16)
 280            info["y"] = info["y"].round().astype(np.uint16)
 281            info["size"] = info["size"].round().astype(np.uint16)
 282        # All DataFrames must all have the same number of rows
 283        if metadata is not None and (info is None or len(info) != len(metadata)):
 284            raise ValueError(
 285                "If EventArray.metadata is not None, it should match rows with .info"
 286            )
 287        if features is not None and (info is None or len(info) != len(features)):
 288            raise ValueError(
 289                "If EventArray.features is not None, it should match rows with .info"
 290            )
 291        self.info = info
 292        self.metadata = metadata
 293        self.features = features
 294
 295    def __len__(self) -> int:
 296        # Convenience method to get the number of events
 297        if self.info is None:
 298            return 0
 299        else:
 300            return len(self.info)
 301
 302    def __eq__(self, other):
 303        is_equal = True
 304        # Parse all possibilities for info
 305        if isinstance(self.info, pd.DataFrame):
 306            if isinstance(other.info, pd.DataFrame):
 307                is_equal = self.info.equals(other.info)
 308                if not is_equal:
 309                    return False
 310            else:
 311                return False
 312        elif self.info is None:
 313            if other.info is not None:
 314                return False
 315
 316        # Parse all possibilities for metadata
 317        if isinstance(self.metadata, pd.DataFrame):
 318            if isinstance(other.metadata, pd.DataFrame):
 319                is_equal = self.metadata.equals(other.metadata)
 320                if not is_equal:
 321                    return False
 322            else:
 323                return False
 324        elif self.metadata is None:
 325            if other.metadata is not None:
 326                return False
 327
 328        # Parse all possibilities for features
 329        if isinstance(self.features, pd.DataFrame):
 330            if isinstance(other.features, pd.DataFrame):
 331                is_equal = self.features.equals(other.features)
 332                if not is_equal:
 333                    return False
 334            else:
 335                return False
 336        elif self.features is None:
 337            if other.features is not None:
 338                return False
 339
 340        return is_equal
 341
 342    def get_sort_order(self, by: str | list[str], ascending: bool | list[bool] = True):
 343        """
 344        Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.
 345        :param by: name of the column(s) to sort by.
 346        :param ascending: whether to sort in ascending order; can be a list to match by
 347        :return: the order of the indices to sort by.
 348        """
 349        columns = self.get(by)
 350        return columns.sort_values(by=by, ascending=ascending).index
 351
 352    def sort(self, by: str | list[str], ascending: bool | list[bool] = True) -> Self:
 353        """
 354        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
 355        :param by: name of the column(s) to sort by.
 356        :param ascending: whether to sort in ascending order; can be a list to match by
 357        :return: a new, sorted EventArray.
 358        """
 359        order = self.get_sort_order(by, ascending)
 360        info = self.info.loc[order].reset_index(drop=True)
 361        if self.metadata is not None:
 362            metadata = self.metadata.loc[order].reset_index(drop=True)
 363        else:
 364            metadata = None
 365        if self.features is not None:
 366            features = self.features.loc[order].reset_index(drop=True)
 367        else:
 368            features = None
 369        return EventArray(info, metadata, features)
 370
 371    def get(self, column_names: int | str | list[int] | list[str]) -> pd.DataFrame:
 372        """
 373        Get a DataFrame with the specified columns from the EventArray, by value.
 374        :param column_names: the names of the columns to get.
 375        :return: a DataFrame with the specified columns.
 376        """
 377        if isinstance(column_names, int) or isinstance(column_names, str):
 378            column_names = [column_names]
 379        columns = []
 380        for column_name in column_names:
 381            if column_name in self.info.columns:
 382                columns.append(self.info[column_name])
 383            elif self.metadata is not None and column_name in self.metadata.columns:
 384                columns.append(self.metadata[column_name])
 385            elif self.features is not None and column_name in self.features.columns:
 386                columns.append(self.features[column_name])
 387            else:
 388                raise ValueError(f"Column {column_name} not found in EventArray")
 389        return pd.concat(columns, axis=1)
 390
 391    def rows(self, rows) -> Self:
 392        """
 393        Get a subset of the EventArray rows based on a boolean or integer index, by value.
 394        :param rows: the indices to get as a 1D boolean/integer list/array/series
 395        :return: a new EventArray with the subset of events.
 396        """
 397        info = self.info.loc[rows].reset_index(drop=True)
 398        if self.metadata is not None:
 399            metadata = self.metadata.loc[rows].reset_index(drop=True)
 400        else:
 401            metadata = None
 402        if self.features is not None:
 403            features = self.features.loc[rows].reset_index(drop=True)
 404        else:
 405            features = None
 406        return EventArray(info, metadata, features)
 407
 408    def copy(self) -> Self:
 409        """
 410        Create a deep copy of the EventArray.
 411        :return: a deep copy of the EventArray.
 412        """
 413        return EventArray(
 414            info=self.info.copy(),
 415            metadata=None if self.metadata is None else self.metadata.copy(),
 416            features=None if self.features is None else self.features.copy(),
 417        )
 418
 419    def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None:
 420        """
 421        Add metadata to the EventArray. Removes the need to check if metadata is None.
 422        Overwrites any existing metadata with the same column names as the new metadata.
 423        :param new_metadata: the metadata to add.
 424        """
 425        if len(self) != len(new_metadata):
 426            raise ValueError("New metadata must match length of existing info")
 427
 428        if self.metadata is None:
 429            self.metadata = pd.DataFrame(new_metadata)  # Normalize Series to DataFrame
 430        else:
 431            if isinstance(new_metadata, pd.Series):
 432                self.metadata[new_metadata.name] = new_metadata
 433            else:
 434                # It's a DataFrame
 435                self.metadata[new_metadata.columns] = new_metadata
 436
 437    def add_features(self, new_features: pd.Series | pd.DataFrame) -> None:
 438        """
 439        Add features to the EventArray. Removes the need to check if features is None.
 440        Overwrites any existing features with the same column names as the new features.
 441        :param new_features: the features to add.
 442        """
 443        if len(self) != len(new_features):
 444            raise ValueError("New features must match length of existing info")
 445
 446        if self.features is None:
 447            self.features = pd.DataFrame(new_features)  # Normalize Series to DataFrame
 448        else:
 449            if isinstance(new_features, pd.Series):
 450                self.features[new_features.name] = new_features
 451            else:
 452                # It's a DataFrame
 453                self.features[new_features.columns] = new_features
 454
 455    @classmethod
 456    def merge(cls, events: list[Self]) -> Self:
 457        """
 458        Combine EventArrays in a list into a single EventArray.
 459        :param events: the new list of events.
 460        """
 461        all_info = []
 462        all_metadata = []
 463        all_features = []
 464        for event_array in events:
 465            # Skip empty EventArrays
 466            if event_array.info is not None:
 467                all_info.append(event_array.info)
 468            if event_array.metadata is not None:
 469                all_metadata.append(event_array.metadata)
 470            if event_array.features is not None:
 471                all_features.append(event_array.features)
 472        if len(all_info) == 0:
 473            return EventArray()
 474        else:
 475            all_info = pd.concat(all_info, ignore_index=True)
 476        if len(all_metadata) == 0:
 477            all_metadata = None
 478        else:
 479            all_metadata = pd.concat(all_metadata, ignore_index=True)
 480        if len(all_features) == 0:
 481            all_features = None
 482        else:
 483            all_features = pd.concat(all_features, ignore_index=True)
 484
 485        return EventArray(all_info, all_metadata, all_features)
 486
 487    def to_events(
 488        self,
 489        scans: Scan | list[Scan],
 490        ignore_missing_scans=True,
 491        ignore_metadata=False,
 492        ignore_features=False,
 493    ) -> list[Event]:
 494        """
 495        Get the events in the EventArray as a list of Event objects.
 496        :param scans: the scans that the events belong to, auto-matched by slide_id.
 497        Pass an empty list if you don't care about scan metadata (placeholder scans are made).
 498        :param ignore_missing_scans: whether to create placeholder scans for events without scans.
 499        :param ignore_metadata: whether to ignore metadata or not.
 500        :param ignore_features: whether to ignore features or not.
 501        :return: a list of Event objects.
 502        """
 503        if isinstance(scans, Scan):
 504            scans = [scans] * len(self.info)
 505        events = []
 506        for i in range(len(self.info)):
 507            # Determine the associated scan
 508            scan = None
 509            for s in scans:
 510                if s.slide_id == self.info["slide_id"][i]:
 511                    scan = s
 512                    break
 513            if scan is None:
 514                if ignore_missing_scans:
 515                    # Create a placeholder scan if the scan is missing
 516                    scan = Scan.make_placeholder(
 517                        self.info["slide_id"][i],
 518                        self.info["tile"][i],
 519                        self.info["roi"][i],
 520                    )
 521                else:
 522                    raise ValueError(
 523                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
 524                    )
 525            # Prepare the metadata and features
 526            if ignore_metadata or self.metadata is None:
 527                metadata = None
 528            else:
 529                # This Series creation method is less efficient,
 530                # but required for preserving dtypes
 531                metadata = pd.Series(
 532                    {col: self.metadata.loc[i, col] for col in self.metadata.columns},
 533                    dtype=object,
 534                )
 535            if ignore_features or self.features is None:
 536                features = None
 537            else:
 538                features = pd.Series(
 539                    {col: self.features.loc[i, col] for col in self.features.columns},
 540                    dtype=object,
 541                )
 542            # Create the event and append it to the list
 543            events.append(
 544                Event(
 545                    scan,
 546                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
 547                    self.info["x"][i],
 548                    self.info["y"][i],
 549                    size=self.info["size"][i],
 550                    metadata=metadata,
 551                    features=features,
 552                )
 553            )
 554        return events
 555
 556    @classmethod
 557    def from_events(cls, events: list[Event]) -> Self:
 558        """
 559        Create an EventArray from a list of events.
 560        :param events: the list of events to convert.
 561        """
 562        # Return an empty array if we were passed nothing
 563        if events is None or len(events) == 0:
 564            return EventArray()
 565        # Otherwise, grab the info
 566        info = pd.DataFrame(
 567            {
 568                "slide_id": [event.scan.slide_id for event in events],
 569                "tile": [event.tile.n for event in events],
 570                "roi": [event.tile.n_roi for event in events],
 571                "x": [event.x for event in events],
 572                "y": [event.y for event in events],
 573                "size": [event.size for event in events],
 574            }
 575        )
 576        metadata_list = [event.metadata for event in events]
 577        # Iterate through and ensure that all metadata is the same shape
 578        for metadata in metadata_list:
 579            if type(metadata) != type(metadata_list[0]):
 580                raise ValueError("All metadata must be the same type.")
 581            if metadata is not None and metadata.shape != metadata_list[0].shape:
 582                raise ValueError("All metadata must be the same shape.")
 583        if metadata_list[0] is None:
 584            metadata = None
 585        else:
 586            metadata = pd.DataFrame(metadata_list)
 587        features_list = [event.features for event in events]
 588        # Iterate through and ensure that all features are the same shape
 589        for features in features_list:
 590            if type(features) != type(features_list[0]):
 591                raise ValueError("All features must be the same type.")
 592            if features is not None and features.shape != features_list[0].shape:
 593                raise ValueError("All features must be the same shape.")
 594        if features_list[0] is None:
 595            features = None
 596        else:
 597            features = pd.DataFrame(features_list)
 598        return EventArray(info=info, metadata=metadata, features=features)
 599
 600    def to_dataframe(self) -> pd.DataFrame:
 601        """
 602        Convert all the data in the EventArray to a single DataFrame.
 603        :return: a DataFrame with all the data in the EventArray.
 604        """
 605        # Make a copy of the info DataFrame and prepend "info_" to the column names
 606        output = self.info.copy()
 607        output.columns = [f"info_{col}" for col in output.columns]
 608        # Combine with the metadata and prepend "metadata_" to the column names
 609        if self.metadata is not None:
 610            metadata = self.metadata.copy()
 611            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
 612            output = pd.concat([output, metadata], axis=1)
 613        # Combine with the features and prepend "features_" to the column names
 614        if self.features is not None:
 615            features = self.features.copy()
 616            features.columns = [f"features_{col}" for col in features.columns]
 617            output = pd.concat([output, features], axis=1)
 618        return output
 619
 620    @classmethod
 621    def from_dataframe(cls, df) -> Self:
 622        """
 623        Create an EventArray from a single DataFrame in to_dataframe() format.
 624        :return: an EventArray with the data from the DataFrame.
 625        """
 626        # Split the columns into info, metadata, and features and strip prefix
 627        info = df[[col for col in df.columns if col.startswith("info_")]].copy()
 628        info.columns = [col.replace("info_", "") for col in info.columns]
 629        if info.size == 0:
 630            info = None
 631        metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy()
 632        metadata.columns = [col.replace("metadata_", "") for col in metadata.columns]
 633        if metadata.size == 0:
 634            metadata = None
 635        features = df[[col for col in df.columns if col.startswith("features_")]].copy()
 636        features.columns = [col.replace("features_", "") for col in features.columns]
 637        if features.size == 0:
 638            features = None
 639        return cls(info=info, metadata=metadata, features=features)
 640
 641    @classmethod
 642    def from_mask(
 643        cls,
 644        mask: np.ndarray,
 645        slide_id: str,
 646        tile_n: int,
 647        n_roi: int = 0,
 648        include_cell_id: bool = True,
 649        images: list[np.ndarray] = None,
 650        image_labels: list[str] = None,
 651        properties: list[str] = None,
 652    ) -> Self:
 653        """
 654        Extract events from a mask DataFrame, including metadata and features.
 655        :param mask: the mask to extract events from.
 656        :param slide_id: the slide ID the mask is from.
 657        :param tile_n: the tile number the mask is from.
 658        :param n_roi: the ROI number the mask is from.
 659        :param include_cell_id: whether to include the cell_id, or numerical
 660        mask label, as metadata in the EventArray.
 661        :param images: the intensity images to extract features from.
 662        :param image_labels: the labels for the intensity images.
 663        :param properties: list of properties to extract in addition to the defaults.
 664        :return: EventArray corresponding to the mask labels.
 665        """
 666        # Gather mask_info
 667        if images is not None and image_labels is not None:
 668            if len(images) != len(image_labels):
 669                raise ValueError("Intensity images and labels must match lengths.")
 670
 671        mask_info = extract_mask_info(mask, images, image_labels, properties)
 672
 673        if len(mask_info) == 0:
 674            return EventArray()
 675
 676        # Combine provided info and mask info
 677        info = pd.DataFrame(
 678            {
 679                "slide_id": slide_id,
 680                "tile": tile_n,
 681                "roi": n_roi,
 682                "x": mask_info["x"],
 683                "y": mask_info["y"],
 684                "size": mask_info["size"],
 685            },
 686        )
 687        # Extract a metadata column if desired
 688        if include_cell_id:
 689            metadata = pd.DataFrame({"cell_id": mask_info["id"]})
 690        else:
 691            metadata = None
 692        # If any additional properties were extracted, add them as features
 693        mask_info = mask_info.drop(columns=["id", "x", "y", "size"], errors="ignore")
 694        if len(mask_info.columns) > 0:
 695            features = mask_info
 696        else:
 697            features = None
 698        return EventArray(info, metadata, features)
 699
 700    def save_csv(self, output_path: str) -> bool:
 701        """
 702        Save the events to a CSV file, including metadata and features.
 703        :param output_path: the file path to save the CSV to.
 704        :return: whether the file exists after saving.
 705        """
 706        self.to_dataframe().to_csv(output_path, index=False)
 707        return os.path.exists(output_path)
 708
 709    @classmethod
 710    def load_csv(cls, input_path: str) -> Self:
 711        """
 712        Load the events from a CSV file, including metadata and features.
 713        :param input_path: the file path of the CSV to load.
 714        :return: an EventArray with the loaded events.
 715        """
 716        # Load the CSV file
 717        df = pd.read_csv(input_path)
 718        return cls.from_dataframe(df)
 719
 720    def save_hdf5(self, output_path: str) -> bool:
 721        """
 722        Save the events to an HDF5 file, including metadata and features.
 723        Uses the pandas-provided HDF5 functions for ease and external compatibility,
 724        though these files are slightly harder to view in HDFView or similar.
 725        :param output_path: the file path to save the HDF5 file to.
 726        :return: whether the file exists after saving.
 727        """
 728        # Open the output_path as an HDF5 file
 729        with pd.HDFStore(output_path) as store:
 730            # Store the dataframes in the HDF5 file
 731            if self.info is not None:
 732                store.put("info", self.info, index=False)
 733            if self.metadata is not None:
 734                store.put("metadata", self.metadata, index=False)
 735            if self.features is not None:
 736                store.put("features", self.features, index=False)
 737        return os.path.exists(output_path)
 738
 739    @classmethod
 740    def load_hdf5(cls, input_path: str) -> Self:
 741        """
 742        Load the events from an HDF5 file, including metadata and features.
 743        :param input_path: the file path of the HDF5 file to load.
 744        :return: an EventArray with the loaded events.
 745        """
 746        # Open the input_path as an HDF5 file
 747        with pd.HDFStore(input_path) as store:
 748            # Load the dataframes from the HDF5 file
 749            info = store.get("info") if "info" in store else None
 750            metadata = store.get("metadata") if "metadata" in store else None
 751            features = store.get("features") if "features" in store else None
 752        return cls(info=info, metadata=metadata, features=features)
 753
 754    def save_ocular(self, output_path: str, event_type: str = "cells"):
 755        """
 756        Save the events to an OCULAR file. Relies on the dataframe originating
 757        from an OCULAR file (same columns; duplicate metadata/info).
 758        :param output_path: the directory to save the OCULAR files into.
 759        :param event_type: "cells" or "others".
 760        :return: None; files are written into output_path.
 761        """
 762        if pyreadr is None:
 763            raise ModuleNotFoundError(
 764                "pyreadr not installed. Install pyreadr directly "
 765                "or install csi-images with [rds] option to resolve."
 766            )
 767        if event_type == "cells":
 768            file_stub = "rc-final"
 769        elif event_type == "others":
 770            file_stub = "others-final"
 771        else:
 772            raise ValueError("Invalid event type. Must be cells or others.")
 773
 774        # Ensure good metadata
 775        metadata = pd.DataFrame(
 776            {
 777                "slide_id": self.info["slide_id"],
 778                "frame_id": self.info["tile"],
 779                "cellx": self.info["x"],
 780                "celly": self.info["y"],
 781                "cell_id": (
 782                    self.metadata["cell_id"]
 783                    if "cell_id" in self.metadata.columns
 784                    else range(len(self.info))
 785                ),
 786            }
 787        )
 788        if self.metadata is not None:
 789            metadata[self.metadata.columns] = self.metadata.copy()
 790
 791        # Check for the "ocular_interesting" column
 792        if event_type == "cells":
 793            if "ocular_interesting" in metadata.columns:
 794                interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool)
 795            elif "hcpc" in metadata.columns:
 796                # Interesting cells don't get an hcpc designation, leaving them as -1
 797                interesting_rows = (
 798                    metadata["hcpc"].to_numpy() == -1
 799                )  # interesting cells
 800            else:
 801                interesting_rows = []
 802            if sum(interesting_rows) > 0:
 803                # Split the metadata into interesting and regular
 804                interesting_events = self.rows(interesting_rows)
 805                interesting_df = pd.concat(
 806                    [interesting_events.features, interesting_events.metadata], axis=1
 807                )
 808                data_events = self.rows(~interesting_rows)
 809                data_df = pd.concat(
 810                    [data_events.features, data_events.metadata], axis=1
 811                )
 812                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")
 813
 814                # Drop particular columns for "interesting"
 815                interesting_df = interesting_df.drop(
 816                    [
 817                        "clust",
 818                        "hcpc",
 819                        "frame_id",
 820                        "cell_id",
 821                        "unique_id",
 822                        "ocular_interesting",
 823                    ],
 824                    axis=1,
 825                    errors="ignore",
 826                )
 827                # Save both .csv and .rds without clobbering file_stub
 828                interesting_stub = os.path.join(output_path, "ocular_interesting")
 829                interesting_df.to_csv(f"{interesting_stub}.csv")
 830                # Suppress pandas FutureWarning
 831                with warnings.catch_warnings():
 832                    warnings.simplefilter(action="ignore", category=FutureWarning)
 833                    pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df)
 834            else:
 835                data_df = pd.concat([self.features, metadata], axis=1)
 836        else:
 837            # Get all data and reset_index (will copy it)
 838            data_df = pd.concat([self.features, metadata], axis=1)
 839
 840        # Split based on cluster number to conform to *-final[1-4].rds
 841        n_clusters = max(data_df["clust"]) + 1
 842        split_idx = [round(i * n_clusters / 4) for i in range(5)]
 843        for i in range(4):
 844            subset = (split_idx[i] <= data_df["clust"]) & (
 845                data_df["clust"] < split_idx[i + 1]
 846            )
 847            data_df.loc[subset, "hcpc"] = i + 1
 848            subset = data_df[subset].reset_index(drop=True)
 849            # Suppress pandas FutureWarning
 850            with warnings.catch_warnings():
 851                warnings.simplefilter(action="ignore", category=FutureWarning)
 852                pyreadr.write_rds(
 853                    os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
 854                )
 855
 856        # Create new example cell strings
 857        data_df["example_cell_id"] = (
 858            data_df["slide_id"]
 859            + " "
 860            + data_df["frame_id"].astype(str)
 861            + " "
 862            + data_df["cell_id"].astype(str)
 863            + " "
 864            + data_df["cellx"].astype(int).astype(str)
 865            + " "
 866            + data_df["celly"].astype(int).astype(str)
 867        )
 868        # Find averagable data columns
 869        if "cellcluster_id" in data_df.columns:
 870            end_idx = data_df.columns.get_loc("cellcluster_id")
 871        else:
 872            end_idx = data_df.columns.get_loc("slide_id")
 873        avg_cols = data_df.columns[:end_idx].tolist()
 874        # Group by cluster and average
 875        data_df = data_df.groupby("clust").agg(
 876            **{col: (col, "mean") for col in avg_cols},
 877            count=("clust", "size"),  # count rows in each cluster
 878            example_cells=("example_cell_id", lambda x: ",".join(x)),
 879            hcpc=("hcpc", lambda x: x.iloc[0]),
 880        )
 881        data_df = data_df.reset_index()  # Do NOT drop, index is "clust"
 882        # Create new columns
 883        metadata = pd.DataFrame(
 884            {
 885                "count": data_df["count"],
 886                "example_cells": data_df["example_cells"],
 887                "clust": data_df["clust"].astype(int),
 888                "hcpc": data_df["hcpc"].astype(int),
 889                "id": data_df["clust"].astype(int).astype(str),
 890                "cccluster": "0",  # Dummy value
 891                "ccdistance": 0.0,  # Dummy value
 892                "rownum": list(range(len(data_df))),
 893                "framegroup": 0,  # Dummy value
 894            }
 895        )
 896        # Pad the features to 761 columns, as required by OCULAR reports
 897        additional_columns = range(len(avg_cols), 761)
 898        if len(additional_columns) > 0:
 899            padding = pd.DataFrame(
 900                np.zeros((len(data_df), len(additional_columns))),
 901                columns=[f"pad{i}" for i in additional_columns],
 902            )
 903            data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1)
 904        else:
 905            data_df = pd.concat([data_df[avg_cols], metadata], axis=1)
 906
 907        # Save the cluster data
 908        data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"))
 909        # Suppress pandas FutureWarning
 910        with warnings.catch_warnings():
 911            warnings.simplefilter(action="ignore", category=FutureWarning)
 912            pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)
 913
 914    @classmethod
 915    def load_ocular(
 916        cls,
 917        input_path: str,
 918        event_type="cells",
 919        cell_data_files=(
 920            "rc-final1.rds",
 921            "rc-final2.rds",
 922            "rc-final3.rds",
 923            "rc-final4.rds",
 924            "ocular_interesting.rds",
 925        ),
 926        others_data_files=(
 927            "others-final1.rds",
 928            "others-final2.rds",
 929            "others-final3.rds",
 930            "others-final4.rds",
 931        ),
 932        atlas_data_files=(
 933            "ocular_interesting.rds",
 934            "ocular_not_interesting.rds",
 935        ),
 936        drop_common_events=True,
 937        log=None,
 938    ) -> Self:
 939        """
 940        Load events from OCULAR .rds files, including metadata and features.
 941        :param input_path: the OCULAR directory, or a single .rds file, to load from.
 942        :param event_type: "cells" or "others".
 943        :param cell_data_files: file names to load when event_type is "cells".
 944        :param others_data_files: file names to load when event_type is "others".
 945        :param atlas_data_files: file names that may contain common (atlas) events.
 946        :param drop_common_events: whether to drop events classified as common cells.
 947        :param log: optional logger for warnings and debug messages.
 948        :return: an EventArray with the loaded events.
 949        """
 950        if pyreadr is None:
 951            raise ModuleNotFoundError(
 952                "pyreadr not installed. Install pyreadr directly "
 953                "or install csi-images with [rds] option to resolve."
 954            )
 955        # Check if the input path is a directory or a file
 956        if os.path.isfile(input_path):
 957            data_files = [os.path.basename(input_path)]
 958            input_path = os.path.dirname(input_path)
 959        elif event_type == "cells":
 960            data_files = cell_data_files
 961        elif event_type == "others":
 962            data_files = others_data_files
 963        else:
 964            raise ValueError("Invalid event type.")
 965
 966        # Load the data from the OCULAR files
 967        file_data = {}
 968        for file in data_files:
 969            file_path = os.path.join(input_path, file)
 970            if not os.path.isfile(file_path):
 971                if log is not None:
 972                    log.warning(f"{file} not found in {input_path}")
 973                continue
 974            file_data[file] = pyreadr.read_r(file_path)
 975            # Get the DataFrame associated with None (pyreadr dict quirk)
 976            file_data[file] = file_data[file][None]
 977            if len(file_data[file]) == 0:
 978                # File gets dropped from the dict
 979                file_data.pop(file)
 980                if log is not None:
 981                    log.warning(f"{file} has no cells")
 982                continue
 983
 984            if log is not None:
 985                log.debug(f"{file} has {len(file_data[file])} cells")
 986
 987            # Drop common cells if requested and in this file
 988            if file in atlas_data_files and drop_common_events:
 989                common_cell_indices = (
 990                    file_data[file]["catalogue_classification"] == "common_cell"
 991                )
 992                if log is not None:
 993                    log.debug(
 994                        f"Dropping {int(pd.Series.sum(common_cell_indices))}"
 995                        f"common cells from {file}"
 996                    )
 997            file_data[file] = file_data[file][~common_cell_indices]
 998
 999            if len(file_data[file]) == 0:
1000                # File gets dropped from the dict
1001                file_data.pop(file)
1002                if log is not None:
1003                    log.warning(f"{file} has no cells after dropping common cells")
1004                continue
1005
1006            # Extract frame_id and cell_id
1007            # DAPI- events already have frame_id cell_id outside rowname
1008            if event_type == "cells" and "frame_id" not in file_data[file].columns:
1009                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
1010                # get frame_id cell_id from rownames column and split into two columns
1011                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
1012                if len(split_res.columns) != 2 and log is not None:
1013                    log.warning(
1014                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
1015                    )
1016                # then assign it back to the dataframe
1017                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
1018            # reset indexes since they can cause NaN values in concat
1019            file_data[file] = file_data[file].reset_index(drop=True)
1020
1021        # Merge the data from all files
1022        if len(file_data) == 0:
1023            return EventArray()
1024        elif len(file_data) == 1:
1025            data = [file_data[file] for file in file_data.keys()][0]
1026        else:
1027            data = pd.concat(file_data.values())
1028
1029        if log is not None:
1030            log.debug(f"Gathered a total of {len(data)} events")
1031
1032        # Others is missing the "slide_id". Insert it right before "frame_id" column
1033        if event_type == "others" and "slide_id" not in data.columns:
1034            if os.path.basename(input_path) == "ocular":
1035                slide_id = os.path.basename(os.path.dirname(input_path))
1036            else:
1037                slide_id = "UNKNOWN"
1038            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)
1039
1040        # Sort by ascending cell_id so the original event is kept when dropping duplicates
1041        data = data.sort_values(by=["cell_id"], ascending=True)
1042        # Filter out duplicates by x & y
1043        data = data.assign(
1044            unique_id=data["slide_id"]
1045            + "_"
1046            + data["frame_id"].astype(str)
1047            + "_"
1048            + data["cellx"].astype(int).astype(str)
1049            + "_"
1050            + data["celly"].astype(int).astype(str)
1051        )
1052        data = data.drop_duplicates(subset=["unique_id"], keep="first")
1053        # Normal unique_id is with cell_id
1054        data = data.assign(
1055            unique_id=data["slide_id"]
1056            + "_"
1057            + data["frame_id"].astype(str)
1058            + "_"
1059            + data["cell_id"].astype(str)
1060        )
1061        data = data.reset_index(drop=True)
1062        # All columns up to "slide_id" are features; drop the "slide_id"
1063        features = data.loc[:, :"slide_id"].iloc[:, :-1]
1064        data = data.loc[:, "slide_id":]
1065        # Grab the info columns
1066        info = data[["slide_id", "frame_id", "cellx", "celly"]]
1067        info.columns = ["slide_id", "tile", "x", "y"]
1068        info = info.assign(
1069            roi=0,  # OCULAR only works on 1 ROI, as far as we know
1070            size=25,  # Static, for later montaging
1071        )
1072        info = info[["slide_id", "tile", "roi", "x", "y", "size"]]
1073        # Metadata has duplicate columns for later convenience
1074        metadata = data
1075        # Certain columns tend to be problematic with mixed data formats...
1076        for col in ["TRITC", "CY5", "FITC"]:
1077            if col in metadata:
1078                labels = {
1079                    "False": False,
1080                    "True": True,
1081                    "FALSE": False,
1082                    "TRUE": True,
1083                }
1084                metadata[col] = metadata[col].map(labels).astype(bool)
1085        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
1086            if col in metadata:
1087                metadata[col] = metadata[col].fillna(-1).astype(int)
1088        return EventArray(info, metadata, features)

A class that holds a large number of events' data, making it easy to analyze and manipulate many events at once. A more separated version of the Event class.

EventArray( info: pandas.core.frame.DataFrame = None, metadata: pandas.core.frame.DataFrame = None, features: pandas.core.frame.DataFrame = None)
261    def __init__(
262        self,
263        info: pd.DataFrame = None,
264        metadata: pd.DataFrame = None,
265        features: pd.DataFrame = None,
266    ):
267        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y", "size"
268        if info is not None:
269            if list(info.columns) != self.INFO_COLUMNS:
270                raise ValueError(
271                    "EventArray.info must have columns 'slide_id', 'tile', 'roi', 'x', 'y', 'size'"
272                )
273            # Copy first to avoid modifying the original
274            info = info.copy()
275            # Ensure that the columns are the right types
276            info["slide_id"] = info["slide_id"].astype(str)
277            info["tile"] = info["tile"].astype(np.uint16)
278            info["roi"] = info["roi"].astype(np.uint8)
279            info["x"] = info["x"].round().astype(np.uint16)
280            info["y"] = info["y"].round().astype(np.uint16)
281            info["size"] = info["size"].round().astype(np.uint16)
282        # All DataFrames must all have the same number of rows
283        if metadata is not None and (info is None or len(info) != len(metadata)):
284            raise ValueError(
285                "If EventArray.metadata is not None, it should match rows with .info"
286            )
287        if features is not None and (info is None or len(info) != len(features)):
288            raise ValueError(
289                "If EventArray.features is not None, it should match rows with .info"
290            )
291        self.info = info
292        self.metadata = metadata
293        self.features = features
INFO_COLUMNS = ['slide_id', 'tile', 'roi', 'x', 'y', 'size']
info
metadata
features
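
A minimal construction sketch with hypothetical values; the info DataFrame must have exactly the INFO_COLUMNS above, in that order:

    import pandas as pd
    from csi_images.csi_events import EventArray

    info = pd.DataFrame({
        "slide_id": ["SLIDE01", "SLIDE01"],
        "tile": [0, 1],
        "roi": [0, 0],
        "x": [512, 40],
        "y": [384, 60],
        "size": [12, 12],
    })
    events = EventArray(info)
    assert len(events) == 2
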
def get_sort_order(self, by: str | list[str], ascending: bool | list[bool] = True):
342    def get_sort_order(self, by: str | list[str], ascending: bool | list[bool] = True):
343        """
344        Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.
345        :param by: name of the column(s) to sort by.
346        :param ascending: whether to sort in ascending order; can be a list to match by
347        :return: the order of the indices to sort by.
348        """
349        columns = self.get(by)
350        return columns.sort_values(by=by, ascending=ascending).index

Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.

Parameters
  • by: name of the column(s) to sort by.
  • ascending: whether to sort in ascending order; can be a list to match by
Returns

the order of the indices to sort by.

def sort(self, by: str | list[str], ascending: bool | list[bool] = True) -> Self:
352    def sort(self, by: str | list[str], ascending: bool | list[bool] = True) -> Self:
353        """
354        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
355        :param by: name of the column(s) to sort by.
356        :param ascending: whether to sort in ascending order; can be a list to match by
357        :return: a new, sorted EventArray.
358        """
359        order = self.get_sort_order(by, ascending)
360        info = self.info.loc[order].reset_index(drop=True)
361        if self.metadata is not None:
362            metadata = self.metadata.loc[order].reset_index(drop=True)
363        else:
364            metadata = None
365        if self.features is not None:
366            features = self.features.loc[order].reset_index(drop=True)
367        else:
368            features = None
369        return EventArray(info, metadata, features)

Sort the EventArray by column(s) in the info, metadata, or features DataFrames.

Parameters
  • by: name of the column(s) to sort by.
  • ascending: whether to sort in ascending order; can be a list to match by
Returns

a new, sorted EventArray.
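
A usage sketch, assuming `events` is an EventArray like the one built above:

    by_size = events.sort("size", ascending=False)
    by_tile_then_x = events.sort(["tile", "x"], ascending=[True, False])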

def get( self, column_names: int | str | list[int] | list[str]) -> pandas.core.frame.DataFrame:
371    def get(self, column_names: int | str | list[int] | list[str]) -> pd.DataFrame:
372        """
373        Get a DataFrame with the specified columns from the EventArray, by value.
374        :param column_names: the names of the columns to get.
375        :return: a DataFrame with the specified columns.
376        """
377        if isinstance(column_names, int) or isinstance(column_names, str):
378            column_names = [column_names]
379        columns = []
380        for column_name in column_names:
381            if column_name in self.info.columns:
382                columns.append(self.info[column_name])
383            elif self.metadata is not None and column_name in self.metadata.columns:
384                columns.append(self.metadata[column_name])
385            elif self.features is not None and column_name in self.features.columns:
386                columns.append(self.features[column_name])
387            else:
388                raise ValueError(f"Column {column_name} not found in EventArray")
389        return pd.concat(columns, axis=1)

Get a DataFrame with the specified columns from the EventArray, by value.

Parameters
  • column_names: the names of the columns to get.
Returns

a DataFrame with the specified columns.
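Columns may come from any of the three DataFrames in a single call; here dapi_mean is a hypothetical feature column:

    # "x" is pulled from .info, "dapi_mean" from .features
    coords_and_signal = events.get(["x", "dapi_mean"])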

def rows(self, rows) -> Self:
391    def rows(self, rows) -> Self:
392        """
393        Get a subset of the EventArray rows based on a boolean or integer index, by value.
394        :param rows: the indices to get as a 1D boolean/integer list/array/series
395        :return: a new EventArray with the subset of events.
396        """
397        info = self.info.loc[rows].reset_index(drop=True)
398        if self.metadata is not None:
399            metadata = self.metadata.loc[rows].reset_index(drop=True)
400        else:
401            metadata = None
402        if self.features is not None:
403            features = self.features.loc[rows].reset_index(drop=True)
404        else:
405            features = None
406        return EventArray(info, metadata, features)

Get a subset of the EventArray rows based on a boolean or integer index, by value.

Parameters
  • rows: the indices to get as a 1D boolean/integer list/array/series
Returns

a new EventArray with the subset of events.
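A sketch of both indexing styles, assuming an EventArray events:

    # Boolean mask: keep events larger than 10 pixels
    large = events.rows(events.get("size")["size"] > 10)
    # Integer indices also work
    first_two = events.rows([0, 1])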

def copy(self) -> Self:
408    def copy(self) -> Self:
409        """
410        Create a deep copy of the EventArray.
411        :return: a deep copy of the EventArray.
412        """
413        return EventArray(
414            info=self.info.copy(),
415            metadata=None if self.metadata is None else self.metadata.copy(),
416            features=None if self.features is None else self.features.copy(),
417        )

Create a deep copy of the EventArray.

Returns

a deep copy of the EventArray.

def add_metadata( self, new_metadata: pandas.core.series.Series | pandas.core.frame.DataFrame) -> None:
419    def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None:
420        """
421        Add metadata to the EventArray. Removes the need to check if metadata is None.
422        Overwrites any existing metadata with the same column names as the new metadata.
423        :param new_metadata: the metadata to add.
424        """
425        if len(self) != len(new_metadata):
426            raise ValueError("New metadata must match length of existing info")
427
428        if self.metadata is None:
429            self.metadata = new_metadata.to_frame() if isinstance(new_metadata, pd.Series) else new_metadata
430        else:
431            if isinstance(new_metadata, pd.Series):
432                self.metadata[new_metadata.name] = new_metadata
433            else:
434                # It's a DataFrame
435                self.metadata[new_metadata.columns] = new_metadata

Add metadata to the EventArray. Removes the need to check if metadata is None. Overwrites any existing metadata with the same column names as the new metadata.

Parameters
  • new_metadata: the metadata to add.
def add_features( self, new_features: pandas.core.series.Series | pandas.core.frame.DataFrame) -> None:
437    def add_features(self, new_features: pd.Series | pd.DataFrame) -> None:
438        """
439        Add features to the EventArray. Removes the need to check if features is None.
440        Overwrites any existing features with the same column names as the new features.
441        :param new_features: the features to add.
442        """
443        if len(self) != len(new_features):
444            raise ValueError("New features must match length of existing info")
445
446        if self.features is None:
447            self.features = new_features.to_frame() if isinstance(new_features, pd.Series) else new_features
448        else:
449            if isinstance(new_features, pd.Series):
450                self.features[new_features.name] = new_features
451            else:
452                # It's a DataFrame
453                self.features[new_features.columns] = new_features

Add features to the EventArray. Removes the need to check if features is None. Overwrites any existing features with the same column names as the new features.

Parameters
  • new_features: the features to add.
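A short sketch, assuming an EventArray events (the label and score columns are hypothetical):

    import pandas as pd

    n = len(events.info)
    # A named Series becomes a single metadata column
    events.add_metadata(pd.Series(["unknown"] * n, name="label"))
    # A DataFrame adds (or overwrites) multiple feature columns at once
    events.add_features(pd.DataFrame({"score": [0.5] * n}))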
@classmethod
def merge(cls, events: list[typing.Self]) -> Self:
455    @classmethod
456    def merge(cls, events: list[Self]) -> Self:
457        """
458        Combine EventArrays in a list into a single EventArray.
459        :param events: the list of EventArrays to combine.
460        """
461        all_info = []
462        all_metadata = []
463        all_features = []
464        for event_array in events:
465            # Skip empty EventArrays
466            if event_array.info is not None:
467                all_info.append(event_array.info)
468            if event_array.metadata is not None:
469                all_metadata.append(event_array.metadata)
470            if event_array.features is not None:
471                all_features.append(event_array.features)
472        if len(all_info) == 0:
473            return EventArray()
474        else:
475            all_info = pd.concat(all_info, ignore_index=True)
476        if len(all_metadata) == 0:
477            all_metadata = None
478        else:
479            all_metadata = pd.concat(all_metadata, ignore_index=True)
480        if len(all_features) == 0:
481            all_features = None
482        else:
483            all_features = pd.concat(all_features, ignore_index=True)
484
485        return EventArray(all_info, all_metadata, all_features)

Combine EventArrays in a list into a single EventArray.

Parameters
  • events: the list of EventArrays to combine.
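For instance, combining per-tile results (the tile arrays are hypothetical):

    # One EventArray for the whole scan
    combined = EventArray.merge([tile_0_events, tile_1_events])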
def to_events( self, scans: csi_images.csi_scans.Scan | list[csi_images.csi_scans.Scan], ignore_missing_scans=True, ignore_metadata=False, ignore_features=False) -> list[Event]:
487    def to_events(
488        self,
489        scans: Scan | list[Scan],
490        ignore_missing_scans=True,
491        ignore_metadata=False,
492        ignore_features=False,
493    ) -> list[Event]:
494        """
495        Get the events in the EventArray as a list of events.
496        :param scans: the scans that the events belong to, auto-matched by slide_id.
497        If scans are unavailable, pass an empty list and set ignore_missing_scans=True.
498        :param ignore_missing_scans: whether to create placeholder scans for events without a matching scan.
499        :param ignore_metadata: whether to omit metadata from the events
500        :param ignore_features: whether to omit features from the events
501        :return: a list of Event objects.
502        """
503        if isinstance(scans, Scan):
504            scans = [scans] * len(self.info)
505        events = []
506        for i in range(len(self.info)):
507            # Determine the associated scan
508            scan = None
509            for s in scans:
510                if s.slide_id == self.info["slide_id"][i]:
511                    scan = s
512                    break
513            if scan is None:
514                if ignore_missing_scans:
515                    # Create a placeholder scan if the scan is missing
516                    scan = Scan.make_placeholder(
517                        self.info["slide_id"][i],
518                        self.info["tile"][i],
519                        self.info["roi"][i],
520                    )
521                else:
522                    raise ValueError(
523                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
524                    )
525            # Prepare the metadata and features
526            if ignore_metadata or self.metadata is None:
527                metadata = None
528            else:
529                # This Series creation method is less efficient,
530                # but required for preserving dtypes
531                metadata = pd.Series(
532                    {col: self.metadata.loc[i, col] for col in self.metadata.columns},
533                    dtype=object,
534                )
535            if ignore_features or self.features is None:
536                features = None
537            else:
538                features = pd.Series(
539                    {col: self.features.loc[i, col] for col in self.features.columns},
540                    dtype=object,
541                )
542            # Create the event and append it to the list
543            events.append(
544                Event(
545                    scan,
546                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
547                    self.info["x"][i],
548                    self.info["y"][i],
549                    size=self.info["size"][i],
550                    metadata=metadata,
551                    features=features,
552                )
553            )
554        return events

Get the events in the EventArray as a list of events.

Parameters
  • scans: the scans that the events belong to, auto-matched by slide_id. If scans are unavailable, pass an empty list and set ignore_missing_scans=True.
  • ignore_missing_scans: whether to create placeholder scans for events without a matching scan.
  • ignore_metadata: whether to omit metadata from the events
  • ignore_features: whether to omit features from the events
Returns

a list of Event objects.
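A sketch, assuming scan is a Scan whose slide_id matches these events:

    # Expand into individual Event objects for per-event processing
    event_list = events.to_events(scan)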
@classmethod
def from_events(cls, events: list[Event]) -> Self:
556    @classmethod
557    def from_events(cls, events: list[Event]) -> Self:
558        """
559        Create an EventArray from a list of events.
560        :param events: the list of events to convert.
561        """
562        # Return an empty array if we were passed nothing
563        if events is None or len(events) == 0:
564            return EventArray()
565        # Otherwise, grab the info
566        info = pd.DataFrame(
567            {
568                "slide_id": [event.scan.slide_id for event in events],
569                "tile": [event.tile.n for event in events],
570                "roi": [event.tile.n_roi for event in events],
571                "x": [event.x for event in events],
572                "y": [event.y for event in events],
573                "size": [event.size for event in events],
574            }
575        )
576        metadata_list = [event.metadata for event in events]
577        # Iterate through and ensure that all metadata is the same shape
578        for metadata in metadata_list:
579            if type(metadata) != type(metadata_list[0]):
580                raise ValueError("All metadata must be the same type.")
581            if metadata is not None and metadata.shape != metadata_list[0].shape:
582                raise ValueError("All metadata must be the same shape.")
583        if metadata_list[0] is None:
584            metadata = None
585        else:
586            metadata = pd.DataFrame(metadata_list)
587        features_list = [event.features for event in events]
588        # Iterate through and ensure that all features are the same shape
589        for features in features_list:
590            if type(features) != type(features_list[0]):
591                raise ValueError("All features must be the same type.")
592            if features is not None and features.shape != features_list[0].shape:
593                raise ValueError("All features must be the same shape.")
594        if features_list[0] is None:
595            features = None
596        else:
597            features = pd.DataFrame(features_list)
598        return EventArray(info=info, metadata=metadata, features=features)

Create an EventArray from a list of events.

Parameters
  • events: the list of events to convert.
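Together with to_events(), this enables a simple round trip:

    # Rebuild an EventArray from the (possibly modified) Event objects
    rebuilt = EventArray.from_events(event_list)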
def to_dataframe(self) -> pandas.core.frame.DataFrame:
600    def to_dataframe(self) -> pd.DataFrame:
601        """
602        Convert all the data in the EventArray to a single DataFrame.
603        :return: a DataFrame with all the data in the EventArray.
604        """
605        # Make a copy of the info DataFrame and prepend "info_" to the column names
606        output = self.info.copy()
607        output.columns = [f"info_{col}" for col in output.columns]
608        # Combine with the metadata and prepend "metadata_" to the column names
609        if self.metadata is not None:
610            metadata = self.metadata.copy()
611            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
612            output = pd.concat([output, metadata], axis=1)
613        # Combine with the features and prepend "features_" to the column names
614        if self.features is not None:
615            features = self.features.copy()
616            features.columns = [f"features_{col}" for col in features.columns]
617            output = pd.concat([output, features], axis=1)
618        return output

Convert all the data in the EventArray to a single DataFrame.

Returns

a DataFrame with all the data in the EventArray.
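Each column's prefix encodes its origin; a quick sketch (metadata_label and features_score are hypothetical columns):

    df = events.to_dataframe()
    # e.g. ['info_slide_id', ..., 'metadata_label', 'features_score']
    print(df.columns.tolist())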

@classmethod
def from_dataframe(cls, df) -> Self:
620    @classmethod
621    def from_dataframe(cls, df) -> Self:
622        """
623        Create an EventArray from a prefixed DataFrame, as produced by to_dataframe().
624        :return: an EventArray holding the DataFrame's data.
625        """
626        # Split the columns into info, metadata, and features and strip prefix
627        info = df[[col for col in df.columns if col.startswith("info_")]].copy()
628        info.columns = [col.replace("info_", "") for col in info.columns]
629        if info.size == 0:
630            info = None
631        metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy()
632        metadata.columns = [col.replace("metadata_", "") for col in metadata.columns]
633        if metadata.size == 0:
634            metadata = None
635        features = df[[col for col in df.columns if col.startswith("features_")]].copy()
636        features.columns = [col.replace("features_", "") for col in features.columns]
637        if features.size == 0:
638            features = None
639        return cls(info=info, metadata=metadata, features=features)

Create an EventArray from a prefixed DataFrame, as produced by to_dataframe().

Returns

an EventArray holding the DataFrame's data.
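This is the inverse of to_dataframe(), so the two can round-trip in memory:

    # Round-trip through the flat, prefixed representation
    same_events = EventArray.from_dataframe(events.to_dataframe())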

@classmethod
def from_mask( cls, mask: numpy.ndarray, slide_id: str, tile_n: int, n_roi: int = 0, include_cell_id: bool = True, images: list[numpy.ndarray] = None, image_labels: list[str] = None, properties: list[str] = None) -> Self:
641    @classmethod
642    def from_mask(
643        cls,
644        mask: np.ndarray,
645        slide_id: str,
646        tile_n: int,
647        n_roi: int = 0,
648        include_cell_id: bool = True,
649        images: list[np.ndarray] = None,
650        image_labels: list[str] = None,
651        properties: list[str] = None,
652    ) -> Self:
653        """
654        Extract events from a labeled mask image, including metadata and features.
655        :param mask: the mask to extract events from.
656        :param slide_id: the slide ID the mask is from.
657        :param tile_n: the tile number the mask is from.
658        :param n_roi: the ROI number the mask is from.
659        :param include_cell_id: whether to include the cell_id, or numerical
660        mask label, as metadata in the EventArray.
661        :param images: the intensity images to extract features from.
662        :param image_labels: the labels for the intensity images.
663        :param properties: list of properties to extract in addition to the defaults.
664        :return: EventArray corresponding to the mask labels.
665        """
666        # Gather mask_info
667        if images is not None and image_labels is not None:
668            if len(images) != len(image_labels):
669                raise ValueError("Intensity images and labels must match lengths.")
670
671        mask_info = extract_mask_info(mask, images, image_labels, properties)
672
673        if len(mask_info) == 0:
674            return EventArray()
675
676        # Combine provided info and mask info
677        info = pd.DataFrame(
678            {
679                "slide_id": slide_id,
680                "tile": tile_n,
681                "roi": n_roi,
682                "x": mask_info["x"],
683                "y": mask_info["y"],
684                "size": mask_info["size"],
685            },
686        )
687        # Extract a metadata column if desired
688        if include_cell_id:
689            metadata = pd.DataFrame({"cell_id": mask_info["id"]})
690        else:
691            metadata = None
692        # If any additional properties were extracted, add them as features
693        mask_info = mask_info.drop(columns=["id", "x", "y", "size"], errors="ignore")
694        if len(mask_info.columns) > 0:
695            features = mask_info
696        else:
697            features = None
698        return EventArray(info, metadata, features)

Extract events from a labeled mask image, including metadata and features.

Parameters
  • mask: the mask to extract events from.
  • slide_id: the slide ID the mask is from.
  • tile_n: the tile number the mask is from.
  • n_roi: the ROI number the mask is from.
  • include_cell_id: whether to include the cell_id, or numerical mask label, as metadata in the EventArray.
  • images: the intensity images to extract features from.
  • image_labels: the labels for the intensity images.
  • properties: list of properties to extract in addition to the defaults.
Returns

EventArray corresponding to the mask labels.
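A minimal sketch with a synthetic two-label mask (the slide and tile identifiers are made up):

    import numpy as np
    from csi_images.csi_events import EventArray

    mask = np.zeros((100, 100), dtype=np.uint16)
    mask[10:20, 10:20] = 1  # first labeled region
    mask[40:60, 45:55] = 2  # second labeled region
    events = EventArray.from_mask(mask, slide_id="A1", tile_n=42)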

def save_csv(self, output_path: str) -> bool:
700    def save_csv(self, output_path: str) -> bool:
701        """
702        Save the events to a CSV file, including metadata and features.
703        :param output_path: the path of the CSV file to write.
704        :return: whether the file exists after saving.
705        """
706        self.to_dataframe().to_csv(output_path, index=False)
707        return os.path.exists(output_path)

Save the events to a CSV file, including metadata and features.

Parameters
  • output_path: the path of the CSV file to write.
Returns

whether the file exists after saving.
@classmethod
def load_csv(cls, input_path: str) -> Self:
709    @classmethod
710    def load_csv(cls, input_path: str) -> Self:
711        """
712        Load the events from a CSV file, including metadata and features.
713        :param input_path: the path of the CSV file to read.
714        :return: an EventArray with the file's data.
715        """
716        # Load the CSV file
717        df = pd.read_csv(input_path)
718        return cls.from_dataframe(df)

Load the events from a CSV file, including metadata and features.

Parameters
  • input_path: the path of the CSV file to read.
Returns

an EventArray with the file's data.
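A round-trip sketch (the file path is hypothetical; note that pandas re-infers dtypes when reading a CSV):

    assert events.save_csv("events.csv")
    events_again = EventArray.load_csv("events.csv")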
def save_hdf5(self, output_path: str) -> bool:
720    def save_hdf5(self, output_path: str) -> bool:
721        """
722        Save the events to an HDF5 file, including metadata and features.
723        Uses the pandas-provided HDF5 functions for ease, and external compatibility,
724        though these files are slightly harder to view in HDFView or similar.
725        :param output_path: the path of the HDF5 file to write.
726        :return: whether the file exists after saving.
727        """
728        # Open the output_path as an HDF5 file
729        with pd.HDFStore(output_path) as store:
730            # Store the dataframes in the HDF5 file
731            if self.info is not None:
732                store.put("info", self.info, index=False)
733            if self.metadata is not None:
734                store.put("metadata", self.metadata, index=False)
735            if self.features is not None:
736                store.put("features", self.features, index=False)
737        return os.path.exists(output_path)

Save the events to an HDF5 file, including metadata and features. Uses the pandas-provided HDF5 functions for ease, and external compatibility, though these files are slightly harder to view in HDFView or similar.

Parameters
  • output_path: the path of the HDF5 file to write.
Returns

whether the file exists after saving.
@classmethod
def load_hdf5(cls, input_path: str) -> Self:
739    @classmethod
740    def load_hdf5(cls, input_path: str) -> Self:
741        """
742        Load the events from an HDF5 file, including metadata and features.
743        :param input_path: the path of the HDF5 file to read.
744        :return: an EventArray with the file's data.
745        """
746        # Open the input_path as an HDF5 file
747        with pd.HDFStore(input_path) as store:
748            # Load the dataframes from the HDF5 file
749            info = store.get("info") if "info" in store else None
750            metadata = store.get("metadata") if "metadata" in store else None
751            features = store.get("features") if "features" in store else None
752        return cls(info=info, metadata=metadata, features=features)

Load the events from an HDF5 file, including metadata and features.

Parameters
  • input_path: the path of the HDF5 file to read.
Returns

an EventArray with the file's data.
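Unlike the CSV round trip, HDF5 preserves column dtypes; this sketch assumes the PyTables backend for pandas is installed:

    events.save_hdf5("events.h5")
    events_again = EventArray.load_hdf5("events.h5")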
def save_ocular(self, output_path: str, event_type: str = 'cells'):
754    def save_ocular(self, output_path: str, event_type: str = "cells"):
755        """
756        Save the events to OCULAR files. Relies on the DataFrames originating
757        from OCULAR files (same columns; duplicate metadata/info).
758        :param output_path: the directory to write the OCULAR files into.
759        :param event_type: "cells" or "others".
760        :return:
761        """
762        if pyreadr is None:
763            raise ModuleNotFoundError(
764                "pyreadr not installed. Install pyreadr directly "
765                "or install csi-images with [rds] option to resolve."
766            )
767        if event_type == "cells":
768            file_stub = "rc-final"
769        elif event_type == "others":
770            file_stub = "others-final"
771        else:
772            raise ValueError("Invalid event type. Must be cells or others.")
773
774        # Ensure good metadata
775        metadata = pd.DataFrame(
776            {
777                "slide_id": self.info["slide_id"],
778                "frame_id": self.info["tile"],
779                "cellx": self.info["x"],
780                "celly": self.info["y"],
781                "cell_id": (
782                    self.metadata["cell_id"]
783                    if self.metadata is not None and "cell_id" in self.metadata.columns
784                    else range(len(self.info))
785                ),
786            }
787        )
788        if self.metadata is not None:
789            metadata[self.metadata.columns] = self.metadata.copy()
790
791        # Check for the "ocular_interesting" column
792        if event_type == "cells":
793            if "ocular_interesting" in metadata.columns:
794                interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool)
795            elif "hcpc" in metadata.columns:
796                # Interesting cells don't get an hcpc designation, leaving them as -1
797                interesting_rows = (
798                    metadata["hcpc"].to_numpy() == -1
799                )  # interesting cells
800            else:
801                interesting_rows = []
802            if sum(interesting_rows) > 0:
803                # Split the metadata into interesting and regular
804                interesting_events = self.rows(interesting_rows)
805                interesting_df = pd.concat(
806                    [interesting_events.features, interesting_events.metadata], axis=1
807                )
808                data_events = self.rows(~interesting_rows)
809                data_df = pd.concat(
810                    [data_events.features, data_events.metadata], axis=1
811                )
812                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")
813
814                # Drop particular columns for "interesting"
815                interesting_df = interesting_df.drop(
816                    [
817                        "clust",
818                        "hcpc",
819                        "frame_id",
820                        "cell_id",
821                        "unique_id",
822                        "ocular_interesting",
823                    ],
824                    axis=1,
825                    errors="ignore",
826                )
827                # Save both .csv and .rds
828                interesting_stub = os.path.join(output_path, "ocular_interesting")
829                interesting_df.to_csv(f"{interesting_stub}.csv")
830                # Suppress pandas FutureWarning
831                with warnings.catch_warnings():
832                    warnings.simplefilter(action="ignore", category=FutureWarning)
833                    pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df)
834            else:
835                data_df = pd.concat([self.features, metadata], axis=1)
836        else:
837            # Get all data and reset_index (will copy it)
838            data_df = pd.concat([self.features, metadata], axis=1)
839
840        # Split based on cluster number to conform to *-final[1-4].rds
841        n_clusters = max(data_df["clust"]) + 1
842        split_idx = [round(i * n_clusters / 4) for i in range(5)]
843        for i in range(4):
844            subset = (split_idx[i] <= data_df["clust"]) & (
845                data_df["clust"] < split_idx[i + 1]
846            )
847            data_df.loc[subset, "hcpc"] = i + 1
848            subset = data_df[subset].reset_index(drop=True)
849            # Suppress pandas FutureWarning
850            with warnings.catch_warnings():
851                warnings.simplefilter(action="ignore", category=FutureWarning)
852                pyreadr.write_rds(
853                    os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
854                )
855
856        # Create new example cell strings
857        data_df["example_cell_id"] = (
858            data_df["slide_id"]
859            + " "
860            + data_df["frame_id"].astype(str)
861            + " "
862            + data_df["cell_id"].astype(str)
863            + " "
864            + data_df["cellx"].astype(int).astype(str)
865            + " "
866            + data_df["celly"].astype(int).astype(str)
867        )
868        # Find averagable data columns
869        if "cellcluster_id" in data_df.columns:
870            end_idx = data_df.columns.get_loc("cellcluster_id")
871        else:
872            end_idx = data_df.columns.get_loc("slide_id")
873        avg_cols = data_df.columns[:end_idx].tolist()
874        # Group by cluster and average
875        data_df = data_df.groupby("clust").agg(
876            **{col: (col, "mean") for col in avg_cols},
877            count=("clust", "size"),  # count rows in each cluster
878            example_cells=("example_cell_id", lambda x: ",".join(x)),
879            hcpc=("hcpc", lambda x: x.iloc[0]),
880        )
881        data_df = data_df.reset_index()  # Do NOT drop, index is "clust"
882        # Create new columns
883        metadata = pd.DataFrame(
884            {
885                "count": data_df["count"],
886                "example_cells": data_df["example_cells"],
887                "clust": data_df["clust"].astype(int),
888                "hcpc": data_df["hcpc"].astype(int),
889                "id": data_df["clust"].astype(int).astype(str),
890                "cccluster": "0",  # Dummy value
891                "ccdistance": 0.0,  # Dummy value
892                "rownum": list(range(len(data_df))),
893                "framegroup": 0,  # Dummy value
894            }
895        )
896        # Pad the features out to 761 columns, as required by OCULAR reports
897        additional_columns = range(len(avg_cols), 761)
898        if len(additional_columns) > 0:
899            padding = pd.DataFrame(
900                np.zeros((len(data_df), len(additional_columns))),
901                columns=[f"pad{i}" for i in additional_columns],
902            )
903            data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1)
904        else:
905            data_df = pd.concat([data_df[avg_cols], metadata], axis=1)
906
907        # Save the cluster data
908        data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"))
909        # Suppress pandas FutureWarning
910        with warnings.catch_warnings():
911            warnings.simplefilter(action="ignore", category=FutureWarning)
912            pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)

Save the events to OCULAR files. Relies on the DataFrames originating from OCULAR files (same columns; duplicate metadata/info).

Parameters
  • output_path: the directory to write the OCULAR files into.
  • event_type: "cells" or "others".
Returns
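A sketch, assuming events originally came from OCULAR data (so columns like clust and hcpc are present) and that the output directory exists; pyreadr is required:

    # Writes rc-final[1-4].rds/.csv (and ocular_interesting files, if any)
    events.save_ocular("ocular_out", event_type="cells")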
@classmethod
def load_ocular( cls, input_path: str, event_type='cells', cell_data_files=('rc-final1.rds', 'rc-final2.rds', 'rc-final3.rds', 'rc-final4.rds', 'ocular_interesting.rds'), others_data_files=('others-final1.rds', 'others-final2.rds', 'others-final3.rds', 'others-final4.rds'), atlas_data_files=('ocular_interesting.rds', 'ocular_not_interesting.rds'), drop_common_events=True, log=None) -> Self:
 914    @classmethod
 915    def load_ocular(
 916        cls,
 917        input_path: str,
 918        event_type="cells",
 919        cell_data_files=(
 920            "rc-final1.rds",
 921            "rc-final2.rds",
 922            "rc-final3.rds",
 923            "rc-final4.rds",
 924            "ocular_interesting.rds",
 925        ),
 926        others_data_files=(
 927            "others-final1.rds",
 928            "others-final2.rds",
 929            "others-final3.rds",
 930            "others-final4.rds",
 931        ),
 932        atlas_data_files=(
 933            "ocular_interesting.rds",
 934            "ocular_not_interesting.rds",
 935        ),
 936        drop_common_events=True,
 937        log=None,
 938    ) -> Self:
 939        """
 940        Load events from OCULAR .rds files, including metadata and features.
 941        :param input_path: the OCULAR directory, or a single .rds file, to load.
 942        :param event_type: "cells" or "others".
 943        :param cell_data_files: file names to load when event_type is "cells".
 944        :param others_data_files: file names to load when event_type is "others".
 945        :param atlas_data_files: file names subject to dropping common events.
 946        :param drop_common_events: whether to drop events classified as common cells.
 947        :param log: optional logger for progress and warnings.
 948        :return: an EventArray with the files' data.
 949        """
 950        if pyreadr is None:
 951            raise ModuleNotFoundError(
 952                "pyreadr not installed. Install pyreadr directly "
 953                "or install csi-images with [rds] option to resolve."
 954            )
 955        # Check if the input path is a directory or a file
 956        if os.path.isfile(input_path):
 957            data_files = [os.path.basename(input_path)]
 958            input_path = os.path.dirname(input_path)
 959        elif event_type == "cells":
 960            data_files = cell_data_files
 961        elif event_type == "others":
 962            data_files = others_data_files
 963        else:
 964            raise ValueError("Invalid event type.")
 965
 966        # Load the data from the OCULAR files
 967        file_data = {}
 968        for file in data_files:
 969            file_path = os.path.join(input_path, file)
 970            if not os.path.isfile(file_path):
 971                if log is not None:
 972                    log.warning(f"{file} not found in {input_path}")
 973                continue
 974            file_data[file] = pyreadr.read_r(file_path)
 975            # Get the DataFrame associated with None (pyreadr dict quirk)
 976            file_data[file] = file_data[file][None]
 977            if len(file_data[file]) == 0:
 978                # File gets dropped from the dict
 979                file_data.pop(file)
 980                if log is not None:
 981                    log.warning(f"{file} has no cells")
 982                continue
 983
 984            if log is not None:
 985                log.debug(f"{file} has {len(file_data[file])} cells")
 986
 987            # Drop common cells if requested and in this file
 988            if file in atlas_data_files and drop_common_events:
 989                common_cell_indices = (
 990                    file_data[file]["catalogue_classification"] == "common_cell"
 991                )
 992                if log is not None:
 993                    log.debug(
 994                        f"Dropping {int(common_cell_indices.sum())} "
 995                        f"common cells from {file}"
 996                    )
 997                file_data[file] = file_data[file][~common_cell_indices]
 998
 999            if len(file_data[file]) == 0:
1000                # File gets dropped from the dict
1001                file_data.pop(file)
1002                if log is not None:
1003                    log.warning(f"{file} has no cells after dropping common cells")
1004                continue
1005
1006            # Extract frame_id and cell_id
1007            # DAPI- events already have frame_id cell_id outside rowname
1008            if event_type == "cells" and "frame_id" not in file_data[file].columns:
1009                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
1010                # get frame_id cell_id from rownames column and split into two columns
1011                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
1012                if len(split_res.columns) != 2 and log is not None:
1013                    log.warning(
1014                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
1015                    )
1016                # then assign it back to the dataframe
1017                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
1018            # reset indexes since they can cause NaN values in concat
1019            file_data[file] = file_data[file].reset_index(drop=True)
1020
1021        # Merge the data from all files
1022        if len(file_data) == 0:
1023            return EventArray()
1024        elif len(file_data) == 1:
1025            data = next(iter(file_data.values()))
1026        else:
1027            data = pd.concat(file_data.values())
1028
1029        if log is not None:
1030            log.debug(f"Gathered a total of {len(data)} events")
1031
1032        # Others is missing the "slide_id". Insert it right before "frame_id" column
1033        if event_type == "others" and "slide_id" not in data.columns:
1034            if os.path.basename(input_path) == "ocular":
1035                slide_id = os.path.basename(os.path.dirname(input_path))
1036            else:
1037                slide_id = "UNKNOWN"
1038            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)
1039
1040        # Sort according to ascending cell_id to keep the original, which is in manual_df
1041        data = data.sort_values(by=["cell_id"], ascending=True)
1042        # Filter out duplicates by x & y
1043        data = data.assign(
1044            unique_id=data["slide_id"]
1045            + "_"
1046            + data["frame_id"].astype(str)
1047            + "_"
1048            + data["cellx"].astype(int).astype(str)
1049            + "_"
1050            + data["celly"].astype(int).astype(str)
1051        )
1052        data = data.drop_duplicates(subset=["unique_id"], keep="first")
1053        # Normal unique_id is with cell_id
1054        data = data.assign(
1055            unique_id=data["slide_id"]
1056            + "_"
1057            + data["frame_id"].astype(str)
1058            + "_"
1059            + data["cell_id"].astype(str)
1060        )
1061        data = data.reset_index(drop=True)
1062        # All columns up to "slide_id" are features; drop the "slide_id"
1063        features = data.loc[:, :"slide_id"].iloc[:, :-1]
1064        data = data.loc[:, "slide_id":]
1065        # Grab the info columns
1066        info = data[["slide_id", "frame_id", "cellx", "celly"]]
1067        info.columns = ["slide_id", "tile", "x", "y"]
1068        info = info.assign(
1069            roi=0,  # OCULAR only works on 1 ROI, as far as we know
1070            size=25,  # Static, for later montaging
1071        )
1072        info = info[["slide_id", "tile", "roi", "x", "y", "size"]]
1073        # Metadata has duplicate columns for later convenience
1074        metadata = data
1075        # Certain columns tend to be problematic with mixed data formats...
1076        for col in ["TRITC", "CY5", "FITC"]:
1077            if col in metadata:
1078                labels = {
1079                    "False": False,
1080                    "True": True,
1081                    "FALSE": False,
1082                    "TRUE": True,
1083                }
1084                metadata[col] = metadata[col].map(labels).astype(bool)
1085        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
1086            if col in metadata:
1087                metadata[col] = metadata[col].fillna(-1).astype(int)
1088        return EventArray(info, metadata, features)
Load events from OCULAR .rds files, including metadata and features.

Parameters
  • input_path: the OCULAR directory, or a single .rds file, to load.
  • event_type: "cells" or "others".
  • cell_data_files: file names to load when event_type is "cells".
  • others_data_files: file names to load when event_type is "others".
  • atlas_data_files: file names subject to dropping common events.
  • drop_common_events: whether to drop events classified as common cells.
  • log: optional logger for progress and warnings.
Returns

an EventArray with the files' data.
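A sketch with a hypothetical path; pyreadr is required:

    events = EventArray.load_ocular("/path/to/scan/ocular", event_type="cells")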