csi_images.csi_events

Contains the Event and EventArray classes. An Event represents a single event in a scan and optionally holds metadata and features; lists of events with similar metadata or features can be combined into an EventArray, which stores the data in DataFrames for analysis.

The Event class holds the position of the event within the frame, which can be converted to scanner or slide coordinates. See the csi_images.csi_scans documentation page for more information on the coordinate systems.
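For orientation, here is a minimal, hedged sketch of the coordinate conversion. It assumes a `scan` object has already been loaded via the `csi_images.csi_scans` API, and that `Tile(scan, n)` addresses tile `n` of the scan with a default ROI (see `csi_images.csi_tiles`):

```python
from csi_images.csi_events import Event
from csi_images.csi_tiles import Tile

# `scan` is assumed to be a Scan loaded via the csi_images.csi_scans API
tile = Tile(scan, 500)                   # tile number within the scan
event = Event(scan, tile, x=100, y=200)  # x, y in pixels within the tile's frame

print(event.get_scan_position())   # (x_um, y_um) in the scanner's coordinate frame
print(event.get_slide_position())  # (x_um, y_um) in slide coordinates
```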

   1"""
   2Contains the Event and EventArray classes. An Event represents a single event
   3in a scan and optionally holds metadata and features. Lists of events with
   4similar metadata or features can be combined into an EventArray for analysis.
   5
   6The Event class holds the position of the event within the frame, which can be
   7converted to scanner or slide coordinates. See the
   8csi_images.csi_scans documentation page for more information on the coordinate systems.
   9"""
  10
  11import os
  12import math
  13import warnings
  14from typing import Self
  15
  16import numpy as np
  17import pandas as pd
  18
  19from .csi_scans import Scan
  20from .csi_tiles import Tile
  21from .csi_frames import Frame
  22
  23# Optional dependencies; will raise errors in particular functions if not installed
  24try:
  25    from .csi_images import extract_mask_info
  26except ImportError:
  27    extract_mask_info = None
  28try:
  29    import pyreadr
  30except ImportError:
  31    pyreadr = None
  32
  33
  34class Event:
  35    """
  36    A class that represents a single event in a scan, making it easy to evaluate
  37    singular events. Required metadata is exposed as attributes, and optional
  38    metadata and features are stored as DataFrames.
  39    """
  40
  41    SCAN_TO_SLIDE_TRANSFORM = {
  42        # Axioscan zero is in the top-right corner instead of top-left
  43        Scan.Type.AXIOSCAN7: np.array(
  44            [
  45                [1, 0, 75000],
  46                [0, 1, 0],
  47                [0, 0, 1],
  48            ]
  49        ),
  50        # BZScanner coordinates are a special kind of messed up:
  51        # - The slide is upside-down.
  52        # - The slide is oriented vertically, with the barcode at the bottom.
  53        # - Tiles are numbered from the top-right
  54        Scan.Type.BZSCANNER: np.array(
  55            [
  56                [0, -1, 75000],
  57                [-1, 0, 25000],
  58                [0, 0, 1],
  59            ]
  60        ),
  61    }
  62    """
  63    Homogeneous transformation matrices for converting between scanner and slide
  64    coordinates. The matrices are 3x3, with the final column representing the
  65    translation in micrometers (um). For more information, see 
  66    [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations).
  67    
  68    Transformations are nominal, and accuracy is not guaranteed; this is due to 
  69    imperfections in slides and alignment in the scanners. Units are in micrometers.
  70    """
  71
  72    def __init__(
  73        self,
  74        scan: Scan,
  75        tile: Tile,
  76        x: int,
  77        y: int,
  78        size: int = 12,  # End-to-end size in pixels
  79        metadata: pd.Series = None,
  80        features: pd.Series = None,
  81    ):
  82        self.scan = scan
  83        self.tile = tile
  84        self.x = int(x)
  85        self.y = int(y)
  86        self.size = int(size)
  87        self.metadata = metadata
  88        self.features = features
  89
  90    def __repr__(self) -> str:
  91        return f"{self.scan.slide_id}-{self.tile.n}-{self.x}-{self.y}"
  92
  93    def __eq__(self, other) -> bool:
  94        return self.__repr__() == other.__repr__()
  95
  96    def __lt__(self, other):
  97        return self.__repr__() < other.__repr__()
  98
  99    def get_scan_position(self) -> tuple[float, float]:
 100        """
 101        Get the position of the event in the scanner's coordinate frame.
 102        :return: the scan position of the event in micrometers (um).
 103        """
 104        # Get overall pixel position
 105        pixel_x = self.x + (self.scan.tile_width_px * self.tile.x)
 106        pixel_y = self.y + (self.scan.tile_height_px * self.tile.y)
 107        # Convert to micrometers
 108        x_um = pixel_x * self.scan.pixel_size_um
 109        y_um = pixel_y * self.scan.pixel_size_um
 110        # Add the scan's origin in the scanner frame
 111        x_um += self.scan.roi[self.tile.n_roi].origin_x_um
 112        y_um += self.scan.roi[self.tile.n_roi].origin_y_um
 113        return x_um, y_um
 114
 115    def get_slide_position(self) -> tuple[float, float]:
 116        """
 117        Get the slide position of the event in micrometers (um).
 118        :return: the slide position of the event.
 119        """
 120        # Turn scan_position into a 3x1 vector
 121        scan_position = self.get_scan_position()
 122        scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]])
 123
 124        # Multiply by the appropriate homogeneous matrix
 125        if self.scan.scanner_id.startswith(self.scan.Type.AXIOSCAN7.value):
 126            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.AXIOSCAN7]
 127        elif self.scan.scanner_id.startswith(self.scan.Type.BZSCANNER.value):
 128            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.BZSCANNER]
 129        else:
 130            raise ValueError(f"Scanner type {self.scan.scanner_id} not supported.")
 131        slide_position = np.matmul(transform, scan_position)
 132        return float(slide_position[0][0]), float(slide_position[1][0])
 133
 134    def crop_images(
 135        self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True
 136    ) -> list[np.ndarray]:
 137        """
 138        Crop the event out of already-loaded frame images. Does not read from file,
 139        so it is very quick when cropping multiple events from the same tile.
 140        Use this if you're interested in many events.
 141        :param images: the frame images.
 142        :param crop_size: the square size of the image crop to get for this event.
 143        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
 144        :return: crop_size x crop_size crops of the event in the provided frames. If
 145        the event is too close to the edge, the crop is zero-padded (black border)
 146        rather than shrunk, leaving the event off-center.
 147        """
 148        # Convert a crop size in micrometers to pixels
 149        if not in_pixels:
 150            crop_size = round(crop_size / self.scan.pixel_size_um)
 151        # Find the crop bounds
 152        bounds = [
 153            self.x - crop_size // 2,
 154            self.y - crop_size // 2,
 155            self.x + math.ceil(crop_size / 2),
 156            self.y + math.ceil(crop_size / 2),
 157        ]
 158        # Determine how much the bounds violate the image size
 159        displacements = [
 160            max(0, -bounds[0]),
 161            max(0, -bounds[1]),
 162            max(0, bounds[2] - images[0].shape[1]),
 163            max(0, bounds[3] - images[0].shape[0]),
 164        ]
 165        # Cap off the bounds
 166        bounds = [
 167            max(0, bounds[0]),
 168            max(0, bounds[1]),
 169            min(images[0].shape[1], bounds[2]),
 170            min(images[0].shape[0], bounds[3]),
 171        ]
 172
 173        # Crop the images
 174        cropped_images = []
 175        for image in images:
 176            # Create a blank image of the right size
 177            cropped_image = np.zeros((crop_size, crop_size), dtype=image.dtype)
 178
 179            # Insert the cropped image into the blank image, leaving a black buffer
 180            # around the edges if the crop would go beyond the original image bounds
 181            cropped_image[
 182                displacements[1] : crop_size - displacements[3],
 183                displacements[0] : crop_size - displacements[2],
 184            ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]]
 185            cropped_images.append(cropped_image)
 186        return cropped_images
 187
 188    def extract_images(
 189        self, crop_size: int = 100, in_pixels: bool = True
 190    ) -> list[np.ndarray]:
 191        """
 192        Extract the images from the scan and tile, reading from the file. Called
 193        "extract" because it must read and extract the images from file, which is slow.
 194        Use this if you're interested in only a few events, as it is inefficient when
 195        reading multiple events from the same tile.
 196        :param crop_size: the square size of the image crop to get for this event.
 197        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
 198        :return: a list of cropped images from the scan in the order of the channels.
 199        """
 200        frames = Frame.get_frames(self.tile)
 201        images = [frame.get_image() for frame in frames]
 202        return self.crop_images(images, crop_size, in_pixels)
 203
 204    @classmethod
 205    def extract_images_for_list(
 206        cls,
 207        events: list[Self],
 208        crop_size: int | list[int] = None,
 209        in_pixels: bool = True,
 210    ) -> list[list[np.ndarray]]:
 211        """
 212        Get the images for a list of events, ensuring that there is no wasteful reading
 213        of the same tile multiple times. This function is more efficient than calling
 214        extract_images for each event.
 215        :param events: the events to extract images for.
 216        :param crop_size: the square size of the image crop to get for this event.
 217                          Defaults to four times the size of the event.
 218        :param in_pixels: whether the crop size is in pixels or micrometers.
 219                          Defaults to pixels, and is ignored if crop_size is None.
 220        :return: a list of lists of cropped images for each event.
 221        """
 222        if len(events) == 0:
 223            return []
 224
 225        # Populate a crop size if none provided
 226        if crop_size is None:
 227            crop_size = [4 * event.size for event in events]
 228            in_pixels = True
 229        # Propagate a constant crop size
 230        elif isinstance(crop_size, int):
 231            crop_size = [crop_size] * len(events)
 232
 233        # Compute a tile-sorted traversal order; sorted() copies, leaving events untouched
 234        order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
 235
 236        # Allocate the list to size
 237        images = [None] * len(events)
 238        last_tile = None
 239        frame_images = None  # Holds large numpy arrays, so expensive to compare
 240        # Iterate through in sorted order
 241        for i in order:
 242            if last_tile != events[i].tile:
 243                # Gather the frame images, preserving them for the next event
 244                frames = Frame.get_frames(events[i].tile)
 245                frame_images = [frame.get_image() for frame in frames]
 246
 247                last_tile = events[i].tile
 248            # Use the frame images to crop the event images
 249            # Assigning to images[i] preserves the original event order
 250            images[i] = events[i].crop_images(frame_images, crop_size[i], in_pixels)
 251        return images
 252
 253
 254class EventArray:
 255    """
 256    A class that holds a large number of events' data, making it easy to analyze and
 257    manipulate many events at once; a column-oriented counterpart to the Event class.
 258    """
 259
 260    INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y", "size"]
 261
 262    def __init__(
 263        self,
 264        info: pd.DataFrame = None,
 265        metadata: pd.DataFrame = None,
 266        features: pd.DataFrame = None,
 267    ):
 268        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y", "size"
 269        if info is not None:
 270            if list(info.columns) != self.INFO_COLUMNS:
 271                raise ValueError(
 272                    "EventArray.info must have columns 'slide_id', 'tile', 'roi', 'x', 'y', 'size'"
 273                )
 274            # Copy first to avoid modifying the original
 275            info = info.copy()
 276            # Ensure that the columns are the right types
 277            info["slide_id"] = info["slide_id"].astype(str)
 278            info["tile"] = info["tile"].astype(np.uint16)
 279            info["roi"] = info["roi"].astype(np.uint8)
 280            info["x"] = info["x"].round().astype(np.uint16)
 281            info["y"] = info["y"].round().astype(np.uint16)
 282            info["size"] = info["size"].round().astype(np.uint16)
 283        # All DataFrames must all have the same number of rows
 284        if metadata is not None and (info is None or len(info) != len(metadata)):
 285            raise ValueError(
 286                "If EventArray.metadata is not None, it should match rows with .info"
 287            )
 288        if features is not None and (info is None or len(info) != len(features)):
 289            raise ValueError(
 290                "If EventArray.features is not None, it should match rows with .info"
 291            )
 292        self.info = info
 293        self.metadata = metadata
 294        self.features = features
 295
 296    def __len__(self) -> int:
 297        # Convenience method to get the number of events
 298        if self.info is None:
 299            return 0
 300        else:
 301            return len(self.info)
 302
 303    def __eq__(self, other):
 304        is_equal = True
 305        # Parse all possibilities for info
 306        if isinstance(self.info, pd.DataFrame):
 307            if isinstance(other.info, pd.DataFrame):
 308                is_equal = self.info.equals(other.info)
 309                if not is_equal:
 310                    return False
 311            else:
 312                return False
 313        elif self.info is None:
 314            if other.info is not None:
 315                return False
 316
 317        # Parse all possibilities for metadata
 318        if isinstance(self.metadata, pd.DataFrame):
 319            if isinstance(other.metadata, pd.DataFrame):
 320                is_equal = self.metadata.equals(other.metadata)
 321                if not is_equal:
 322                    return False
 323            else:
 324                return False
 325        elif self.metadata is None:
 326            if other.metadata is not None:
 327                return False
 328
 329        # Parse all possibilities for features
 330        if isinstance(self.features, pd.DataFrame):
 331            if isinstance(other.features, pd.DataFrame):
 332                is_equal = self.features.equals(other.features)
 333                if not is_equal:
 334                    return False
 335            else:
 336                return False
 337        elif self.features is None:
 338            if other.features is not None:
 339                return False
 340
 341        return is_equal
 342
 343    def get_sort_order(self, by: str | list[str], ascending: bool | list[bool] = True):
 344        """
 345        Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.
 346        :param by: name of the column(s) to sort by.
 347        :param ascending: whether to sort in ascending order; can be a list to match by
 348        :return: the order of the indices to sort by.
 349        """
 350        columns = self.get(by)
 351        return columns.sort_values(by=by, ascending=ascending).index
 352
 353    def sort(self, by: str | list[str], ascending: bool | list[bool] = True) -> Self:
 354        """
 355        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
 356        :param by: name of the column(s) to sort by.
 357        :param ascending: whether to sort in ascending order; can be a list to match by
 358        :return: a new, sorted EventArray.
 359        """
 360        order = self.get_sort_order(by, ascending)
 361        info = self.info.loc[order].reset_index(drop=True)
 362        if self.metadata is not None:
 363            metadata = self.metadata.loc[order].reset_index(drop=True)
 364        else:
 365            metadata = None
 366        if self.features is not None:
 367            features = self.features.loc[order].reset_index(drop=True)
 368        else:
 369            features = None
 370        return EventArray(info, metadata, features)
 371
 372    def get(self, column_names: int | str | list[int] | list[str]) -> pd.DataFrame:
 373        """
 374        Get a DataFrame with the specified columns from the EventArray, by value.
 375        :param column_names: the names of the columns to get.
 376        :return: a DataFrame with the specified columns.
 377        """
 378        if isinstance(column_names, int) or isinstance(column_names, str):
 379            column_names = [column_names]
 380        columns = []
 381        for column_name in column_names:
 382            if column_name in self.info.columns:
 383                columns.append(self.info[column_name])
 384            elif self.metadata is not None and column_name in self.metadata.columns:
 385                columns.append(self.metadata[column_name])
 386            elif self.features is not None and column_name in self.features.columns:
 387                columns.append(self.features[column_name])
 388            else:
 389                raise ValueError(f"Column {column_name} not found in EventArray")
 390        return pd.concat(columns, axis=1)
 391
 392    def rows(self, rows) -> Self:
 393        """
 394        Get a subset of the EventArray rows based on a boolean or integer index, by value.
 395        :param rows: the indices to get as a 1D boolean/integer list/array/series
 396        :return: a new EventArray with the subset of events.
 397        """
 398        info = self.info.loc[rows].reset_index(drop=True)
 399        if self.metadata is not None:
 400            metadata = self.metadata.loc[rows].reset_index(drop=True)
 401        else:
 402            metadata = None
 403        if self.features is not None:
 404            features = self.features.loc[rows].reset_index(drop=True)
 405        else:
 406            features = None
 407        return EventArray(info, metadata, features)
 408
 409    def copy(self) -> Self:
 410        """
 411        Create a deep copy of the EventArray.
 412        :return: a deep copy of the EventArray.
 413        """
 414        return EventArray(
 415            info=self.info.copy(),
 416            metadata=None if self.metadata is None else self.metadata.copy(),
 417            features=None if self.features is None else self.features.copy(),
 418        )
 419
 420    def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None:
 421        """
 422        Add metadata to the EventArray. Removes the need to check if metadata is None.
 423        Overwrites any existing metadata with the same column names as the new metadata.
 424        :param new_metadata: the metadata to add.
 425        """
 426        if len(self) != len(new_metadata):
 427            raise ValueError("New metadata must match length of existing info")
 428
 429        if self.metadata is None:
 430            # Coerce to a DataFrame so Series inputs behave like a single column
 431            self.metadata = pd.DataFrame(new_metadata)
 432        elif isinstance(new_metadata, pd.Series):
 433            self.metadata[new_metadata.name] = new_metadata
 434        else:
 435            # It's a DataFrame
 436            self.metadata[new_metadata.columns] = new_metadata
 437
 438    def add_features(self, new_features: pd.Series | pd.DataFrame) -> None:
 439        """
 440        Add features to the EventArray. Removes the need to check if features is None.
 441        Overwrites any existing features with the same column names as the new features.
 442        :param new_features: the features to add.
 443        """
 444        if len(self) != len(new_features):
 445            raise ValueError("New features must match length of existing info")
 446
 447        if self.features is None:
 448            # Coerce to a DataFrame so Series inputs behave like a single column
 449            self.features = pd.DataFrame(new_features)
 450        elif isinstance(new_features, pd.Series):
 451            self.features[new_features.name] = new_features
 452        else:
 453            # It's a DataFrame
 454            self.features[new_features.columns] = new_features
 455
 456    @classmethod
 457    def merge(cls, events: list[Self]) -> Self:
 458        """
 459        Combine EventArrays in a list into a single EventArray.
 460        :param events: the list of EventArrays to combine into one.
 461        """
 462        all_info = []
 463        all_metadata = []
 464        all_features = []
 465        for event_array in events:
 466            # Skip empty EventArrays
 467            if event_array.info is not None:
 468                all_info.append(event_array.info)
 469            if event_array.metadata is not None:
 470                all_metadata.append(event_array.metadata)
 471            if event_array.features is not None:
 472                all_features.append(event_array.features)
 473        if len(all_info) == 0:
 474            return EventArray()
 475        else:
 476            all_info = pd.concat(all_info, ignore_index=True)
 477        if len(all_metadata) == 0:
 478            all_metadata = None
 479        else:
 480            all_metadata = pd.concat(all_metadata, ignore_index=True)
 481        if len(all_features) == 0:
 482            all_features = None
 483        else:
 484            all_features = pd.concat(all_features, ignore_index=True)
 485
 486        return EventArray(all_info, all_metadata, all_features)
 487
 488    def to_events(
 489        self,
 490        scans: Scan | list[Scan] | None,
 491        ignore_missing_scans=True,
 492        ignore_metadata=False,
 493        ignore_features=False,
 494    ) -> list[Event]:
 495        """
 496        Get the events in the EventArray as a list of events.
 497        :param scans: the scans that the events belong to, auto-matched by slide_id.
 498        Pass None if you don't need scan metadata (set ignore_missing_scans=True).
 499        :param ignore_missing_scans: whether to create placeholder scans for events without a matching scan.
 500        :param ignore_metadata: whether to omit metadata from the events.
 501        :param ignore_features: whether to omit features from the events.
 502        :return: a list of Event objects.
 503        """
 504        if not isinstance(scans, list):
 505            scans = [] if scans is None else [scans] * len(self.info)
 506        events = []
 507        for i in range(len(self.info)):
 508            # Determine the associated scan
 509            scan = None
 510            for s in scans:
 511                if s.slide_id == self.info["slide_id"][i]:
 512                    scan = s
 513                    break
 514            if scan is None:
 515                if ignore_missing_scans:
 516                    # Create a placeholder scan if the scan is missing
 517                    scan = Scan.make_placeholder(
 518                        self.info["slide_id"][i],
 519                        self.info["tile"][i],
 520                        self.info["roi"][i],
 521                    )
 522                else:
 523                    raise ValueError(
 524                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
 525                    )
 526            # Prepare the metadata and features
 527            if ignore_metadata or self.metadata is None:
 528                metadata = None
 529            else:
 530                # This Series creation method is less efficient,
 531                # but required for preserving dtypes
 532                metadata = pd.Series(
 533                    {col: self.metadata.loc[i, col] for col in self.metadata.columns},
 534                    dtype=object,
 535                )
 536            if ignore_features or self.features is None:
 537                features = None
 538            else:
 539                features = pd.Series(
 540                    {col: self.features.loc[i, col] for col in self.features.columns},
 541                    dtype=object,
 542                )
 543            # Create the event and append it to the list
 544            events.append(
 545                Event(
 546                    scan,
 547                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
 548                    self.info["x"][i],
 549                    self.info["y"][i],
 550                    size=self.info["size"][i],
 551                    metadata=metadata,
 552                    features=features,
 553                )
 554            )
 555        return events
 556
 557    @classmethod
 558    def from_events(cls, events: list[Event]) -> Self:
 559        """
 560        Create an EventArray from a list of events.
 561        :param events: the list of events to convert.
 562        """
 563        # Return an empty array if we were passed nothing
 564        if events is None or len(events) == 0:
 565            return EventArray()
 566        # Otherwise, grab the info
 567        info = pd.DataFrame(
 568            {
 569                "slide_id": [event.scan.slide_id for event in events],
 570                "tile": [event.tile.n for event in events],
 571                "roi": [event.tile.n_roi for event in events],
 572                "x": [event.x for event in events],
 573                "y": [event.y for event in events],
 574                "size": [event.size for event in events],
 575            }
 576        )
 577        metadata_list = [event.metadata for event in events]
 578        # Iterate through and ensure that all metadata is the same shape
 579        for metadata in metadata_list:
 580            if type(metadata) != type(metadata_list[0]):
 581                raise ValueError("All metadata must be the same type.")
 582            if metadata is not None and metadata.shape != metadata_list[0].shape:
 583                raise ValueError("All metadata must be the same shape.")
 584        if metadata_list[0] is None:
 585            metadata = None
 586        else:
 587            metadata = pd.DataFrame(metadata_list)
 588        features_list = [event.features for event in events]
 589        # Iterate through and ensure that all features are the same shape
 590        for features in features_list:
 591            if type(features) != type(features_list[0]):
 592                raise ValueError("All features must be the same type.")
 593            if features is not None and features.shape != features_list[0].shape:
 594                raise ValueError("All features must be the same shape.")
 595        if features_list[0] is None:
 596            features = None
 597        else:
 598            features = pd.DataFrame(features_list)
 599        return EventArray(info=info, metadata=metadata, features=features)
 600
 601    def to_dataframe(self) -> pd.DataFrame:
 602        """
 603        Convert all the data in the EventArray to a single DataFrame.
 604        :return: a DataFrame with all the data in the EventArray.
 605        """
 606        # Make a copy of the info DataFrame and prepend "info_" to the column names
 607        output = self.info.copy()
 608        output.columns = [f"info_{col}" for col in output.columns]
 609        # Combine with the metadata and prepend "metadata_" to the column names
 610        if self.metadata is not None:
 611            metadata = self.metadata.copy()
 612            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
 613            output = pd.concat([output, metadata], axis=1)
 614        # Combine with the features and prepend "features_" to the column names
 615        if self.features is not None:
 616            features = self.features.copy()
 617            features.columns = [f"features_{col}" for col in features.columns]
 618            output = pd.concat([output, features], axis=1)
 619        return output
 620
 621    @classmethod
 622    def from_dataframe(cls, df) -> Self:
 623        """
 624        Create an EventArray from a single DataFrame, as produced by to_dataframe().
 625        :return: an EventArray holding the DataFrame's info, metadata, and features.
 626        """
 627        # Split the columns into info, metadata, and features and strip prefix
 628        info = df[[col for col in df.columns if col.startswith("info_")]].copy()
 629        info.columns = [col.replace("info_", "") for col in info.columns]
 630        if info.size == 0:
 631            info = None
 632        metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy()
 633        metadata.columns = [col.replace("metadata_", "") for col in metadata.columns]
 634        if metadata.size == 0:
 635            metadata = None
 636        features = df[[col for col in df.columns if col.startswith("features_")]].copy()
 637        features.columns = [col.replace("features_", "") for col in features.columns]
 638        if features.size == 0:
 639            features = None
 640        return cls(info=info, metadata=metadata, features=features)
 641
 642    @classmethod
 643    def from_mask(
 644        cls,
 645        mask: np.ndarray,
 646        slide_id: str,
 647        tile_n: int,
 648        n_roi: int = 0,
 649        include_cell_id: bool = True,
 650        images: list[np.ndarray] = None,
 651        image_labels: list[str] = None,
 652        properties: list[str] = None,
 653    ) -> Self:
 654        """
 655        Extract events from a mask DataFrame, including metadata and features.
 656        :param mask: the mask to extract events from.
 657        :param slide_id: the slide ID the mask is from.
 658        :param tile_n: the tile number the mask is from.
 659        :param n_roi: the ROI number the mask is from.
 660        :param include_cell_id: whether to include the cell_id, or numerical
 661        mask label, as metadata in the EventArray.
 662        :param images: the intensity images to extract features from.
 663        :param image_labels: the labels for the intensity images.
 664        :param properties: list of properties to extract in addition to the defaults.
 665        :return: EventArray corresponding to the mask labels.
 666        """
 667        if extract_mask_info is None:
 668            raise ModuleNotFoundError(
 669                "csi_images.csi_images dependencies not installed. Install csi-images "
 670                "with [imageio] option to resolve."
 671            )
 672        # Gather mask_info
 673        if images is not None and image_labels is not None:
 674            if len(images) != len(image_labels):
 675                raise ValueError("Intensity images and labels must match lengths.")
 676
 677        mask_info = extract_mask_info(mask, images, image_labels, properties)
 678
 679        if len(mask_info) == 0:
 680            return EventArray()
 681
 682        # Combine provided info and mask info
 683        info = pd.DataFrame(
 684            {
 685                "slide_id": slide_id,
 686                "tile": tile_n,
 687                "roi": n_roi,
 688                "x": mask_info["x"],
 689                "y": mask_info["y"],
 690                "size": mask_info["size"],
 691            },
 692        )
 693        # Extract a metadata column if desired
 694        if include_cell_id:
 695            metadata = pd.DataFrame({"cell_id": mask_info["id"]})
 696        else:
 697            metadata = None
 698        # If any additional properties were extracted, add them as features
 699        mask_info = mask_info.drop(columns=["id", "x", "y", "size"], errors="ignore")
 700        if len(mask_info.columns) > 0:
 701            features = mask_info
 702        else:
 703            features = None
 704        return EventArray(info, metadata, features)
 705
 706    def save_csv(self, output_path: str) -> bool:
 707        """
 708        Save the events to a CSV file, including metadata and features.
 709        :param output_path: the path of the CSV file to write.
 710        :return: True if the file exists after writing.
 711        """
 712        self.to_dataframe().to_csv(output_path, index=False)
 713        return os.path.exists(output_path)
 714
 715    @classmethod
 716    def load_csv(cls, input_path: str) -> Self:
 717        """
 718        Load the events from a CSV file, including metadata and features.
 719        :param input_path: the path of the CSV file to read.
 720        :return: an EventArray with the loaded events.
 721        """
 722        # Load the CSV file
 723        df = pd.read_csv(input_path)
 724        return cls.from_dataframe(df)
 725
 726    def save_hdf5(self, output_path: str) -> bool:
 727        """
 728        Save the events to an HDF5 file, including metadata and features.
 729        Uses the pandas-provided HDF5 functions for ease and external compatibility,
 730        though these files are slightly harder to view in HDFView or similar.
 731        :param output_path: the path of the HDF5 file to write.
 732        :return: True if the file exists after writing.
 733        """
 734        # Open the output_path as an HDF5 file
 735        with pd.HDFStore(output_path) as store:
 736            # Store the dataframes in the HDF5 file
 737            if self.info is not None:
 738                store.put("info", self.info, index=False)
 739            if self.metadata is not None:
 740                store.put("metadata", self.metadata, index=False)
 741            if self.features is not None:
 742                store.put("features", self.features, index=False)
 743        return os.path.exists(output_path)
 744
 745    @classmethod
 746    def load_hdf5(cls, input_path: str) -> Self:
 747        """
 748        Load the events from an HDF5 file, including metadata and features.
 749        :param input_path: the path of the HDF5 file to read.
 750        :return: an EventArray with the loaded events.
 751        """
 752        # Open the input_path as an HDF5 file
 753        with pd.HDFStore(input_path) as store:
 754            # Load the dataframes from the HDF5 file
 755            info = store.get("info") if "info" in store else None
 756            metadata = store.get("metadata") if "metadata" in store else None
 757            features = store.get("features") if "features" in store else None
 758        return cls(info=info, metadata=metadata, features=features)
 759
 760    def save_ocular(self, output_path: str, event_type: str = "cells"):
 761        """
 762        Save the events to an OCULAR file. Relies on the dataframe originating
 763        from an OCULAR file (same columns; duplicate metadata/info).
 764        :param output_path: the directory to write the OCULAR files into.
 765        :param event_type: "cells" or "others", selecting the output file stub.
 766        :return:
 767        """
 768        if pyreadr is None:
 769            raise ModuleNotFoundError(
 770                "pyreadr not installed. Install pyreadr directly "
 771                "or install csi-images with [rds] option to resolve."
 772            )
 773        if event_type == "cells":
 774            file_stub = "rc-final"
 775        elif event_type == "others":
 776            file_stub = "others-final"
 777        else:
 778            raise ValueError("Invalid event type. Must be cells or others.")
 779
 780        # Ensure good metadata
 781        metadata = pd.DataFrame(
 782            {
 783                "slide_id": self.info["slide_id"],
 784                "frame_id": self.info["tile"],
 785                "cell_id": (
 786                    self.metadata["cell_id"]
 787                    if "cell_id" in self.metadata.columns
 788                    else range(len(self.info))
 789                ),
 790                "cellx": self.info["x"],
 791                "celly": self.info["y"],
 792            }
 793        )
 794        if self.metadata is not None:
 795            metadata[self.metadata.columns] = self.metadata.copy()
 796
 797        # Check for the "ocular_interesting" column
 798        if event_type == "cells":
 799            if "ocular_interesting" in metadata.columns:
 800                interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool)
 801            elif "hcpc" in metadata.columns:
 802                # Interesting cells don't get an hcpc designation, leaving them as -1
 803                interesting_rows = (
 804                    metadata["hcpc"].to_numpy() == -1
 805                )  # interesting cells
 806            else:
 807                interesting_rows = []
 808            if sum(interesting_rows) > 0:
 809                # Split the metadata into interesting and regular
 810                interesting_events = self.rows(interesting_rows)
 811                interesting_df = pd.concat(
 812                    [interesting_events.features, interesting_events.metadata], axis=1
 813                )
 814                data_events = self.rows(~interesting_rows)
 815                data_df = pd.concat(
 816                    [data_events.features, data_events.metadata], axis=1
 817                )
 818                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")
 819
 820                # Drop particular columns for "interesting"
 821                interesting_df = interesting_df.drop(
 822                    [
 823                        "clust",
 824                        "hcpc",
 825                        "frame_id",
 826                        "cell_id",
 827                        "unique_id",
 828                        "ocular_interesting",
 829                    ],
 830                    axis=1,
 831                    errors="ignore",
 832                )
 833                # Save both .csv and .rds
 834                interesting_stub = os.path.join(output_path, "ocular_interesting")
 835                interesting_df.to_csv(f"{interesting_stub}.csv")
 836                # Suppress pandas FutureWarning
 837                with warnings.catch_warnings():
 838                    warnings.simplefilter(action="ignore", category=FutureWarning)
 839                    pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df)
 840            else:
 841                data_df = pd.concat([self.features, metadata], axis=1)
 842        else:
 843            # Get all data and reset_index (will copy it)
 844            data_df = pd.concat([self.features, metadata], axis=1)
 845
 846        # Split based on cluster number to conform to *-final[1-4].rds
 847        n_clusters = max(data_df["clust"]) + 1
 848        split_idx = [round(i * n_clusters / 4) for i in range(5)]
 849        for i in range(4):
 850            subset = (split_idx[i] <= data_df["clust"]) & (
 851                data_df["clust"] < split_idx[i + 1]
 852            )
 853            data_df.loc[subset, "hcpc"] = i + 1
 854            subset = data_df[subset].reset_index(drop=True)
 855            # Suppress pandas FutureWarning
 856            with warnings.catch_warnings():
 857                warnings.simplefilter(action="ignore", category=FutureWarning)
 858                pyreadr.write_rds(
 859                    os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
 860                )
 861
 862        # Create new example cell strings
 863        data_df["example_cell_id"] = (
 864            data_df["slide_id"]
 865            + " "
 866            + data_df["frame_id"].astype(str)
 867            + " "
 868            + data_df["cell_id"].astype(str)
 869            + " "
 870            + data_df["cellx"].astype(int).astype(str)
 871            + " "
 872            + data_df["celly"].astype(int).astype(str)
 873        )
 874        # Find averagable data columns
 875        if "cellcluster_id" in data_df.columns:
 876            end_idx = data_df.columns.get_loc("cellcluster_id")
 877        else:
 878            end_idx = data_df.columns.get_loc("slide_id")
 879        avg_cols = data_df.columns[:end_idx].tolist()
 880        # Group by cluster and average
 881        data_df = data_df.groupby("clust").agg(
 882            **{col: (col, "mean") for col in avg_cols},
 883            count=("clust", "size"),  # count rows in each cluster
 884            example_cells=("example_cell_id", lambda x: ",".join(x)),
 885            hcpc=("hcpc", lambda x: x.iloc[0]),
 886        )
 887        data_df = data_df.reset_index()  # Do NOT drop, index is "clust"
 888        # Create new columns
 889        metadata = pd.DataFrame(
 890            {
 891                "count": data_df["count"],
 892                "example_cells": data_df["example_cells"],
 893                "clust": data_df["clust"].astype(int),
 894                "hcpc": data_df["hcpc"].astype(int),
 895                "id": data_df["clust"].astype(int).astype(str),
 896                "cccluster": "0",  # Dummy value
 897                "ccdistance": 0.0,  # Dummy value
 898                "rownum": list(range(len(data_df))),
 899                "framegroup": 0,  # Dummy value
 900            }
 901        )
 902        # Pad the features out to 761 columns, as the OCULAR report requires
 903        additional_columns = range(len(avg_cols), 761)
 904        if len(additional_columns) > 0:
 905            padding = pd.DataFrame(
 906                np.zeros((len(data_df), len(additional_columns))),
 907                columns=[f"pad{i}" for i in additional_columns],
 908            )
 909            data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1)
 910        else:
 911            data_df = pd.concat([data_df[avg_cols], metadata], axis=1)
 912
 913        # Save the cluster data
 914        data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"))
 915        # Suppress pandas FutureWarning
 916        with warnings.catch_warnings():
 917            warnings.simplefilter(action="ignore", category=FutureWarning)
 918            pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)
 919
 920    @classmethod
 921    def load_ocular(
 922        cls,
 923        input_path: str,
 924        event_type="cells",
 925        cell_data_files=(
 926            "rc-final1.rds",
 927            "rc-final2.rds",
 928            "rc-final3.rds",
 929            "rc-final4.rds",
 930            "ocular_interesting.rds",
 931        ),
 932        others_data_files=(
 933            "others-final1.rds",
 934            "others-final2.rds",
 935            "others-final3.rds",
 936            "others-final4.rds",
 937        ),
 938        atlas_data_files=(
 939            "ocular_interesting.rds",
 940            "ocular_not_interesting.rds",
 941        ),
 942        drop_common_events=True,
 943        log=None,
 944    ) -> Self:
 945        """
 946        Load events from OCULAR output files, including metadata and features.
 947        :param input_path: an OCULAR output directory or a single .rds file.
 948        :param event_type: "cells" or "others", selecting which files to load.
 949        :param cell_data_files: file names to load when event_type is "cells".
 950        :param others_data_files: file names to load when event_type is "others".
 951        :param atlas_data_files: file names that may contain common (atlas) events.
 952        :param drop_common_events: whether to drop events classified as common cells.
 953        :param log: optional logger for progress and warnings.
 954        :return: an EventArray with the loaded events.
 955        """
 956        if pyreadr is None:
 957            raise ModuleNotFoundError(
 958                "pyreadr not installed. Install pyreadr directly "
 959                "or install csi-images with [rds] option to resolve."
 960            )
 961        # Check if the input path is a directory or a file
 962        if os.path.isfile(input_path):
 963            data_files = [os.path.basename(input_path)]
 964            input_path = os.path.dirname(input_path)
 965        elif event_type == "cells":
 966            data_files = cell_data_files
 967        elif event_type == "others":
 968            data_files = others_data_files
 969        else:
 970            raise ValueError("Invalid event type.")
 971
 972        # Load the data from the OCULAR files
 973        file_data = {}
 974        for file in data_files:
 975            file_path = os.path.join(input_path, file)
 976            if not os.path.isfile(file_path):
 977                if log is not None:
 978                    log.warning(f"{file} not found in {input_path}")
 979                continue
 980            file_data[file] = pyreadr.read_r(file_path)
 981            # Get the DataFrame associated with None (pyreadr dict quirk)
 982            file_data[file] = file_data[file][None]
 983            if len(file_data[file]) == 0:
 984                # File gets dropped from the dict
 985                file_data.pop(file)
 986                if log is not None:
 987                    log.warning(f"{file} has no cells")
 988                continue
 989
 990            if log is not None:
 991                log.debug(f"{file} has {len(file_data[file])} cells")
 992
 993            # Drop common cells if requested and in this file
 994            if (
 995                file in atlas_data_files
 996                and drop_common_events
 997                and "catalogue_classification" in file_data[file]
 998            ):
 999                common_cell_indices = (
1000                    file_data[file]["catalogue_classification"] == "common_cell"
1001                )
1002                if log is not None:
1003                    log.debug(
1004                        f"Dropping {int(pd.Series.sum(common_cell_indices))}"
1005                        f"common cells from {file}"
1006                    )
1007                file_data[file] = file_data[file][~common_cell_indices]
1008
1009            if len(file_data[file]) == 0:
1010                # File gets dropped from the dict
1011                file_data.pop(file)
1012                if log is not None:
1013                    log.warning(f"{file} has no cells after dropping common cells")
1014                continue
1015
1016            # Extract frame_id and cell_id
1017            # DAPI- events already have frame_id cell_id outside rowname
1018            if event_type == "cells" and "frame_id" not in file_data[file].columns:
1019                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
1020                # get frame_id cell_id from rownames column and split into two columns
1021                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
1022                if len(split_res.columns) != 2 and log is not None:
1023                    log.warning(
1024                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
1025                    )
1026                # then assign it back to the dataframe
1027                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
1028            # reset indexes since they can cause NaN values in concat
1029            file_data[file] = file_data[file].reset_index(drop=True)
1030
1031        # Merge the data from all files
1032        if len(file_data) == 0:
1033            return EventArray()
1034        elif len(file_data) == 1:
1035            data = next(iter(file_data.values()))
1036        else:
1037            data = pd.concat(file_data.values())
1038
1039        if log is not None:
1040            log.debug(f"Gathered a total of {len(data)} events")
1041
1042        # Others is missing the "slide_id". Insert it right before "frame_id" column
1043        if event_type == "others" and "slide_id" not in data.columns:
1044            if os.path.basename(input_path) == "ocular":
1045                slide_id = os.path.basename(os.path.dirname(input_path))
1046            else:
1047                slide_id = "UNKNOWN"
1048            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)
1049
1050        # Sort by ascending cell_id so the original event is kept when deduplicating
1051        data = data.sort_values(by=["cell_id"], ascending=True)
1052        # Filter out duplicates by x & y
1053        data = data.assign(
1054            unique_id=data["slide_id"]
1055            + "_"
1056            + data["frame_id"].astype(str)
1057            + "_"
1058            + data["cellx"].astype(int).astype(str)
1059            + "_"
1060            + data["celly"].astype(int).astype(str)
1061        )
1062        data = data.drop_duplicates(subset=["unique_id"], keep="first")
1063        # Normal unique_id is with cell_id
1064        data = data.assign(
1065            unique_id=data["slide_id"]
1066            + "_"
1067            + data["frame_id"].astype(str)
1068            + "_"
1069            + data["cell_id"].astype(str)
1070        )
1071        data = data.reset_index(drop=True)
1072        # All columns up to "slide_id" are features; drop the "slide_id"
1073        features = data.loc[:, :"slide_id"].iloc[:, :-1]
1074        data = data.loc[:, "slide_id":]
1075        # Grab the info columns
1076        info = data[["slide_id", "frame_id", "cellx", "celly"]]
1077        info.columns = ["slide_id", "tile", "x", "y"]
1078        info = info.assign(
1079            roi=0,  # OCULAR only works on 1 ROI, as far as we know
1080            size=25,  # Static, for later montaging
1081        )
1082        info = info[["slide_id", "tile", "roi", "x", "y", "size"]]
1083        # Metadata has duplicate columns for later convenience
1084        metadata = data
1085        # Certain columns tend to be problematic with mixed data formats...
1086        for col in ["TRITC", "CY5", "FITC"]:
1087            if col in metadata:
1088                labels = {
1089                    "False": False,
1090                    "True": True,
1091                    "FALSE": False,
1092                    "TRUE": True,
1093                }
1094                metadata[col] = metadata[col].map(labels).astype(bool)
1095        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
1096            if col in metadata:
1097                metadata[col] = metadata[col].fillna(-1).astype(int)
1098        return EventArray(info, metadata, features)
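
As a usage note for the listing above, the following hedged sketch builds a small EventArray by hand and round-trips it through HDF5. The column names follow `EventArray.INFO_COLUMNS`, and the file path is a placeholder:

```python
import pandas as pd
from csi_images.csi_events import EventArray

# info must have exactly the INFO_COLUMNS, in order
info = pd.DataFrame(
    {
        "slide_id": ["SLIDE01", "SLIDE01"],
        "tile": [500, 501],
        "roi": [0, 0],
        "x": [100, 250],
        "y": [200, 80],
        "size": [12, 12],
    }
)
events = EventArray(info)
events.add_metadata(pd.DataFrame({"cell_id": [0, 1]}))

events.save_hdf5("events.h5")  # placeholder path
loaded = EventArray.load_hdf5("events.h5")
print(len(loaded))  # 2
```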
class Event:
 35class Event:
 36    """
 37    A class that represents a single event in a scan, making it easy to evaluate
 38    singular events. Required metadata is exposed as attributes, and optional
 39    metadata and features are stored as DataFrames.
 40    """
 41
 42    SCAN_TO_SLIDE_TRANSFORM = {
 43        # Axioscan zero is in the top-right corner instead of top-left
 44        Scan.Type.AXIOSCAN7: np.array(
 45            [
 46                [1, 0, 75000],
 47                [0, 1, 0],
 48                [0, 0, 1],
 49            ]
 50        ),
 51        # BZScanner coordinates are a special kind of messed up:
 52        # - The slide is upside-down.
 53        # - The slide is oriented vertically, with the barcode at the bottom.
 54        # - Tiles are numbered from the top-right
 55        Scan.Type.BZSCANNER: np.array(
 56            [
 57                [0, -1, 75000],
 58                [-1, 0, 25000],
 59                [0, 0, 1],
 60            ]
 61        ),
 62    }
 63    """
 64    Homogeneous transformation matrices for converting between scanner and slide
 65    coordinates. The matrices are 3x3, with the final column representing the
 66    translation in micrometers (um). For more information, see 
 67    [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations).
 68    
 69    Transformations are nominal, and accuracy is not guaranteed; this is due to 
 70    imperfections in slides and alignment in the scanners. Units are in micrometers.
 71    """
 72
 73    def __init__(
 74        self,
 75        scan: Scan,
 76        tile: Tile,
 77        x: int,
 78        y: int,
 79        size: int = 12,  # End-to-end size in pixels
 80        metadata: pd.Series = None,
 81        features: pd.Series = None,
 82    ):
 83        self.scan = scan
 84        self.tile = tile
 85        self.x = int(x)
 86        self.y = int(y)
 87        self.size = int(size)
 88        self.metadata = metadata
 89        self.features = features
 90
 91    def __repr__(self) -> str:
 92        return f"{self.scan.slide_id}-{self.tile.n}-{self.x}-{self.y}"
 93
 94    def __eq__(self, other) -> bool:
 95        return self.__repr__() == other.__repr__()
 96
 97    def __lt__(self, other):
 98        return self.__repr__() < other.__repr__()
 99
100    def get_scan_position(self) -> tuple[float, float]:
101        """
102        Get the position of the event in the scanner's coordinate frame.
103        :return: the scan position of the event in micrometers (um).
104        """
105        # Get overall pixel position
106        pixel_x = self.x + (self.scan.tile_width_px * self.tile.x)
107        pixel_y = self.y + (self.scan.tile_height_px * self.tile.y)
108        # Convert to micrometers
109        x_um = pixel_x * self.scan.pixel_size_um
110        y_um = pixel_y * self.scan.pixel_size_um
111        # Add the scan's origin in the scanner frame
112        x_um += self.scan.roi[self.tile.n_roi].origin_x_um
113        y_um += self.scan.roi[self.tile.n_roi].origin_y_um
114        return x_um, y_um
115
116    def get_slide_position(self) -> tuple[float, float]:
117        """
118        Get the slide position of the event in micrometers (um).
119        :return: the slide position of the event.
120        """
121        # Turn scan_position into a 3x1 vector
122        scan_position = self.get_scan_position()
123        scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]])
124
125        # Multiply by the appropriate homogeneous matrix
126        if self.scan.scanner_id.startswith(self.scan.Type.AXIOSCAN7.value):
127            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.AXIOSCAN7]
128        elif self.scan.scanner_id.startswith(self.scan.Type.BZSCANNER.value):
129            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.BZSCANNER]
130        else:
131            raise ValueError(f"Scanner type {self.scan.scanner_id} not supported.")
132        slide_position = np.matmul(transform, scan_position)
133        return float(slide_position[0][0]), float(slide_position[1][0])
134
135    def crop_images(
136        self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True
137    ) -> list[np.ndarray]:
138        """
139        Get the event crops from the frame images. Called "get" because it does not
140        need to extract anything; it is very quick for extracting multiple events from
141        the same tile.
142        Use this if you're interested in many events.
143        :param images: the frame images.
144        :param crop_size: the square size of the image crop to get for this event.
145        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
146        :return: image_size x image_size crops of the event in the provided frames. If
147        the event is too close to the edge, the crop will be smaller and not centered.
148        """
149        # Convert a crop size in micrometers to pixels
150        if not in_pixels:
151            crop_size = round(crop_size / self.scan.pixel_size_um)
152        # Find the crop bounds
153        bounds = [
154            self.x - crop_size // 2,
155            self.y - crop_size // 2,
156            self.x + math.ceil(crop_size / 2),
157            self.y + math.ceil(crop_size / 2),
158        ]
159        # Determine how much the bounds violate the image size
160        displacements = [
161            max(0, -bounds[0]),
162            max(0, -bounds[1]),
163            max(0, bounds[2] - images[0].shape[1]),
164            max(0, bounds[3] - images[0].shape[0]),
165        ]
166        # Cap off the bounds
167        bounds = [
168            max(0, bounds[0]),
169            max(0, bounds[1]),
170            min(images[0].shape[1], bounds[2]),
171            min(images[0].shape[0], bounds[3]),
172        ]
173
174        # Crop the images
175        cropped_images = []
176        for image in images:
177            # Create a blank image of the right size
178            cropped_image = np.zeros((crop_size, crop_size), dtype=image.dtype)
179
180            # Insert the cropped image into the blank image, leaving a black buffer
181            # around the edges if the crop would go beyond the original image bounds
182            cropped_image[
183                displacements[1] : crop_size - displacements[3],
184                displacements[0] : crop_size - displacements[2],
185            ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]]
186            cropped_images.append(cropped_image)
187        return cropped_images
188
189    def extract_images(
190        self, crop_size: int = 100, in_pixels: bool = True
191    ) -> list[np.ndarray]:
192        """
193        Extract the images from the scan and tile, reading from the file. Called
194        "extract" because it must read and extract the images from file, which is slow.
195        Use this if you're interested in only a few events, as it is inefficient when
196        reading multiple events from the same tile.
197        :param crop_size: the square size of the image crop to get for this event.
198        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
199        :return: a list of cropped images from the scan in the order of the channels.
200        """
201        frames = Frame.get_frames(self.tile)
202        images = [frame.get_image() for frame in frames]
203        return self.crop_images(images, crop_size, in_pixels)
204
205    @classmethod
206    def extract_images_for_list(
207        cls,
208        events: list[Self],
209        crop_size: int | list[int] = None,
210        in_pixels: bool = True,
211    ) -> list[list[np.ndarray]]:
212        """
213        Get the images for a list of events, ensuring that there is no wasteful reading
214        of the same tile multiple times. This function is more efficient than calling
215        extract_images for each event.
216        :param events: the events to extract images for.
217        :param crop_size: the square size of the image crop to get for each event.
218                          Defaults to four times the size of each event.
219        :param in_pixels: whether the crop size is in pixels or micrometers.
220                          Defaults to pixels, and is ignored if crop_size is None.
221        :return: a list of lists of cropped images for each event.
222        """
223        if len(events) == 0:
224            return []
225
226        # Populate a crop size if none provided
227        if crop_size is None:
228            crop_size = [4 * event.size for event in events]
229            in_pixels = True
230        # Propagate a constant crop size
231        elif isinstance(crop_size, int):
232            crop_size = [crop_size] * len(events)
233
234        # Sort the events by tile; use a shallow copy to avoid modifying the original
235        order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
236
237        # Allocate the list to size
238        images = [None] * len(events)
239        last_tile = None
240        frame_images = None  # Holds large numpy arrays, so expensive to compare
241        # Iterate through in sorted order
242        for i in order:
243            if last_tile != events[i].tile:
244                # Gather the frame images, preserving them for the next event
245                frames = Frame.get_frames(events[i].tile)
246                frame_images = [frame.get_image() for frame in frames]
247
248                last_tile = events[i].tile
249            # Use the frame images to crop the event images
250            # Indexing by i preserves the original event order
251            images[i] = events[i].crop_images(frame_images, crop_size[i], in_pixels)
252        return images

A class that represents a single event in a scan, making it easy to evaluate singular events. Required metadata is exposed as attributes, and optional metadata and features are stored as DataFrames.

Event( scan: csi_images.csi_scans.Scan, tile: csi_images.csi_tiles.Tile, x: int, y: int, size: int = 12, metadata: pandas.core.series.Series = None, features: pandas.core.series.Series = None)
73    def __init__(
74        self,
75        scan: Scan,
76        tile: Tile,
77        x: int,
78        y: int,
79        size: int = 12,  # End-to-end size in pixels
80        metadata: pd.Series = None,
81        features: pd.Series = None,
82    ):
83        self.scan = scan
84        self.tile = tile
85        self.x = int(x)
86        self.y = int(y)
87        self.size = int(size)
88        self.metadata = metadata
89        self.features = features
SCAN_TO_SLIDE_TRANSFORM = {<Type.AXIOSCAN7: 'axioscan7'>: array([[ 1, 0, 75000], [ 0, 1, 0], [ 0, 0, 1]]), <Type.BZSCANNER: 'bzscanner'>: array([[ 0, -1, 75000], [ -1, 0, 25000], [ 0, 0, 1]])}

Homogeneous transformation matrices for converting between scanner and slide coordinates. The matrices are 3x3, with the final column representing the translation in micrometers (um). For more information, see affine transformations.

Transformations are nominal, and accuracy is not guaranteed; this is due to imperfections in slides and alignment in the scanners. Units are in micrometers.
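
A worked sketch of how these matrices are applied (the coordinates here are made up): a scan position (x, y) in micrometers becomes the homogeneous column vector (x, y, 1), which is left-multiplied by the transform, exactly as in get_slide_position below.

    import numpy as np

    # BZScanner scan -> slide transform, as in SCAN_TO_SLIDE_TRANSFORM
    transform = np.array([[0, -1, 75000],
                          [-1, 0, 25000],
                          [0, 0, 1]])
    scan_xy = np.array([[1000], [2000], [1]])  # homogeneous (x, y, 1), in um
    slide_xy = transform @ scan_xy
    print(float(slide_xy[0][0]), float(slide_xy[1][0]))  # 73000.0 24000.0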

scan
tile
x
y
size
metadata
features
def get_scan_position(self) -> tuple[float, float]:
100    def get_scan_position(self) -> tuple[float, float]:
101        """
102        Get the position of the event in the scanner's coordinate frame.
103        :return: the scan position of the event in micrometers (um).
104        """
105        # Get overall pixel position
106        pixel_x = self.x + (self.scan.tile_width_px * self.tile.x)
107        pixel_y = self.y + (self.scan.tile_height_px * self.tile.y)
108        # Convert to micrometers
109        x_um = pixel_x * self.scan.pixel_size_um
110        y_um = pixel_y * self.scan.pixel_size_um
111        # Add the scan's origin in the scanner frame
112        x_um += self.scan.roi[self.tile.n_roi].origin_x_um
113        y_um += self.scan.roi[self.tile.n_roi].origin_y_um
114        return x_um, y_um

Get the position of the event in the scanner's coordinate frame.

Returns

the scan position of the event in micrometers (um).

def get_slide_position(self) -> tuple[float, float]:
116    def get_slide_position(self) -> tuple[float, float]:
117        """
118        Get the slide position of the event in micrometers (um).
119        :return: the slide position of the event.
120        """
121        # Turn scan_position into a 3x1 vector
122        scan_position = self.get_scan_position()
123        scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]])
124
125        # Multiply by the appropriate homogeneous matrix
126        if self.scan.scanner_id.startswith(self.scan.Type.AXIOSCAN7.value):
127            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.AXIOSCAN7]
128        elif self.scan.scanner_id.startswith(self.scan.Type.BZSCANNER.value):
129            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.BZSCANNER]
130        else:
131            raise ValueError(f"Scanner type {self.scan.scanner_id} not supported.")
132        slide_position = np.matmul(transform, scan_position)
133        return float(slide_position[0][0]), float(slide_position[1][0])

Get the slide position of the event in micrometers (um).

Returns

the slide position of the event.
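
A usage sketch; Scan.make_placeholder and Tile are used here as in EventArray.to_events, but the placeholder values are illustrative only, and a real scan (with a recognized scanner_id and pixel metadata) is needed for these conversions to be meaningful.

    from csi_images.csi_scans import Scan
    from csi_images.csi_tiles import Tile
    from csi_images.csi_events import Event

    # Placeholder scan and tile for illustration; load a real scan in practice
    scan = Scan.make_placeholder("EXAMPLE_SLIDE", 0, 0)
    tile = Tile(scan, 0, 0)
    event = Event(scan, tile, x=512, y=512)

    x_um, y_um = event.get_scan_position()  # scanner coordinate frame
    # get_slide_position() applies the scanner-specific transform above and
    # raises ValueError if the scanner type is not recognized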

def crop_images( self, images: list[numpy.ndarray], crop_size: int = 100, in_pixels: bool = True) -> list[numpy.ndarray]:
135    def crop_images(
136        self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True
137    ) -> list[np.ndarray]:
138        """
139        Crop this event out of pre-loaded frame images. Does not read anything
140        from file, so it is very quick for cropping multiple events from
141        the same tile.
142        Use this if you're interested in many events.
143        :param images: the frame images.
144        :param crop_size: the square size of the image crop to get for this event.
145        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
146        :return: crop_size x crop_size crops of the event in the provided frames. If
147        the event is too close to the edge, the crop is zero-padded (black) at the edges.
148        """
149        # Convert a crop size in micrometers to pixels
150        if not in_pixels:
151            crop_size = round(crop_size / self.scan.pixel_size_um)
152        # Find the crop bounds
153        bounds = [
154            self.x - crop_size // 2,
155            self.y - crop_size // 2,
156            self.x + math.ceil(crop_size / 2),
157            self.y + math.ceil(crop_size / 2),
158        ]
159        # Determine how much the bounds violate the image size
160        displacements = [
161            max(0, -bounds[0]),
162            max(0, -bounds[1]),
163            max(0, bounds[2] - images[0].shape[1]),
164            max(0, bounds[3] - images[0].shape[0]),
165        ]
166        # Cap off the bounds
167        bounds = [
168            max(0, bounds[0]),
169            max(0, bounds[1]),
170            min(images[0].shape[1], bounds[2]),
171            min(images[0].shape[0], bounds[3]),
172        ]
173
174        # Crop the images
175        cropped_images = []
176        for image in images:
177            # Create a blank image of the right size
178            cropped_image = np.zeros((crop_size, crop_size), dtype=image.dtype)
179
180            # Insert the cropped image into the blank image, leaving a black buffer
181            # around the edges if the crop would go beyond the original image bounds
182            cropped_image[
183                displacements[1] : crop_size - displacements[3],
184                displacements[0] : crop_size - displacements[2],
185            ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]]
186            cropped_images.append(cropped_image)
187        return cropped_images

Crop this event out of pre-loaded frame images. Does not read anything from file, so it is very quick for cropping multiple events from the same tile. Use this if you're interested in many events.

Parameters
  • images: the frame images.
  • crop_size: the square size of the image crop to get for this event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns

crop_size x crop_size crops of the event in the provided frames. If the event is too close to the edge, the crop is zero-padded (black) at the edges.
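
A sketch of the intended bulk pattern: load one tile's frames once, then crop every event on that tile from the same images (tile, event_a, and event_b are assumed to exist, with both events on that tile).

    from csi_images.csi_frames import Frame

    frames = Frame.get_frames(tile)
    images = [frame.get_image() for frame in frames]
    crops_a = event_a.crop_images(images, crop_size=50)  # 50x50 pixels
    crops_b = event_b.crop_images(images, crop_size=25, in_pixels=False)  # 25 um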

def extract_images( self, crop_size: int = 100, in_pixels: bool = True) -> list[numpy.ndarray]:
189    def extract_images(
190        self, crop_size: int = 100, in_pixels: bool = True
191    ) -> list[np.ndarray]:
192        """
193        Extract the images from the scan and tile, reading from the file. Called
194        "extract" because it must read and extract the images from file, which is slow.
195        Use this if you're interested in only a few events, as it is inefficient when
196        reading multiple events from the same tile.
197        :param crop_size: the square size of the image crop to get for this event.
198        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
199        :return: a list of cropped images from the scan in the order of the channels.
200        """
201        frames = Frame.get_frames(self.tile)
202        images = [frame.get_image() for frame in frames]
203        return self.crop_images(images, crop_size, in_pixels)

Extract the images from the scan and tile, reading from the file. Called "extract" because it must read and extract the images from file, which is slow. Use this if you're interested in only a few events, as it is inefficient when reading multiple events from the same tile.

Parameters
  • crop_size: the square size of the image crop to get for this event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns

a list of cropped images from the scan in the order of the channels.
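
Continuing the sketch above for a one-off event; this reads the whole tile from file, so prefer extract_images_for_list when handling many events.

    crops = event.extract_images(crop_size=100)  # one crop per channel
    first_channel = crops[0]  # order follows the scan's channel order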

@classmethod
def extract_images_for_list( cls, events: list[typing.Self], crop_size: int | list[int] = None, in_pixels: bool = True) -> list[list[numpy.ndarray]]:
205    @classmethod
206    def extract_images_for_list(
207        cls,
208        events: list[Self],
209        crop_size: int | list[int] = None,
210        in_pixels: bool = True,
211    ) -> list[list[np.ndarray]]:
212        """
213        Get the images for a list of events, ensuring that there is no wasteful reading
214        of the same tile multiple times. This function is more efficient than calling
215        extract_images for each event.
216        :param events: the events to extract images for.
217        :param crop_size: the square size of the image crop to get for each event.
218                          Defaults to four times the size of each event.
219        :param in_pixels: whether the crop size is in pixels or micrometers.
220                          Defaults to pixels, and is ignored if crop_size is None.
221        :return: a list of lists of cropped images for each event.
222        """
223        if len(events) == 0:
224            return []
225
226        # Populate a crop size if none provided
227        if crop_size is None:
228            crop_size = [4 * event.size for event in events]
229            in_pixels = True
230        # Propagate a constant crop size
231        elif isinstance(crop_size, int):
232            crop_size = [crop_size] * len(events)
233
234        # Sort the events by tile; use a shallow copy to avoid modifying the original
235        order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
236
237        # Allocate the list to size
238        images = [None] * len(events)
239        last_tile = None
240        frame_images = None  # Holds large numpy arrays, so expensive to compare
241        # Iterate through in sorted order
242        for i in order:
243            if last_tile != events[i].tile:
244                # Gather the frame images, preserving them for the next event
245                frames = Frame.get_frames(events[i].tile)
246                frame_images = [frame.get_image() for frame in frames]
247
248                last_tile = events[i].tile
249            # Use the frame images to crop the event images
250            # Indexing by i preserves the original event order
251            images[i] = events[i].crop_images(frame_images, crop_size[i], in_pixels)
252        return images

Get the images for a list of events, ensuring that there is no wasteful reading of the same tile multiple times. This function is more efficient than calling extract_images for each event.

Parameters
  • events: the events to extract images for.
  • crop_size: the square size of the image crop to get for each event. Defaults to four times the size of each event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels, and is ignored if crop_size is None.
Returns

a list of lists of cropped images for each event.
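
A batch-extraction sketch; events is assumed to be a list of Event objects, and each tile is read from file only once regardless of event order.

    # Default crop size is four times each event's size
    crops_per_event = Event.extract_images_for_list(events)
    # Or a constant crop size in micrometers for all events
    crops_per_event = Event.extract_images_for_list(events, 40, in_pixels=False)
    first_event_crops = crops_per_event[0]  # list of per-channel crops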

class EventArray:
 255class EventArray:
 256    """
 257    A class that holds a large number of events' data, making it easy to analyze and
 258    manipulate many events at once. A more separated version of the Event class.
 259    """
 260
 261    INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y", "size"]
 262
 263    def __init__(
 264        self,
 265        info: pd.DataFrame = None,
 266        metadata: pd.DataFrame = None,
 267        features: pd.DataFrame = None,
 268    ):
 269        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y", "size"
 270        if info is not None:
 271            if list(info.columns) != self.INFO_COLUMNS:
 272                raise ValueError(
 273                    "EventArray.info must have columns 'slide_id', 'tile', 'roi', 'x', 'y', 'size'"
 274                )
 275            # Copy first to avoid modifying the original
 276            info = info.copy()
 277            # Ensure that the columns are the right types
 278            info["slide_id"] = info["slide_id"].astype(str)
 279            info["tile"] = info["tile"].astype(np.uint16)
 280            info["roi"] = info["roi"].astype(np.uint8)
 281            info["x"] = info["x"].round().astype(np.uint16)
 282            info["y"] = info["y"].round().astype(np.uint16)
 283            info["size"] = info["size"].round().astype(np.uint16)
 284        # All DataFrames must all have the same number of rows
 285        if metadata is not None and (info is None or len(info) != len(metadata)):
 286            raise ValueError(
 287                "If EventArray.metadata is not None, it should match rows with .info"
 288            )
 289        if features is not None and (info is None or len(info) != len(features)):
 290            raise ValueError(
 291                "If EventArray.features is not None, it should match rows with .info"
 292            )
 293        self.info = info
 294        self.metadata = metadata
 295        self.features = features
 296
 297    def __len__(self) -> int:
 298        # Convenience method to get the number of events
 299        if self.info is None:
 300            return 0
 301        else:
 302            return len(self.info)
 303
 304    def __eq__(self, other):
 305        is_equal = True
 306        # Parse all possibilities for info
 307        if isinstance(self.info, pd.DataFrame):
 308            if isinstance(other.info, pd.DataFrame):
 309                is_equal = self.info.equals(other.info)
 310                if not is_equal:
 311                    return False
 312            else:
 313                return False
 314        elif self.info is None:
 315            if other.info is not None:
 316                return False
 317
 318        # Parse all possibilities for metadata
 319        if isinstance(self.metadata, pd.DataFrame):
 320            if isinstance(other.metadata, pd.DataFrame):
 321                is_equal = self.metadata.equals(other.metadata)
 322                if not is_equal:
 323                    return False
 324            else:
 325                return False
 326        elif self.metadata is None:
 327            if other.metadata is not None:
 328                return False
 329
 330        # Parse all possibilities for features
 331        if isinstance(self.features, pd.DataFrame):
 332            if isinstance(other.features, pd.DataFrame):
 333                is_equal = self.features.equals(other.features)
 334                if not is_equal:
 335                    return False
 336            else:
 337                return False
 338        elif self.features is None:
 339            if other.features is not None:
 340                return False
 341
 342        return is_equal
 343
 344    def get_sort_order(self, by: str | list[str], ascending: bool | list[bool] = True):
 345        """
 346        Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.
 347        :param by: name of the column(s) to sort by.
 348        :param ascending: whether to sort in ascending order; can be a list to match by
 349        :return: the order of the indices to sort by.
 350        """
 351        columns = self.get(by)
 352        return columns.sort_values(by=by, ascending=ascending).index
 353
 354    def sort(self, by: str | list[str], ascending: bool | list[bool] = True) -> Self:
 355        """
 356        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
 357        :param by: name of the column(s) to sort by.
 358        :param ascending: whether to sort in ascending order; can be a list to match by
 359        :return: a new, sorted EventArray.
 360        """
 361        order = self.get_sort_order(by, ascending)
 362        info = self.info.loc[order].reset_index(drop=True)
 363        if self.metadata is not None:
 364            metadata = self.metadata.loc[order].reset_index(drop=True)
 365        else:
 366            metadata = None
 367        if self.features is not None:
 368            features = self.features.loc[order].reset_index(drop=True)
 369        else:
 370            features = None
 371        return EventArray(info, metadata, features)
 372
 373    def get(self, column_names: int | str | list[int] | list[str]) -> pd.DataFrame:
 374        """
 375        Get a DataFrame with the specified columns from the EventArray, by value.
 376        :param column_names: the names of the columns to get.
 377        :return: a DataFrame with the specified columns.
 378        """
 379        if isinstance(column_names, int) or isinstance(column_names, str):
 380            column_names = [column_names]
 381        columns = []
 382        for column_name in column_names:
 383            if column_name in self.info.columns:
 384                columns.append(self.info[column_name])
 385            elif self.metadata is not None and column_name in self.metadata.columns:
 386                columns.append(self.metadata[column_name])
 387            elif self.features is not None and column_name in self.features.columns:
 388                columns.append(self.features[column_name])
 389            else:
 390                raise ValueError(f"Column {column_name} not found in EventArray")
 391        return pd.concat(columns, axis=1)
 392
 393    def rows(self, rows) -> Self:
 394        """
 395        Get a subset of the EventArray rows based on a boolean or integer index, by value.
 396        :param rows: the indices to get as a 1D boolean/integer list/array/series
 397        :return: a new EventArray with the subset of events.
 398        """
 399        info = self.info.loc[rows].reset_index(drop=True)
 400        if self.metadata is not None:
 401            metadata = self.metadata.loc[rows].reset_index(drop=True)
 402        else:
 403            metadata = None
 404        if self.features is not None:
 405            features = self.features.loc[rows].reset_index(drop=True)
 406        else:
 407            features = None
 408        return EventArray(info, metadata, features)
 409
 410    def copy(self) -> Self:
 411        """
 412        Create a deep copy of the EventArray.
 413        :return: a deep copy of the EventArray.
 414        """
 415        return EventArray(
 416            info=self.info.copy(),
 417            metadata=None if self.metadata is None else self.metadata.copy(),
 418            features=None if self.features is None else self.features.copy(),
 419        )
 420
 421    def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None:
 422        """
 423        Add metadata to the EventArray. Removes the need to check if metadata is None.
 424        Overwrites any existing metadata with the same column names as the new metadata.
 425        :param new_metadata: the metadata to add.
 426        """
 427        if len(self) != len(new_metadata):
 428            raise ValueError("New metadata must match length of existing info")
 429
 430        if self.metadata is None:
 431            self.metadata = new_metadata
 432        else:
 433            if isinstance(new_metadata, pd.Series):
 434                self.metadata[new_metadata.name] = new_metadata
 435            else:
 436                # It's a DataFrame
 437                self.metadata[new_metadata.columns] = new_metadata
 438
 439    def add_features(self, new_features: pd.Series | pd.DataFrame) -> None:
 440        """
 441        Add features to the EventArray. Removes the need to check if features is None.
 442        Overwrites any existing features with the same column names as the new features.
 443        :param new_features: the features to add.
 444        """
 445        if len(self) != len(new_features):
 446            raise ValueError("New features must match length of existing info")
 447
 448        if self.features is None:
 449            self.features = new_features
 450        else:
 451            if isinstance(new_features, pd.Series):
 452                self.features[new_features.name] = new_features
 453            else:
 454                # It's a DataFrame
 455                self.features[new_features.columns] = new_features
 456
 457    @classmethod
 458    def merge(cls, events: list[Self]) -> Self:
 459        """
 460        Combine EventArrays in a list into a single EventArray.
 461        :param events: the list of EventArrays to combine.
 462        """
 463        all_info = []
 464        all_metadata = []
 465        all_features = []
 466        for event_array in events:
 467            # Skip empty EventArrays
 468            if event_array.info is not None:
 469                all_info.append(event_array.info)
 470            if event_array.metadata is not None:
 471                all_metadata.append(event_array.metadata)
 472            if event_array.features is not None:
 473                all_features.append(event_array.features)
 474        if len(all_info) == 0:
 475            return EventArray()
 476        else:
 477            all_info = pd.concat(all_info, ignore_index=True)
 478        if len(all_metadata) == 0:
 479            all_metadata = None
 480        else:
 481            all_metadata = pd.concat(all_metadata, ignore_index=True)
 482        if len(all_features) == 0:
 483            all_features = None
 484        else:
 485            all_features = pd.concat(all_features, ignore_index=True)
 486
 487        return EventArray(all_info, all_metadata, all_features)
 488
 489    def to_events(
 490        self,
 491        scans: Scan | list[Scan],
 492        ignore_missing_scans=True,
 493        ignore_metadata=False,
 494        ignore_features=False,
 495    ) -> list[Event]:
 496        """
 497        Get the events in the EventArray as a list of events.
 498        :param scans: the scans that the events belong to, auto-matched by slide_id.
 499        Events without a matching scan get placeholder scans if ignore_missing_scans.
 500        :param ignore_missing_scans: whether to create blank scans for events without scans.
 501        :param ignore_metadata: whether to skip attaching metadata to the events.
 502        :param ignore_features: whether to skip attaching features to the events.
 503        :return: a list of Event objects.
 504        """
 505        if isinstance(scans, Scan):
 506            scans = [scans] * len(self.info)
 507        events = []
 508        for i in range(len(self.info)):
 509            # Determine the associated scan
 510            scan = None
 511            for s in scans:
 512                if s.slide_id == self.info["slide_id"][i]:
 513                    scan = s
 514                    break
 515            if scan is None:
 516                if ignore_missing_scans:
 517                    # Create a placeholder scan if the scan is missing
 518                    scan = Scan.make_placeholder(
 519                        self.info["slide_id"][i],
 520                        self.info["tile"][i],
 521                        self.info["roi"][i],
 522                    )
 523                else:
 524                    raise ValueError(
 525                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
 526                    )
 527            # Prepare the metadata and features
 528            if ignore_metadata or self.metadata is None:
 529                metadata = None
 530            else:
 531                # This Series creation method is less efficient,
 532                # but required for preserving dtypes
 533                metadata = pd.Series(
 534                    {col: self.metadata.loc[i, col] for col in self.metadata.columns},
 535                    dtype=object,
 536                )
 537            if ignore_features or self.features is None:
 538                features = None
 539            else:
 540                features = pd.Series(
 541                    {col: self.features.loc[i, col] for col in self.features.columns},
 542                    dtype=object,
 543                )
 544            # Create the event and append it to the list
 545            events.append(
 546                Event(
 547                    scan,
 548                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
 549                    self.info["x"][i],
 550                    self.info["y"][i],
 551                    size=self.info["size"][i],
 552                    metadata=metadata,
 553                    features=features,
 554                )
 555            )
 556        return events
 557
 558    @classmethod
 559    def from_events(cls, events: list[Event]) -> Self:
 560        """
 561        Create an EventArray from a list of events.
 562        :param events: the list of events to combine.
 563        """
 564        # Return an empty array if we were passed nothing
 565        if events is None or len(events) == 0:
 566            return EventArray()
 567        # Otherwise, grab the info
 568        info = pd.DataFrame(
 569            {
 570                "slide_id": [event.scan.slide_id for event in events],
 571                "tile": [event.tile.n for event in events],
 572                "roi": [event.tile.n_roi for event in events],
 573                "x": [event.x for event in events],
 574                "y": [event.y for event in events],
 575                "size": [event.size for event in events],
 576            }
 577        )
 578        metadata_list = [event.metadata for event in events]
 579        # Iterate through and ensure that all metadata is the same shape
 580        for metadata in metadata_list:
 581            if type(metadata) != type(metadata_list[0]):
 582                raise ValueError("All metadata must be the same type.")
 583            if metadata is not None and metadata.shape != metadata_list[0].shape:
 584                raise ValueError("All metadata must be the same shape.")
 585        if metadata_list[0] is None:
 586            metadata = None
 587        else:
 588            metadata = pd.DataFrame(metadata_list)
 589        features_list = [event.features for event in events]
 590        # Iterate through and ensure that all features are the same shape
 591        for features in features_list:
 592            if type(features) != type(features_list[0]):
 593                raise ValueError("All features must be the same type.")
 594            if features is not None and features.shape != features_list[0].shape:
 595                raise ValueError("All features must be the same shape.")
 596        if features_list[0] is None:
 597            features = None
 598        else:
 599            features = pd.DataFrame(features_list)
 600        return EventArray(info=info, metadata=metadata, features=features)
 601
 602    def to_dataframe(self) -> pd.DataFrame:
 603        """
 604        Convert all the data in the EventArray to a single DataFrame.
 605        :return: a DataFrame with all the data in the EventArray.
 606        """
 607        # Make a copy of the info DataFrame and prepend "info_" to the column names
 608        output = self.info.copy()
 609        output.columns = [f"info_{col}" for col in output.columns]
 610        # Combine with the metadata and prepend "metadata_" to the column names
 611        if self.metadata is not None:
 612            metadata = self.metadata.copy()
 613            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
 614            output = pd.concat([output, metadata], axis=1)
 615        # Combine with the features and prepend "features_" to the column names
 616        if self.features is not None:
 617            features = self.features.copy()
 618            features.columns = [f"features_{col}" for col in features.columns]
 619            output = pd.concat([output, features], axis=1)
 620        return output
 621
 622    @classmethod
 623    def from_dataframe(cls, df) -> Self:
 624        """
 625        From a single DataFrame with prefixed columns (see to_dataframe), create an EventArray.
 626        :return: an EventArray with the data from the DataFrame.
 627        """
 628        # Split the columns into info, metadata, and features and strip prefix
 629        info = df[[col for col in df.columns if col.startswith("info_")]].copy()
 630        info.columns = [col.replace("info_", "") for col in info.columns]
 631        if info.size == 0:
 632            info = None
 633        metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy()
 634        metadata.columns = [col.replace("metadata_", "") for col in metadata.columns]
 635        if metadata.size == 0:
 636            metadata = None
 637        features = df[[col for col in df.columns if col.startswith("features_")]].copy()
 638        features.columns = [col.replace("features_", "") for col in features.columns]
 639        if features.size == 0:
 640            features = None
 641        return cls(info=info, metadata=metadata, features=features)
 642
 643    @classmethod
 644    def from_mask(
 645        cls,
 646        mask: np.ndarray,
 647        slide_id: str,
 648        tile_n: int,
 649        n_roi: int = 0,
 650        include_cell_id: bool = True,
 651        images: list[np.ndarray] = None,
 652        image_labels: list[str] = None,
 653        properties: list[str] = None,
 654    ) -> Self:
 655        """
 656        Extract events from a mask DataFrame, including metadata and features.
 657        :param mask: the mask to extract events from.
 658        :param slide_id: the slide ID the mask is from.
 659        :param tile_n: the tile number the mask is from.
 660        :param n_roi: the ROI number the mask is from.
 661        :param include_cell_id: whether to include the cell_id, or numerical
 662        mask label, as metadata in the EventArray.
 663        :param images: the intensity images to extract features from.
 664        :param image_labels: the labels for the intensity images.
 665        :param properties: a list of properties to extract in addition to the defaults.
 666        :return: EventArray corresponding to the mask labels.
 667        """
 668        if extract_mask_info is None:
 669            raise ModuleNotFoundError(
 670                "csi_images.csi_images dependencies not installed. Install csi-images "
 671                "with [imageio] option to resolve."
 672            )
 673        # Gather mask_info
 674        if images is not None and image_labels is not None:
 675            if len(images) != len(image_labels):
 676                raise ValueError("Intensity images and labels must match lengths.")
 677
 678        mask_info = extract_mask_info(mask, images, image_labels, properties)
 679
 680        if len(mask_info) == 0:
 681            return EventArray()
 682
 683        # Combine provided info and mask info
 684        info = pd.DataFrame(
 685            {
 686                "slide_id": slide_id,
 687                "tile": tile_n,
 688                "roi": n_roi,
 689                "x": mask_info["x"],
 690                "y": mask_info["y"],
 691                "size": mask_info["size"],
 692            },
 693        )
 694        # Extract a metadata column if desired
 695        if include_cell_id:
 696            metadata = pd.DataFrame({"cell_id": mask_info["id"]})
 697        else:
 698            metadata = None
 699        # If any additional properties were extracted, add them as features
 700        mask_info = mask_info.drop(columns=["id", "x", "y", "size"], errors="ignore")
 701        if len(mask_info.columns) > 0:
 702            features = mask_info
 703        else:
 704            features = None
 705        return EventArray(info, metadata, features)
 706
 707    def save_csv(self, output_path: str) -> bool:
 708        """
 709        Save the events to a CSV file, including metadata and features.
 710        :param output_path: the file path to save the CSV to.
 711        :return: whether the file exists after saving.
 712        """
 713        self.to_dataframe().to_csv(output_path, index=False)
 714        return os.path.exists(output_path)
 715
 716    @classmethod
 717    def load_csv(cls, input_path: str) -> Self:
 718        """
 719        Load the events from a CSV file, including metadata and features.
 720        :param input_path: the file path of the CSV to load.
 721        :return: an EventArray with the loaded events.
 722        """
 723        # Load the CSV file
 724        df = pd.read_csv(input_path)
 725        return cls.from_dataframe(df)
 726
 727    def save_hdf5(self, output_path: str) -> bool:
 728        """
 729        Save the events to an HDF5 file, including metadata and features.
 730        Uses the pandas-provided HDF5 functions for ease, and external compatibility,
 731        though these files are slightly harder to view in HDFView or similar.
 732        :param output_path: the file path to save the HDF5 file to.
 733        :return: whether the file exists after saving.
 734        """
 735        # Open the output_path as an HDF5 file
 736        with pd.HDFStore(output_path) as store:
 737            # Store the dataframes in the HDF5 file
 738            if self.info is not None:
 739                store.put("info", self.info, index=False)
 740            if self.metadata is not None:
 741                store.put("metadata", self.metadata, index=False)
 742            if self.features is not None:
 743                store.put("features", self.features, index=False)
 744        return os.path.exists(output_path)
 745
 746    @classmethod
 747    def load_hdf5(cls, input_path: str) -> Self:
 748        """
 749        Load the events from an HDF5 file, including metadata and features.
 750        :param input_path: the file path of the HDF5 file to load.
 751        :return: an EventArray with the loaded events.
 752        """
 753        # Open the input_path as an HDF5 file
 754        with pd.HDFStore(input_path) as store:
 755            # Load the dataframes from the HDF5 file
 756            info = store.get("info") if "info" in store else None
 757            metadata = store.get("metadata") if "metadata" in store else None
 758            features = store.get("features") if "features" in store else None
 759        return cls(info=info, metadata=metadata, features=features)
 760
 761    def save_ocular(self, output_path: str, event_type: str = "cells"):
 762        """
 763        Save the events to an OCULAR file. Relies on the dataframe originating
 764        from an OCULAR file (same columns; duplicate metadata/info).
 765        :param output_path: the directory to save the OCULAR files into.
 766        :param event_type: "cells" or "others"; determines the output file names.
 767        :return: None; files are written into output_path.
 768        """
 769        if pyreadr is None:
 770            raise ModuleNotFoundError(
 771                "pyreadr not installed. Install pyreadr directly "
 772                "or install csi-images with [rds] option to resolve."
 773            )
 774        if event_type == "cells":
 775            file_stub = "rc-final"
 776        elif event_type == "others":
 777            file_stub = "others-final"
 778        else:
 779            raise ValueError("Invalid event type. Must be cells or others.")
 780
 781        # Ensure good metadata
 782        metadata = pd.DataFrame(
 783            {
 784                "slide_id": self.info["slide_id"],
 785                "frame_id": self.info["tile"],
 786                "cell_id": (
 787                    self.metadata["cell_id"]
 788                    if "cell_id" in self.metadata.columns
 789                    else range(len(self.info))
 790                ),
 791                "cellx": self.info["x"],
 792                "celly": self.info["y"],
 793            }
 794        )
 795        if self.metadata is not None:
 796            metadata[self.metadata.columns] = self.metadata.copy()
 797
 798        # Check for the "ocular_interesting" column
 799        if event_type == "cells":
 800            if "ocular_interesting" in metadata.columns:
 801                interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool)
 802            elif "hcpc" in metadata.columns:
 803                # Interesting cells don't get an hcpc designation, leaving them as -1
 804                interesting_rows = (
 805                    metadata["hcpc"].to_numpy() == -1
 806                )  # interesting cells
 807            else:
 808                interesting_rows = []
 809            if sum(interesting_rows) > 0:
 810                # Split the metadata into interesting and regular
 811                interesting_events = self.rows(interesting_rows)
 812                interesting_df = pd.concat(
 813                    [interesting_events.features, interesting_events.metadata], axis=1
 814                )
 815                data_events = self.rows(~interesting_rows)
 816                data_df = pd.concat(
 817                    [data_events.features, data_events.metadata], axis=1
 818                )
 819                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")
 820
 821                # Drop particular columns for "interesting"
 822                interesting_df = interesting_df.drop(
 823                    [
 824                        "clust",
 825                        "hcpc",
 826                        "frame_id",
 827                        "cell_id",
 828                        "unique_id",
 829                        "ocular_interesting",
 830                    ],
 831                    axis=1,
 832                    errors="ignore",
 833                )
 834                # Save both .csv and .rds
 835                interesting_stub = os.path.join(output_path, "ocular_interesting")
 836                interesting_df.to_csv(f"{interesting_stub}.csv")
 837                # Suppress pandas FutureWarning
 838                with warnings.catch_warnings():
 839                    warnings.simplefilter(action="ignore", category=FutureWarning)
 840                    pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df)
 841            else:
 842                data_df = pd.concat([self.features, metadata], axis=1)
 843        else:
 844            # Get all data and reset_index (will copy it)
 845            data_df = pd.concat([self.features, metadata], axis=1)
 846
 847        # Split based on cluster number to conform to *-final[1-4].rds
 848        n_clusters = max(data_df["clust"]) + 1
 849        split_idx = [round(i * n_clusters / 4) for i in range(5)]
 850        for i in range(4):
 851            subset = (split_idx[i] <= data_df["clust"]) & (
 852                data_df["clust"] < split_idx[i + 1]
 853            )
 854            data_df.loc[subset, "hcpc"] = i + 1
 855            subset = data_df[subset].reset_index(drop=True)
 856            # Suppress pandas FutureWarning
 857            with warnings.catch_warnings():
 858                warnings.simplefilter(action="ignore", category=FutureWarning)
 859                pyreadr.write_rds(
 860                    os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
 861                )
 862
 863        # Create new example cell strings
 864        data_df["example_cell_id"] = (
 865            data_df["slide_id"]
 866            + " "
 867            + data_df["frame_id"].astype(str)
 868            + " "
 869            + data_df["cell_id"].astype(str)
 870            + " "
 871            + data_df["cellx"].astype(int).astype(str)
 872            + " "
 873            + data_df["celly"].astype(int).astype(str)
 874        )
 875        # Find averagable data columns
 876        if "cellcluster_id" in data_df.columns:
 877            end_idx = data_df.columns.get_loc("cellcluster_id")
 878        else:
 879            end_idx = data_df.columns.get_loc("slide_id")
 880        avg_cols = data_df.columns[:end_idx].tolist()
 881        # Group by cluster and average
 882        data_df = data_df.groupby("clust").agg(
 883            **{col: (col, "mean") for col in avg_cols},
 884            count=("clust", "size"),  # count rows in each cluster
 885            example_cells=("example_cell_id", lambda x: ",".join(x)),
 886            hcpc=("hcpc", lambda x: x.iloc[0]),
 887        )
 888        data_df = data_df.reset_index()  # Do NOT drop, index is "clust"
 889        # Create new columns
 890        metadata = pd.DataFrame(
 891            {
 892                "count": data_df["count"],
 893                "example_cells": data_df["example_cells"],
 894                "clust": data_df["clust"].astype(int),
 895                "hcpc": data_df["hcpc"].astype(int),
 896                "id": data_df["clust"].astype(int).astype(str),
 897                "cccluster": "0",  # Dummy value
 898                "ccdistance": 0.0,  # Dummy value
 899                "rownum": list(range(len(data_df))),
 900                "framegroup": 0,  # Dummy value
 901            }
 902        )
 903        # Pad the features to 761 columns, as required by OCULAR reports
 904        additional_columns = range(len(avg_cols), 761)
 905        if len(additional_columns) > 0:
 906            padding = pd.DataFrame(
 907                np.zeros((len(data_df), len(additional_columns))),
 908                columns=[f"pad{i}" for i in additional_columns],
 909            )
 910            data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1)
 911        else:
 912            data_df = pd.concat([data_df[avg_cols], metadata], axis=1)
 913
 914        # Save the cluster data
 915        data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"))
 916        # Suppress pandas FutureWarning
 917        with warnings.catch_warnings():
 918            warnings.simplefilter(action="ignore", category=FutureWarning)
 919            pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)
 920
 921    @classmethod
 922    def load_ocular(
 923        cls,
 924        input_path: str,
 925        event_type="cells",
 926        cell_data_files=(
 927            "rc-final1.rds",
 928            "rc-final2.rds",
 929            "rc-final3.rds",
 930            "rc-final4.rds",
 931            "ocular_interesting.rds",
 932        ),
 933        others_data_files=(
 934            "others-final1.rds",
 935            "others-final2.rds",
 936            "others-final3.rds",
 937            "others-final4.rds",
 938        ),
 939        atlas_data_files=(
 940            "ocular_interesting.rds",
 941            "ocular_not_interesting.rds",
 942        ),
 943        drop_common_events=True,
 944        log=None,
 945    ) -> Self:
 946        """
 947        Load events from OCULAR output files (.rds), including metadata and features.
 948        :param input_path: the OCULAR directory, or a single .rds file within it.
 949        :param event_type: "cells" or "others", selecting which files to load.
 950        :param cell_data_files: file names to load when event_type is "cells".
 951        :param others_data_files: file names to load when event_type is "others".
 952        :param atlas_data_files: files that may contain atlas-classified common events.
 953        :param drop_common_events: whether to drop events classified as common cells.
 954        :param log: an optional logger for progress and warnings.
 955        :return: an EventArray with the loaded events.
 956        """
 957        if pyreadr is None:
 958            raise ModuleNotFoundError(
 959                "pyreadr not installed. Install pyreadr directly "
 960                "or install csi-images with [rds] option to resolve."
 961            )
 962        # Check if the input path is a directory or a file
 963        if os.path.isfile(input_path):
 964            data_files = [os.path.basename(input_path)]
 965            input_path = os.path.dirname(input_path)
 966        elif event_type == "cells":
 967            data_files = cell_data_files
 968        elif event_type == "others":
 969            data_files = others_data_files
 970        else:
 971            raise ValueError("Invalid event type.")
 972
 973        # Load the data from the OCULAR files
 974        file_data = {}
 975        for file in data_files:
 976            file_path = os.path.join(input_path, file)
 977            if not os.path.isfile(file_path):
 978                if log is not None:
 979                    log.warning(f"{file} not found in {input_path}")
 980                continue
 981            file_data[file] = pyreadr.read_r(file_path)
 982            # Get the DataFrame associated with None (pyreadr dict quirk)
 983            file_data[file] = file_data[file][None]
 984            if len(file_data[file]) == 0:
 985                # File gets dropped from the dict
 986                file_data.pop(file)
 987                if log is not None:
 988                    log.warning(f"{file} has no cells")
 989                continue
 990
 991            if log is not None:
 992                log.debug(f"{file} has {len(file_data[file])} cells")
 993
 994            # Drop common cells if requested and in this file
 995            if (
 996                file in atlas_data_files
 997                and drop_common_events
 998                and "catalogue_classification" in file_data[file]
 999            ):
1000                common_cell_indices = (
1001                    file_data[file]["catalogue_classification"] == "common_cell"
1002                )
1003                if log is not None:
1004                    log.debug(
1005                        f"Dropping {int(common_cell_indices.sum())} "
1006                        f"common cells from {file}"
1007                    )
1008                file_data[file] = file_data[file][~common_cell_indices]
1009
1010            if len(file_data[file]) == 0:
1011                # File gets dropped from the dict
1012                file_data.pop(file)
1013                if log is not None:
1014                    log.warning(f"{file} has no cells after dropping common cells")
1015                continue
1016
1017            # Extract frame_id and cell_id
1018            # DAPI- events already have frame_id cell_id outside rowname
1019            if event_type == "cells" and "frame_id" not in file_data[file].columns:
1020                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
1021                # get frame_id cell_id from rownames column and split into two columns
1022                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
1023                if len(split_res.columns) != 2:
1024                    if log is not None:
1025                        log.warning(f'Expected "frame_id cell_id" but got '
1026                                    f'{file_data[file]["rowname"]}')
1027                # then assign it back to the dataframe
1028                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
1029            # reset indexes since they can cause NaN values in concat
1030            file_data[file] = file_data[file].reset_index(drop=True)
1031
1032        # Merge the data from all files
1033        if len(file_data) == 0:
1034            return EventArray()
1035        elif len(file_data) == 1:
1036            data = next(iter(file_data.values()))
1037        else:
1038            data = pd.concat(file_data.values())
1039
1040        if log is not None:
1041            log.debug(f"Gathered a total of {len(data)} events")
1042
1043        # Others is missing the "slide_id". Insert it right before "frame_id" column
1044        if event_type == "others" and "slide_id" not in data.columns:
1045            if os.path.basename(input_path) == "ocular":
1046                slide_id = os.path.basename(os.path.dirname(input_path))
1047            else:
1048                slide_id = "UNKNOWN"
1049            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)
1050
1051        # Sort ascending by cell_id so the earliest event is kept when de-duplicating
1052        data = data.sort_values(by=["cell_id"], ascending=True)
1053        # Filter out duplicates by x & y
1054        data = data.assign(
1055            unique_id=data["slide_id"]
1056            + "_"
1057            + data["frame_id"].astype(str)
1058            + "_"
1059            + data["cellx"].astype(int).astype(str)
1060            + "_"
1061            + data["celly"].astype(int).astype(str)
1062        )
1063        data = data.drop_duplicates(subset=["unique_id"], keep="first")
1064        # Normal unique_id is with cell_id
1065        data = data.assign(
1066            unique_id=data["slide_id"]
1067            + "_"
1068            + data["frame_id"].astype(str)
1069            + "_"
1070            + data["cell_id"].astype(str)
1071        )
1072        data = data.reset_index(drop=True)
1073        # All columns up to "slide_id" are features; drop the "slide_id"
1074        features = data.loc[:, :"slide_id"].iloc[:, :-1]
1075        data = data.loc[:, "slide_id":]
1076        # Grab the info columns
1077        info = data[["slide_id", "frame_id", "cellx", "celly"]]
1078        info.columns = ["slide_id", "tile", "x", "y"]
1079        info = info.assign(
1080            roi=0,  # OCULAR only works on 1 ROI, as far as known
1081            size=25,  # Static, for later montaging
1082        )
1083        info = info[["slide_id", "tile", "roi", "x", "y", "size"]]
1084        # Metadata has duplicate columns for later convenience
1085        metadata = data
1086        # Certain columns tend to be problematic with mixed data formats...
1087        for col in ["TRITC", "CY5", "FITC"]:
1088            if col in metadata:
1089                labels = {
1090                    "False": False,
1091                    "True": True,
1092                    "FALSE": False,
1093                    "TRUE": True,
1094                }
1095                metadata[col] = metadata[col].map(labels).astype(bool)
1096        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
1097            if col in metadata:
1098                metadata[col] = metadata[col].fillna(-1).astype(int)
1099        return EventArray(info, metadata, features)

A class that holds a large number of events' data, making it easy to analyze and manipulate many events at once. A more separated version of the Event class.

EventArray( info: pandas.core.frame.DataFrame = None, metadata: pandas.core.frame.DataFrame = None, features: pandas.core.frame.DataFrame = None)
263    def __init__(
264        self,
265        info: pd.DataFrame = None,
266        metadata: pd.DataFrame = None,
267        features: pd.DataFrame = None,
268    ):
269        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y", "size"
270        if info is not None:
271            if list(info.columns) != self.INFO_COLUMNS:
272                raise ValueError(
273                    "EventArray.info must have columns 'slide_id', 'tile', 'roi', 'x', 'y', 'size'"
274                )
275            # Copy first to avoid modifying the original
276            info = info.copy()
277            # Ensure that the columns are the right types
278            info["slide_id"] = info["slide_id"].astype(str)
279            info["tile"] = info["tile"].astype(np.uint16)
280            info["roi"] = info["roi"].astype(np.uint8)
281            info["x"] = info["x"].round().astype(np.uint16)
282            info["y"] = info["y"].round().astype(np.uint16)
283            info["size"] = info["size"].round().astype(np.uint16)
284        # All DataFrames must all have the same number of rows
285        if metadata is not None and (info is None or len(info) != len(metadata)):
286            raise ValueError(
287                "If EventArray.metadata is not None, it should match rows with .info"
288            )
289        if features is not None and (info is None or len(info) != len(features)):
290            raise ValueError(
291                "If EventArray.features is not None, it should match rows with .info"
292            )
293        self.info = info
294        self.metadata = metadata
295        self.features = features
INFO_COLUMNS = ['slide_id', 'tile', 'roi', 'x', 'y', 'size']
info
metadata
features
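
A construction sketch; the info columns must appear in exactly the INFO_COLUMNS order, and metadata/features stay None until added.

    import pandas as pd
    from csi_images.csi_events import EventArray

    info = pd.DataFrame(
        {
            "slide_id": ["EXAMPLE_SLIDE", "EXAMPLE_SLIDE"],
            "tile": [0, 1],
            "roi": [0, 0],
            "x": [100, 200],
            "y": [150, 250],
            "size": [12, 12],
        }
    )
    events = EventArray(info)
    len(events)  # 2
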
def get_sort_order(self, by: str | list[str], ascending: bool | list[bool] = True):
344    def get_sort_order(self, by: str | list[str], ascending: bool | list[bool] = True):
345        """
346        Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.
347        :param by: name of the column(s) to sort by.
348        :param ascending: whether to sort in ascending order; can be a list to match by
349        :return: the order of the indices to sort by.
350        """
351        columns = self.get(by)
352        return columns.sort_values(by=by, ascending=ascending).index

Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.

Parameters
  • by: name of the column(s) to sort by.
  • ascending: whether to sort in ascending order; can be a list to match by
Returns

the order of the indices to sort by.

def sort(self, by: str | list[str], ascending: bool | list[bool] = True) -> Self:
354    def sort(self, by: str | list[str], ascending: bool | list[bool] = True) -> Self:
355        """
356        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
357        :param by: name of the column(s) to sort by.
358        :param ascending: whether to sort in ascending order; can be a list to match by
359        :return: a new, sorted EventArray.
360        """
361        order = self.get_sort_order(by, ascending)
362        info = self.info.loc[order].reset_index(drop=True)
363        if self.metadata is not None:
364            metadata = self.metadata.loc[order].reset_index(drop=True)
365        else:
366            metadata = None
367        if self.features is not None:
368            features = self.features.loc[order].reset_index(drop=True)
369        else:
370            features = None
371        return EventArray(info, metadata, features)

Sort the EventArray by column(s) in the info, metadata, or features DataFrames.

Parameters
  • by: name of the column(s) to sort by.
  • ascending: whether to sort in ascending order; can be a list to match by
Returns

a new, sorted EventArray.
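
For example, to order events by tile and then by descending size, assuming `events` is an EventArray like the one sketched above:

    # get_sort_order returns only the index order; sort applies it and resets indices
    ordered = events.sort(["tile", "size"], ascending=[True, False])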

def get( self, column_names: int | str | list[int] | list[str]) -> pandas.core.frame.DataFrame:
373    def get(self, column_names: int | str | list[int] | list[str]) -> pd.DataFrame:
374        """
375        Get a DataFrame with the specified columns from the EventArray, by value.
376        :param column_names: the names of the columns to get.
377        :return: a DataFrame with the specified columns.
378        """
379        if isinstance(column_names, int) or isinstance(column_names, str):
380            column_names = [column_names]
381        columns = []
382        for column_name in column_names:
383            if column_name in self.info.columns:
384                columns.append(self.info[column_name])
385            elif self.metadata is not None and column_name in self.metadata.columns:
386                columns.append(self.metadata[column_name])
387            elif self.features is not None and column_name in self.features.columns:
388                columns.append(self.features[column_name])
389            else:
390                raise ValueError(f"Column {column_name} not found in EventArray")
391        return pd.concat(columns, axis=1)

Get a DataFrame with the specified columns from the EventArray, by value.

Parameters
  • column_names: the names of the columns to get.
Returns

a DataFrame with the specified columns.
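
A short sketch; here "dapi_mean" stands in for any hypothetical features column:

    coords = events.get(["x", "y"])            # both resolved from .info
    mixed = events.get(["tile", "dapi_mean"])  # names may span info, metadata, features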

def rows(self, rows) -> Self:
393    def rows(self, rows) -> Self:
394        """
395        Get a subset of the EventArray rows based on a boolean or integer index, by value.
396        :param rows: the indices to get as a 1D boolean/integer list/array/series
397        :return: a new EventArray with the subset of events.
398        """
399        info = self.info.loc[rows].reset_index(drop=True)
400        if self.metadata is not None:
401            metadata = self.metadata.loc[rows].reset_index(drop=True)
402        else:
403            metadata = None
404        if self.features is not None:
405            features = self.features.loc[rows].reset_index(drop=True)
406        else:
407            features = None
408        return EventArray(info, metadata, features)

Get a subset of the EventArray rows based on a boolean or integer index, by value.

Parameters
  • rows: the indices to get as a 1D boolean/integer list/array/series
Returns

a new EventArray with the subset of events.
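
For example, filtering by a column with a boolean mask, or picking rows by position (continuing the hypothetical `events`):

    large = events.rows(events.get("size")["size"] > 10)  # boolean mask
    first_two = events.rows([0, 1])                       # integer indices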

def copy(self) -> Self:
410    def copy(self) -> Self:
411        """
412        Create a deep copy of the EventArray.
413        :return: a deep copy of the EventArray.
414        """
415        return EventArray(
416            info=self.info.copy(),
417            metadata=None if self.metadata is None else self.metadata.copy(),
418            features=None if self.features is None else self.features.copy(),
419        )

Create a deep copy of the EventArray.

Returns

a deep copy of the EventArray.

def add_metadata( self, new_metadata: pandas.core.series.Series | pandas.core.frame.DataFrame) -> None:
421    def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None:
422        """
423        Add metadata to the EventArray. Removes the need to check if metadata is None.
424        Overwrites any existing metadata with the same column names as the new metadata.
425        :param new_metadata: the metadata to add.
426        """
427        if len(self) != len(new_metadata):
428            raise ValueError("New metadata must match length of existing info")
429
430        if self.metadata is None:
431            self.metadata = pd.DataFrame(new_metadata)  # ensure a DataFrame, even for a Series
432        else:
433            if isinstance(new_metadata, pd.Series):
434                self.metadata[new_metadata.name] = new_metadata
435            else:
436                # It's a DataFrame
437                self.metadata[new_metadata.columns] = new_metadata

Add metadata to the EventArray. Removes the need to check if metadata is None. Overwrites any existing metadata with the same column names as the new metadata.

Parameters
  • new_metadata: the metadata to add.
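
A sketch with a hypothetical classification column; for a Series, its name becomes the metadata column name:

    import pandas as pd

    labels = pd.Series(["cell", "debris", "cell"], name="label")
    events.add_metadata(labels)  # creates .metadata if None, else adds/overwrites "label"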
def add_features( self, new_features: pandas.core.series.Series | pandas.core.frame.DataFrame) -> None:
439    def add_features(self, new_features: pd.Series | pd.DataFrame) -> None:
440        """
441        Add features to the EventArray. Removes the need to check if features is None.
442        Overwrites any existing features with the same column names as the new features.
443        :param new_features: the features to add.
444        """
445        if len(self) != len(new_features):
446            raise ValueError("New features must match length of existing info")
447
448        if self.features is None:
449            self.features = pd.DataFrame(new_features)  # ensure a DataFrame, even for a Series
450        else:
451            if isinstance(new_features, pd.Series):
452                self.features[new_features.name] = new_features
453            else:
454                # It's a DataFrame
455                self.features[new_features.columns] = new_features

Add features to the EventArray. Removes the need to check if features is None. Overwrites any existing features with the same column names as the new features.

Parameters
  • new_features: the features to add.
@classmethod
def merge(cls, events: list[typing.Self]) -> Self:
457    @classmethod
458    def merge(cls, events: list[Self]) -> Self:
459        """
460        Combine EventArrays in a list into a single EventArray.
461        :param events: the list of EventArrays to combine.
462        """
463        all_info = []
464        all_metadata = []
465        all_features = []
466        for event_array in events:
467            # Skip empty EventArrays
468            if event_array.info is not None:
469                all_info.append(event_array.info)
470            if event_array.metadata is not None:
471                all_metadata.append(event_array.metadata)
472            if event_array.features is not None:
473                all_features.append(event_array.features)
474        if len(all_info) == 0:
475            return EventArray()
476        else:
477            all_info = pd.concat(all_info, ignore_index=True)
478        if len(all_metadata) == 0:
479            all_metadata = None
480        else:
481            all_metadata = pd.concat(all_metadata, ignore_index=True)
482        if len(all_features) == 0:
483            all_features = None
484        else:
485            all_features = pd.concat(all_features, ignore_index=True)
486
487        return EventArray(all_info, all_metadata, all_features)

Combine EventArrays in a list into a single EventArray.

Parameters
  • events: the list of EventArrays to combine.
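
For example, combining per-tile results into a single array, where `tile_arrays` is a hypothetical list of EventArrays:

    combined = EventArray.merge(tile_arrays)  # rows follow list order; indices reset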
def to_events( self, scans: csi_images.csi_scans.Scan | list[csi_images.csi_scans.Scan], ignore_missing_scans=True, ignore_metadata=False, ignore_features=False) -> list[Event]:
489    def to_events(
490        self,
491        scans: Scan | list[Scan],
492        ignore_missing_scans=True,
493        ignore_metadata=False,
494        ignore_features=False,
495    ) -> list[Event]:
496        """
497        Get the events in the EventArray as a list of events.
498        :param scans: the scans that the events belong to, auto-matched by slide_id.
499        Pass None if you don't care about scan metadata (and leave ignore_missing_scans=True).
500        :param ignore_missing_scans: whether to create placeholder scans for events without scans.
501        :param ignore_metadata: whether to skip attaching metadata.
502        :param ignore_features: whether to skip attaching features.
503        :return: a list of Event objects.
504        """
505        if scans is None or isinstance(scans, Scan):
506            scans = [] if scans is None else [scans] * len(self.info)
507        events = []
508        for i in range(len(self.info)):
509            # Determine the associated scan
510            scan = None
511            for s in scans:
512                if s.slide_id == self.info["slide_id"][i]:
513                    scan = s
514                    break
515            if scan is None:
516                if ignore_missing_scans:
517                    # Create a placeholder scan if the scan is missing
518                    scan = Scan.make_placeholder(
519                        self.info["slide_id"][i],
520                        self.info["tile"][i],
521                        self.info["roi"][i],
522                    )
523                else:
524                    raise ValueError(
525                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
526                    )
527            # Prepare the metadata and features
528            if ignore_metadata or self.metadata is None:
529                metadata = None
530            else:
531                # This Series creation method is less efficient,
532                # but required for preserving dtypes
533                metadata = pd.Series(
534                    {col: self.metadata.loc[i, col] for col in self.metadata.columns},
535                    dtype=object,
536                )
537            if ignore_features or self.features is None:
538                features = None
539            else:
540                features = pd.Series(
541                    {col: self.features.loc[i, col] for col in self.features.columns},
542                    dtype=object,
543                )
544            # Create the event and append it to the list
545            events.append(
546                Event(
547                    scan,
548                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
549                    self.info["x"][i],
550                    self.info["y"][i],
551                    size=self.info["size"][i],
552                    metadata=metadata,
553                    features=features,
554                )
555            )
556        return events

Get the events in the EventArray as a list of events.

Parameters
  • scans: the scans that the events belong to, auto-matched by slide_id. Pass None if you don't care about scan metadata (and leave ignore_missing_scans=True).
  • ignore_missing_scans: whether to create placeholder scans for events without scans.
  • ignore_metadata: whether to skip attaching metadata.
  • ignore_features: whether to skip attaching features.
Returns

a list of Event objects.
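
A sketch of the round trip with from_events() below, where `scan` is a hypothetical Scan whose slide_id matches the events:

    event_list = events.to_events(scan)
    round_tripped = EventArray.from_events(event_list)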
@classmethod
def from_events(cls, events: list[Event]) -> Self:
558    @classmethod
559    def from_events(cls, events: list[Event]) -> Self:
560        """
561        Create an EventArray from a list of Event objects.
562        :param events: the list of events to convert.
563        """
564        # Return an empty array if we were passed nothing
565        if events is None or len(events) == 0:
566            return EventArray()
567        # Otherwise, grab the info
568        info = pd.DataFrame(
569            {
570                "slide_id": [event.scan.slide_id for event in events],
571                "tile": [event.tile.n for event in events],
572                "roi": [event.tile.n_roi for event in events],
573                "x": [event.x for event in events],
574                "y": [event.y for event in events],
575                "size": [event.size for event in events],
576            }
577        )
578        metadata_list = [event.metadata for event in events]
579        # Iterate through and ensure that all metadata is the same shape
580        for metadata in metadata_list:
581            if type(metadata) != type(metadata_list[0]):
582                raise ValueError("All metadata must be the same type.")
583            if metadata is not None and metadata.shape != metadata_list[0].shape:
584                raise ValueError("All metadata must be the same shape.")
585        if metadata_list[0] is None:
586            metadata = None
587        else:
588            metadata = pd.DataFrame(metadata_list)
589        features_list = [event.features for event in events]
590        # Iterate through and ensure that all features are the same shape
591        for features in features_list:
592            if type(features) != type(features_list[0]):
593                raise ValueError("All features must be the same type.")
594            if features is not None and features.shape != features_list[0].shape:
595                raise ValueError("All features must be the same shape.")
596        if features_list[0] is None:
597            features = None
598        else:
599            features = pd.DataFrame(features_list)
600        return EventArray(info=info, metadata=metadata, features=features)

Create an EventArray from a list of Event objects.

Parameters
  • events: the list of events to convert.
def to_dataframe(self) -> pandas.core.frame.DataFrame:
602    def to_dataframe(self) -> pd.DataFrame:
603        """
604        Convert all the data in the EventArray to a single DataFrame.
605        :return: a DataFrame with all the data in the EventArray.
606        """
607        # Make a copy of the info DataFrame and prepend "info_" to the column names
608        output = self.info.copy()
609        output.columns = [f"info_{col}" for col in output.columns]
610        # Combine with the metadata and prepend "metadata_" to the column names
611        if self.metadata is not None:
612            metadata = self.metadata.copy()
613            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
614            output = pd.concat([output, metadata], axis=1)
615        # Combine with the features and prepend "features_" to the column names
616        if self.features is not None:
617            features = self.features.copy()
618            features.columns = [f"features_{col}" for col in features.columns]
619            output = pd.concat([output, features], axis=1)
620        return output

Convert all the data in the EventArray to a single DataFrame.

Returns

a DataFrame with all the data in the EventArray.
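
The prefixes keep the three groups separable, which is exactly what from_dataframe() relies on to reverse this:

    df = events.to_dataframe()  # columns like "info_x", "metadata_label", "features_dapi_mean"
    restored = EventArray.from_dataframe(df)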

@classmethod
def from_dataframe(cls, df) -> Self:
622    @classmethod
623    def from_dataframe(cls, df) -> Self:
624        """
625        Create an EventArray from a single DataFrame with prefixed columns, as produced by to_dataframe().
626        :return: an EventArray containing the DataFrame's data.
627        """
628        # Split the columns into info, metadata, and features and strip prefix
629        info = df[[col for col in df.columns if col.startswith("info_")]].copy()
630        info.columns = [col.replace("info_", "") for col in info.columns]
631        if info.size == 0:
632            info = None
633        metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy()
634        metadata.columns = [col.replace("metadata_", "") for col in metadata.columns]
635        if metadata.size == 0:
636            metadata = None
637        features = df[[col for col in df.columns if col.startswith("features_")]].copy()
638        features.columns = [col.replace("features_", "") for col in features.columns]
639        if features.size == 0:
640            features = None
641        return cls(info=info, metadata=metadata, features=features)

Create an EventArray from a single DataFrame with prefixed columns, as produced by to_dataframe().

Returns

an EventArray containing the DataFrame's data.

@classmethod
def from_mask( cls, mask: numpy.ndarray, slide_id: str, tile_n: int, n_roi: int = 0, include_cell_id: bool = True, images: list[numpy.ndarray] = None, image_labels: list[str] = None, properties: list[str] = None) -> Self:
643    @classmethod
644    def from_mask(
645        cls,
646        mask: np.ndarray,
647        slide_id: str,
648        tile_n: int,
649        n_roi: int = 0,
650        include_cell_id: bool = True,
651        images: list[np.ndarray] = None,
652        image_labels: list[str] = None,
653        properties: list[str] = None,
654    ) -> Self:
655        """
656        Extract events from a labeled mask array, including metadata and features.
657        :param mask: the mask to extract events from.
658        :param slide_id: the slide ID the mask is from.
659        :param tile_n: the tile number the mask is from.
660        :param n_roi: the ROI number the mask is from.
661        :param include_cell_id: whether to include the cell_id, or numerical
662        mask label, as metadata in the EventArray.
663        :param images: the intensity images to extract features from.
664        :param image_labels: the labels for the intensity images.
665        :param properties: list of properties to extract in addition to the defaults.
666        :return: EventArray corresponding to the mask labels.
667        """
668        if extract_mask_info is None:
669            raise ModuleNotFoundError(
670                "csi_images.csi_images dependencies not installed. Install csi-images "
671                "with [imageio] option to resolve."
672            )
673        # Gather mask_info
674        if images is not None and image_labels is not None:
675            if len(images) != len(image_labels):
676                raise ValueError("Intensity images and labels must match lengths.")
677
678        mask_info = extract_mask_info(mask, images, image_labels, properties)
679
680        if len(mask_info) == 0:
681            return EventArray()
682
683        # Combine provided info and mask info
684        info = pd.DataFrame(
685            {
686                "slide_id": slide_id,
687                "tile": tile_n,
688                "roi": n_roi,
689                "x": mask_info["x"],
690                "y": mask_info["y"],
691                "size": mask_info["size"],
692            },
693        )
694        # Extract a metadata column if desired
695        if include_cell_id:
696            metadata = pd.DataFrame({"cell_id": mask_info["id"]})
697        else:
698            metadata = None
699        # If any additional properties were extracted, add them as features
700        mask_info = mask_info.drop(columns=["id", "x", "y", "size"], errors="ignore")
701        if len(mask_info.columns) > 0:
702            features = mask_info
703        else:
704            features = None
705        return EventArray(info, metadata, features)

Extract events from a labeled mask array, including metadata and features.

Parameters
  • mask: the mask to extract events from.
  • slide_id: the slide ID the mask is from.
  • tile_n: the tile number the mask is from.
  • n_roi: the ROI number the mask is from.
  • include_cell_id: whether to include the cell_id, or numerical mask label, as metadata in the EventArray.
  • images: the intensity images to extract features from.
  • image_labels: the labels for the intensity images.
  • properties: list of properties to extract in addition to the defaults.
Returns

EventArray corresponding to the mask labels.
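
A toy sketch with a two-label mask; real masks typically come from a segmentation step, and this path requires the [imageio] extra:

    import numpy as np

    mask = np.zeros((100, 100), dtype=np.uint16)
    mask[10:20, 10:20] = 1  # first labeled event
    mask[50:70, 50:70] = 2  # second labeled event
    events = EventArray.from_mask(mask, slide_id="SLIDE001", tile_n=0)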

def save_csv(self, output_path: str) -> bool:
707    def save_csv(self, output_path: str) -> bool:
708        """
709        Save the events to a CSV file, including metadata and features.
710        :param output_path: the file path to save to.
711        :return: True if the file exists after saving.
712        """
713        self.to_dataframe().to_csv(output_path, index=False)
714        return os.path.exists(output_path)

Save the events to a CSV file, including metadata and features.

Parameters
  • output_path: the file path to save to.
Returns

True if the file exists after saving.
@classmethod
def load_csv(cls, input_path: str) -> Self:
716    @classmethod
717    def load_csv(cls, input_path: str) -> Self:
718        """
719        Load the events from a CSV file, including metadata and features.
720        :param input_path: the file path to load from.
721        :return: the loaded EventArray.
722        """
723        # Load the CSV file
724        df = pd.read_csv(input_path)
725        return cls.from_dataframe(df)

Load the events from a CSV file, including metadata and features.

Parameters
  • input_path: the file path to load from.
Returns

the loaded EventArray.
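
Both halves of the CSV round trip, with a hypothetical path:

    events.save_csv("events.csv")  # True if the file was written
    events_again = EventArray.load_csv("events.csv")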
def save_hdf5(self, output_path: str) -> bool:
727    def save_hdf5(self, output_path: str) -> bool:
728        """
729        Save the events to an HDF5 file, including metadata and features.
730        Uses the pandas-provided HDF5 functions for ease and external compatibility,
731        though these files are slightly harder to view in HDFView or similar.
732        :param output_path: the file path to save to.
733        :return: True if the file exists after saving.
734        """
735        # Open the output_path as an HDF5 file
736        with pd.HDFStore(output_path) as store:
737            # Store the dataframes in the HDF5 file
738            if self.info is not None:
739                store.put("info", self.info, index=False)
740            if self.metadata is not None:
741                store.put("metadata", self.metadata, index=False)
742            if self.features is not None:
743                store.put("features", self.features, index=False)
744        return os.path.exists(output_path)

Save the events to an HDF5 file, including metadata and features. Uses the pandas-provided HDF5 functions for ease and external compatibility, though these files are slightly harder to view in HDFView or similar.

Parameters
  • output_path: the file path to save to.
Returns

True if the file exists after saving.
@classmethod
def load_hdf5(cls, input_path: str) -> Self:
746    @classmethod
747    def load_hdf5(cls, input_path: str) -> Self:
748        """
749        Load the events from an HDF5 file, including metadata and features.
750        :param input_path: the file path to load from.
751        :return: the loaded EventArray.
752        """
753        # Open the input_path as an HDF5 file
754        with pd.HDFStore(input_path) as store:
755            # Load the dataframes from the HDF5 file
756            info = store.get("info") if "info" in store else None
757            metadata = store.get("metadata") if "metadata" in store else None
758            features = store.get("features") if "features" in store else None
759        return cls(info=info, metadata=metadata, features=features)

Load the events from an HDF5 file, including metadata and features.

Parameters
  • input_path: the file path to load from.
Returns

the loaded EventArray.
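
The HDF5 round trip looks the same; note that pd.HDFStore requires the PyTables package:

    events.save_hdf5("events.h5")
    events_again = EventArray.load_hdf5("events.h5")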
def save_ocular(self, output_path: str, event_type: str = 'cells'):
761    def save_ocular(self, output_path: str, event_type: str = "cells"):
762        """
763        Save the events to OCULAR files. Relies on the dataframe originating
764        from an OCULAR file (same columns; duplicate metadata/info).
765        :param output_path: the directory to save OCULAR files into.
766        :param event_type: "cells" or "others".
767        :return: None; files are written into output_path.
768        """
769        if pyreadr is None:
770            raise ModuleNotFoundError(
771                "pyreadr not installed. Install pyreadr directly "
772                "or install csi-images with [rds] option to resolve."
773            )
774        if event_type == "cells":
775            file_stub = "rc-final"
776        elif event_type == "others":
777            file_stub = "others-final"
778        else:
779            raise ValueError("Invalid event type. Must be cells or others.")
780
781        # Ensure good metadata
782        metadata = pd.DataFrame(
783            {
784                "slide_id": self.info["slide_id"],
785                "frame_id": self.info["tile"],
786                "cell_id": (
787                    self.metadata["cell_id"]
788                    if "cell_id" in self.metadata.columns
789                    else range(len(self.info))
790                ),
791                "cellx": self.info["x"],
792                "celly": self.info["y"],
793            }
794        )
795        if self.metadata is not None:
796            metadata[self.metadata.columns] = self.metadata.copy()
797
798        # Check for the "ocular_interesting" column
799        if event_type == "cells":
800            if "ocular_interesting" in metadata.columns:
801                interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool)
802            elif "hcpc" in metadata.columns:
803                # Interesting cells don't get an hcpc designation, leaving them as -1
804                interesting_rows = (
805                    metadata["hcpc"].to_numpy() == -1
806                )  # interesting cells
807            else:
808                interesting_rows = []
809            if sum(interesting_rows) > 0:
810                # Split the metadata into interesting and regular
811                interesting_events = self.rows(interesting_rows)
812                interesting_df = pd.concat(
813                    [interesting_events.features, interesting_events.metadata], axis=1
814                )
815                data_events = self.rows(~interesting_rows)
816                data_df = pd.concat(
817                    [data_events.features, data_events.metadata], axis=1
818                )
819                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")
820
821                # Drop particular columns for "interesting"
822                interesting_df = interesting_df.drop(
823                    [
824                        "clust",
825                        "hcpc",
826                        "frame_id",
827                        "cell_id",
828                        "unique_id",
829                        "ocular_interesting",
830                    ],
831                    axis=1,
832                    errors="ignore",
833                )
834                # Save both .csv and .rds
835                interesting_stub = os.path.join(output_path, "ocular_interesting")
836                interesting_df.to_csv(f"{interesting_stub}.csv")
837                # Suppress pandas FutureWarning
838                with warnings.catch_warnings():
839                    warnings.simplefilter(action="ignore", category=FutureWarning)
840                    pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df)
841            else:
842                data_df = pd.concat([self.features, metadata], axis=1)
843        else:
844            # Get all data and reset_index (will copy it)
845            data_df = pd.concat([self.features, metadata], axis=1)
846
847        # Split based on cluster number to conform to *-final[1-4].rds
848        n_clusters = max(data_df["clust"]) + 1
849        split_idx = [round(i * n_clusters / 4) for i in range(5)]
850        for i in range(4):
851            subset = (split_idx[i] <= data_df["clust"]) & (
852                data_df["clust"] < split_idx[i + 1]
853            )
854            data_df.loc[subset, "hcpc"] = i + 1
855            subset = data_df[subset].reset_index(drop=True)
856            # Suppress pandas FutureWarning
857            with warnings.catch_warnings():
858                warnings.simplefilter(action="ignore", category=FutureWarning)
859                pyreadr.write_rds(
860                    os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
861                )
862
863        # Create new example cell strings
864        data_df["example_cell_id"] = (
865            data_df["slide_id"]
866            + " "
867            + data_df["frame_id"].astype(str)
868            + " "
869            + data_df["cell_id"].astype(str)
870            + " "
871            + data_df["cellx"].astype(int).astype(str)
872            + " "
873            + data_df["celly"].astype(int).astype(str)
874        )
875        # Find averagable data columns
876        if "cellcluster_id" in data_df.columns:
877            end_idx = data_df.columns.get_loc("cellcluster_id")
878        else:
879            end_idx = data_df.columns.get_loc("slide_id")
880        avg_cols = data_df.columns[:end_idx].tolist()
881        # Group by cluster and average
882        data_df = data_df.groupby("clust").agg(
883            **{col: (col, "mean") for col in avg_cols},
884            count=("clust", "size"),  # count rows in each cluster
885            example_cells=("example_cell_id", lambda x: ",".join(x)),
886            hcpc=("hcpc", lambda x: x.iloc[0]),
887        )
888        data_df = data_df.reset_index()  # Do NOT drop, index is "clust"
889        # Create new columns
890        metadata = pd.DataFrame(
891            {
892                "count": data_df["count"],
893                "example_cells": data_df["example_cells"],
894                "clust": data_df["clust"].astype(int),
895                "hcpc": data_df["hcpc"].astype(int),
896                "id": data_df["clust"].astype(int).astype(str),
897                "cccluster": "0",  # Dummy value
898                "ccdistance": 0.0,  # Dummy value
899                "rownum": list(range(len(data_df))),
900                "framegroup": 0,  # Dummy value
901            }
902        )
903        # Pad the features to 761 columns, as the OCULAR report requires
904        additional_columns = range(len(avg_cols), 761)
905        if len(additional_columns) > 0:
906            padding = pd.DataFrame(
907                np.zeros((len(data_df), len(additional_columns))),
908                columns=[f"pad{i}" for i in additional_columns],
909            )
910            data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1)
911        else:
912            data_df = pd.concat([data_df[avg_cols], metadata], axis=1)
913
914        # Save the cluster data
915        data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"))
916        # Suppress pandas FutureWarning
917        with warnings.catch_warnings():
918            warnings.simplefilter(action="ignore", category=FutureWarning)
919            pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)

Save the events to OCULAR files. Relies on the dataframe originating from an OCULAR file (same columns; duplicate metadata/info).

Parameters
  • output_path: the directory to save OCULAR files into.
  • event_type: "cells" or "others".
Returns

None; files are written into output_path.
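
Typical usage writes rc-final1.rds through rc-final4.rds plus a summary rc-final.csv/.rds into the directory (path hypothetical; requires the [rds] extra):

    events.save_ocular("/path/to/scan/ocular", event_type="cells")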
@classmethod
def load_ocular( cls, input_path: str, event_type='cells', cell_data_files=('rc-final1.rds', 'rc-final2.rds', 'rc-final3.rds', 'rc-final4.rds', 'ocular_interesting.rds'), others_data_files=('others-final1.rds', 'others-final2.rds', 'others-final3.rds', 'others-final4.rds'), atlas_data_files=('ocular_interesting.rds', 'ocular_not_interesting.rds'), drop_common_events=True, log=None) -> Self:
 921    @classmethod
 922    def load_ocular(
 923        cls,
 924        input_path: str,
 925        event_type="cells",
 926        cell_data_files=(
 927            "rc-final1.rds",
 928            "rc-final2.rds",
 929            "rc-final3.rds",
 930            "rc-final4.rds",
 931            "ocular_interesting.rds",
 932        ),
 933        others_data_files=(
 934            "others-final1.rds",
 935            "others-final2.rds",
 936            "others-final3.rds",
 937            "others-final4.rds",
 938        ),
 939        atlas_data_files=(
 940            "ocular_interesting.rds",
 941            "ocular_not_interesting.rds",
 942        ),
 943        drop_common_events=True,
 944        log=None,
 945    ) -> Self:
 946        """
 947        Load events from OCULAR files (.rds), including metadata and features.
 948        :param input_path: the OCULAR directory, or a single .rds file to load.
 949        :param event_type: "cells" or "others".
 950        :param cell_data_files: file names to load when event_type is "cells".
 951        :param others_data_files: file names to load when event_type is "others".
 952        :param atlas_data_files: files that may contain "common cells" to drop.
 953        :param drop_common_events: whether to drop catalogue-classified common cells.
 954        :param log: optional logger for progress and warning messages.
 955        :return: the loaded EventArray.
 956        """
 957        if pyreadr is None:
 958            raise ModuleNotFoundError(
 959                "pyreadr not installed. Install pyreadr directly "
 960                "or install csi-images with [rds] option to resolve."
 961            )
 962        # Check if the input path is a directory or a file
 963        if os.path.isfile(input_path):
 964            data_files = [os.path.basename(input_path)]
 965            input_path = os.path.dirname(input_path)
 966        elif event_type == "cells":
 967            data_files = cell_data_files
 968        elif event_type == "others":
 969            data_files = others_data_files
 970        else:
 971            raise ValueError("Invalid event type.")
 972
 973        # Load the data from the OCULAR files
 974        file_data = {}
 975        for file in data_files:
 976            file_path = os.path.join(input_path, file)
 977            if not os.path.isfile(file_path):
 978                if log is not None:
 979                    log.warning(f"{file} not found in {input_path}")
 980                continue
 981            file_data[file] = pyreadr.read_r(file_path)
 982            # Get the DataFrame associated with None (pyreadr dict quirk)
 983            file_data[file] = file_data[file][None]
 984            if len(file_data[file]) == 0:
 985                # File gets dropped from the dict
 986                file_data.pop(file)
 987                if log is not None:
 988                    log.warning(f"{file} has no cells")
 989                continue
 990
 991            if log is not None:
 992                log.debug(f"{file} has {len(file_data[file])} cells")
 993
 994            # Drop common cells if requested and in this file
 995            if (
 996                file in atlas_data_files
 997                and drop_common_events
 998                and "catalogue_classification" in file_data[file]
 999            ):
1000                common_cell_indices = (
1001                    file_data[file]["catalogue_classification"] == "common_cell"
1002                )
1003                if log is not None:
1004                    log.debug(
1005                        f"Dropping {int(pd.Series.sum(common_cell_indices))} "
1006                        f"common cells from {file}"
1007                    )
1008                file_data[file] = file_data[file][~common_cell_indices]
1009
1010            if len(file_data[file]) == 0:
1011                # File gets dropped from the dict
1012                file_data.pop(file)
1013                if log is not None:
1014                    log.warning(f"{file} has no cells after dropping common cells")
1015                continue
1016
1017            # Extract frame_id and cell_id
1018            # DAPI- events already have frame_id cell_id outside rowname
1019            if event_type == "cells" and "frame_id" not in file_data[file].columns:
1020                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
1021                # get frame_id cell_id from rownames column and split into two columns
1022                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
1023                if len(split_res.columns) != 2 and log is not None:
1024                    log.warning(
1025                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
1026                    )
1027                # then assign it back to the dataframe
1028                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
1029            # reset indexes since they can cause NaN values in concat
1030            file_data[file] = file_data[file].reset_index(drop=True)
1031
1032        # Merge the data from all files
1033        if len(file_data) == 0:
1034            return EventArray()
1035        elif len(file_data) == 1:
1036            data = next(iter(file_data.values()))
1037        else:
1038            data = pd.concat(file_data.values())
1039
1040        if log is not None:
1041            log.debug(f"Gathered a total of {len(data)} events")
1042
1043        # Others is missing the "slide_id". Insert it right before "frame_id" column
1044        if event_type == "others" and "slide_id" not in data.columns:
1045            if os.path.basename(input_path) == "ocular":
1046                slide_id = os.path.basename(os.path.dirname(input_path))
1047            else:
1048                slide_id = "UNKNOWN"
1049            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)
1050
1051        # Sort ascending by cell_id so drop_duplicates keeps the original event
1052        data = data.sort_values(by=["cell_id"], ascending=True)
1053        # Filter out duplicates by x & y
1054        data = data.assign(
1055            unique_id=data["slide_id"]
1056            + "_"
1057            + data["frame_id"].astype(str)
1058            + "_"
1059            + data["cellx"].astype(int).astype(str)
1060            + "_"
1061            + data["celly"].astype(int).astype(str)
1062        )
1063        data = data.drop_duplicates(subset=["unique_id"], keep="first")
1064        # Normal unique_id is with cell_id
1065        data = data.assign(
1066            unique_id=data["slide_id"]
1067            + "_"
1068            + data["frame_id"].astype(str)
1069            + "_"
1070            + data["cell_id"].astype(str)
1071        )
1072        data = data.reset_index(drop=True)
1073        # All columns up to "slide_id" are features; drop the "slide_id"
1074        features = data.loc[:, :"slide_id"].iloc[:, :-1]
1075        data = data.loc[:, "slide_id":]
1076        # Grab the info columns
1077        info = data[["slide_id", "frame_id", "cellx", "celly"]]
1078        info.columns = ["slide_id", "tile", "x", "y"]
1079        info = info.assign(
1080            roi=0,  # OCULAR only works on 1 ROI, as far as known
1081            size=25,  # Static, for later montaging
1082        )
1083        info = info[["slide_id", "tile", "roi", "x", "y", "size"]]
1084        # Metadata has duplicate columns for later convenience
1085        metadata = data
1086        # Certain columns tend to be problematic with mixed data formats...
1087        for col in ["TRITC", "CY5", "FITC"]:
1088            if col in metadata:
1089                labels = {
1090                    "False": False,
1091                    "True": True,
1092                    "FALSE": False,
1093                    "TRUE": True,
1094                }
1095                metadata[col] = metadata[col].map(labels).astype(bool)
1096        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
1097            if col in metadata:
1098                metadata[col] = metadata[col].fillna(-1).astype(int)
1099        return EventArray(info, metadata, features)
Load events from OCULAR files (.rds), including metadata and features.

Parameters
  • input_path: the OCULAR directory, or a single .rds file to load.
  • event_type: "cells" or "others".
  • cell_data_files: file names to load when event_type is "cells".
  • others_data_files: file names to load when event_type is "others".
  • atlas_data_files: files that may contain "common cells" to drop.
  • drop_common_events: whether to drop catalogue-classified common cells.
  • log: optional logger for progress and warning messages.
Returns

the loaded EventArray.
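
Typical usage, with a hypothetical OCULAR results directory:

    events = EventArray.load_ocular("/path/to/scan/ocular", event_type="cells")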