csi_images.csi_events
Contains the Event class, which represents a single event in a scan. The Event class optionally holds metadata and features. Lists of events with similar metadata or features can be combined into DataFrames for analysis.
The Event class holds the position of the event in the frame, which can be converted to positions in the scanner or slide coordinate systems. See the csi_utils.csi_scans documentation page for more information on the coordinate systems.
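Before the source, a minimal usage sketch of the coordinate conversions described above. It assumes a `Scan` object named `scan` has already been loaded elsewhere; the tile number and pixel position are made up for illustration.

    from csi_images.csi_tiles import Tile
    from csi_images.csi_events import Event

    # Assume `scan` is a Scan object for this slide, loaded elsewhere.
    tile = Tile(scan, 100, 0)          # tile number 100 in ROI 0
    event = Event(tile, x=512, y=256)  # pixel position within the tile's frame

    print(event)                       # "<tile>-512-256"; also used in crop/montage file names
    print(event.get_scan_position())   # (x_um, y_um) in the scanner's coordinate frame
    print(event.get_slide_position())  # (x_um, y_um) in slide coordinates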
1""" 2Contains the Event class, which represents a single event in a scan. 3The Event class optionally holds metadata and features. Lists of events with 4similar metadata or features can be combined into DataFrames for analysis. 5 6The Event class holds the position of the event in the frame, which can be converted 7to the position in the scanner or slide coordinate positions. See the 8csi_utils.csi_scans documentation page for more information on the coordinate systems. 9""" 10 11import os 12import glob 13import math 14import warnings 15from typing import Self, Iterable, Hashable, Sequence 16 17import numpy as np 18import pandas as pd 19 20from .csi_scans import Scan 21from .csi_tiles import Tile 22from .csi_frames import Frame 23 24# Optional dependencies; will raise errors in particular functions if not installed 25try: 26 from . import csi_images 27except ImportError: 28 csi_images = None 29try: 30 import imageio.v3 as imageio 31except ImportError: 32 imageio = None 33try: 34 import pyreadr 35except ImportError: 36 pyreadr = None 37 38 39class Event: 40 """ 41 A class that represents a single event in a scan, making it easy to evaluate 42 singular events. Required metadata is exposed as attributes, and optional 43 metadata and features are stored as DataFrames. 44 """ 45 46 SCAN_TO_SLIDE_TRANSFORM = { 47 # Axioscan zero is in the top-right corner instead of top-left 48 Scan.Type.AXIOSCAN7: np.array( 49 [ 50 [1, 0, 75000], 51 [0, 1, 0], 52 [0, 0, 1], 53 ] 54 ), 55 # BZScanner coordinates are a special kind of messed up: 56 # - The slide is upside-down. 57 # - The slide is oriented vertically, with the barcode at the bottom. 58 # - Tiles are numbered from the top-right 59 Scan.Type.BZSCANNER: np.array( 60 [ 61 [0, -1, 75000], 62 [-1, 0, 25000], 63 [0, 0, 1], 64 ] 65 ), 66 } 67 """ 68 Homogeneous transformation matrices for converting between scanner and slide 69 coordinates. The matrices are 3x3, with the final column representing the 70 translation in micrometers (um). For more information, see 71 [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations). 72 73 Transformations are nominal, and accuracy is not guaranteed; this is due to 74 imperfections in slides and alignment in the scanners. Units are in micrometers. 75 """ 76 77 def __init__( 78 self, 79 tile: Tile, 80 x: int, 81 y: int, 82 metadata: pd.Series = None, 83 features: pd.Series = None, 84 ): 85 self.tile = tile 86 self.x = int(x) 87 self.y = int(y) 88 self.metadata = metadata 89 self.features = features 90 91 def __repr__(self) -> str: 92 return f"{self.tile}-{self.x}-{self.y}" 93 94 def __eq__(self, other) -> bool: 95 return self.__repr__() == other.__repr__() 96 97 def __lt__(self, other): 98 return self.__repr__() < other.__repr__() 99 100 def get_scan_position(self) -> tuple[float, float]: 101 """ 102 Get the position of the event in the scanner's coordinate frame. 103 :return: the scan position of the event in micrometers (um). 
104 """ 105 # Get overall pixel position 106 real_tile_height, real_tile_width = self.tile.scan.get_image_size() 107 pixel_x = self.x + (real_tile_width * self.tile.x) 108 pixel_y = self.y + (real_tile_height * self.tile.y) 109 # Convert to micrometers 110 x_um = pixel_x * self.tile.scan.pixel_size_um 111 y_um = pixel_y * self.tile.scan.pixel_size_um 112 # Add the scan's origin in the scanner frame 113 x_um += self.tile.scan.roi[self.tile.n_roi].origin_x_um 114 y_um += self.tile.scan.roi[self.tile.n_roi].origin_y_um 115 return x_um, y_um 116 117 def get_slide_position(self) -> tuple[float, float]: 118 """ 119 Get the slide position of the event in micrometers (um). 120 :return: the slide position of the event. 121 """ 122 # Turn scan_position into a 3x1 vector 123 scan_position = self.get_scan_position() 124 scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]]) 125 126 # Multiply by the appropriate homogeneous matrix 127 if self.tile.scan.scanner_id.startswith(self.tile.scan.Type.AXIOSCAN7.value): 128 transform = self.SCAN_TO_SLIDE_TRANSFORM[self.tile.scan.Type.AXIOSCAN7] 129 elif self.tile.scan.scanner_id.startswith(self.tile.scan.Type.BZSCANNER.value): 130 transform = self.SCAN_TO_SLIDE_TRANSFORM[self.tile.scan.Type.BZSCANNER] 131 else: 132 raise ValueError(f"Scanner type {self.tile.scan.scanner_id} not supported.") 133 slide_position = np.matmul(transform, scan_position) 134 return float(slide_position[0][0]), float(slide_position[1][0]) 135 136 def crop( 137 self, images: Iterable[np.ndarray], crop_size: int = 100, in_pixels: bool = True 138 ) -> list[np.ndarray]: 139 """ 140 Crop the event from the provided frame images. Use if you have already gotten 141 frame images; useful for cropping multiple events from the same frame image. 142 :param images: the frame images. 143 :param crop_size: the square size of the image crop to get for this event. 144 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 145 :return: image_size x image_size crops of the event in the provided frames. If 146 the event is too close to the edge, the crop will be smaller and not centered. 
147 """ 148 # Convert a crop size in micrometers to pixels 149 if not in_pixels: 150 crop_size = round(crop_size / self.tile.scan.pixel_size_um) 151 image_height, image_width = 0, 0 152 for image in images: 153 if image_height == 0 and image_width == 0: 154 image_height, image_width = image.shape 155 else: 156 if image_height != image.shape[0] or image_width != image.shape[1]: 157 raise ValueError("All images must be the same size") 158 if image_height == 0 or image_width == 0: 159 raise ValueError("No images provided") 160 161 # Find the crop bounds 162 bounds = [ 163 self.x - (crop_size // 2) + 1, 164 self.y - (crop_size // 2) + 1, 165 self.x + math.ceil(crop_size / 2) + 1, 166 self.y + math.ceil(crop_size / 2) + 1, 167 ] 168 # Determine how much the bounds violate the image size 169 displacements = [ 170 max(0, -bounds[0]), 171 max(0, -bounds[1]), 172 max(0, bounds[2] - image_width), 173 max(0, bounds[3] - image_height), 174 ] 175 # Cap off the bounds 176 bounds = [ 177 max(0, bounds[0]), 178 max(0, bounds[1]), 179 min(image_width, bounds[2]), 180 min(image_height, bounds[3]), 181 ] 182 183 # Crop the images 184 crops = [] 185 for image in images: 186 # Create a blank image of the right size 187 crop = np.zeros((crop_size, crop_size), dtype=image.dtype) 188 189 # Insert the cropped image into the blank image, leaving a black buffer 190 # around the edges if the crop would go beyond the original image bounds 191 crop[ 192 displacements[1] : crop_size - displacements[3], 193 displacements[0] : crop_size - displacements[2], 194 ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]] 195 crops.append(crop) 196 return crops 197 198 def get_crops( 199 self, 200 crop_size: int = 100, 201 in_pixels: bool = True, 202 input_path: str = None, 203 channels: Iterable[int | str] = None, 204 apply_gain: bool | Iterable[bool] = True, 205 ) -> list[np.ndarray]: 206 """ 207 Gets the frame images for this event and then crops the event from the images. 208 Convenient for retrieving a single event's crops, but less efficient when 209 retrieving multiple events from the same tile as it will reread the images. 210 :param crop_size: the square size of the image crop to get for this event. 211 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 212 :param input_path: the path to the input images. Defaults to None (uses the scan's path). 213 :param channels: the channels to extract images for. Defaults to all channels. 214 :param apply_gain: whether to apply scanner-calculated gain to the images, if 215 not already applied. If a list, matches the channels. 216 :return: a list of cropped images from the scan in the order of the channels. 217 """ 218 # This function validates channels 219 frames = Frame.get_frames(self.tile, channels) 220 # Convert individual inputs to lists of appropriate length 221 if isinstance(apply_gain, bool): 222 apply_gain = [apply_gain] * len(frames) 223 images = [f.get_image(input_path, a) for f, a in zip(frames, apply_gain)] 224 return self.crop(images, crop_size, in_pixels) 225 226 def save_crops( 227 self, 228 crops: Sequence[np.ndarray], 229 output_path: str, 230 labels: Sequence[str], 231 ext: str = "auto", 232 ): 233 """ 234 Save the crops to image files. 235 :param crops: the crops to save. Will save as RGB if 3 channel [h, w, 3] or 236 grayscale if 1 channel [h, w] or [h, w, 1]. 237 :param labels: the labels to append to the file name, usually the channel names 238 associated with each crop. 
        :param output_path: the folder to save the crops to. Will make if needed.
        :param ext: the file extension to save the crops as. Defaults to "auto", which
        will save as .tif for grayscale images and .jpg for RGB images.
        :return: None
        """
        if len(crops) != len(labels):
            raise ValueError("Crops and labels must be the same length")

        if csi_images is None or imageio is None:
            raise ModuleNotFoundError(
                "imageio libraries not installed! "
                "run `pip install csi_images[imageio]` to resolve."
            )

        os.makedirs(output_path, exist_ok=True)

        for crop, label in zip(crops, labels):
            if ext == "auto":
                if len(crop.shape) == 2 or crop.shape[2] == 1:
                    file_extension = ".tif"
                elif crop.shape[2] == 3:
                    file_extension = ".jpg"
                else:
                    warnings.warn(
                        f"Image shape {crop.shape} not recognized; saving as .tif"
                    )
                    file_extension = ".tif"
            else:
                file_extension = ext
            file = os.path.join(output_path, f"{self}-{label}{file_extension}")
            # TODO: add more file types here
            if file_extension == ".tif":
                imageio.imwrite(file, crop, compression="deflate")
            elif file_extension in [".jpg", ".jpeg"]:
                crop = csi_images.scale_bit_depth(crop, np.uint8)
                imageio.imwrite(file, crop, quality=80)
            else:
                imageio.imwrite(file, crop)

    def load_crops(
        self, input_path: str, labels: list[str] = None
    ) -> dict[str, np.ndarray]:
        """
        Loads previously saved crop files from a folder.
        :param input_path: folder containing crop files.
        :param labels: optional label filter, will only return crops with these labels.
        :return: a dict mapping labels to crops.
        """
        crops = {}
        for file in glob.glob(os.path.join(input_path, f"{self}-*")):
            label = os.path.splitext(os.path.basename(file))[0].split("-")[-1]
            # Skip if we have labels to target
            if labels is not None and label not in labels:
                continue
            crops[label] = imageio.imread(file)
        return crops

    def get_montage_channels(
        self,
        channels: Sequence[int | str] | None,
        composites: dict[int | str, tuple[float, float, float]] | None,
    ) -> tuple[list[int], list[int], dict[int, tuple[float, float, float]]]:
        """
        Get the channel indices for the montage from the event's tile.
        :param channels: channel indices or names for grayscale channels
        :param composites: dictionary of channel indices or names and RGB values
        :return: (1) channel indices to retrieve,
        (2) relative grayscale channel indices, and
        (3) composite channel indices and RGB values.
308 """ 309 if channels is None: 310 channels = list(range(len(self.tile.scan.channels))) 311 if (len(channels) == 0) and (composites is None or len(composites) == 0): 312 raise ValueError("Must provide at least one channel type to montage") 313 314 channels_to_get = [] 315 316 # Build the list of channels to retrieve 317 if channels is not None: 318 if isinstance(channels[0], str): 319 channels = self.tile.scan.get_channel_indices(channels) 320 channels_to_get += channels 321 order = list(range(len(channels))) # Always the first n channels 322 else: 323 order = None 324 325 if composites is not None: 326 relative_composites = {} # Relative indices for retrieved channels 327 # Convert to scan indices 328 rgb_channels = list(composites.keys()) 329 if isinstance(rgb_channels[0], str): 330 rgb_channels = self.tile.scan.get_channel_indices(rgb_channels) 331 # Find the index or add to the end 332 for channel, rgb in zip(rgb_channels, composites.values()): 333 if channel not in channels_to_get: 334 channels_to_get.append(channel) 335 relative_composites[channel] = rgb 336 else: 337 relative_composites[channels_to_get.index(channel)] = rgb 338 else: 339 relative_composites = None 340 341 return channels_to_get, order, relative_composites 342 343 def get_montage( 344 self, 345 channels: Sequence[int | str] = None, 346 composites: dict[int | str, tuple[float, float, float]] = None, 347 crop_size: int = 100, 348 in_pixels: bool = True, 349 input_path: str = None, 350 apply_gain: bool = True, 351 **kwargs, 352 ) -> np.ndarray: 353 """ 354 Convenience function for getting frame images and creating a montage. Mirrors 355 csi_images.make_montage(). Convenient for a single event's montage, but less 356 efficient when for multiple events from the same tile. 357 :param channels: the channels to use for black-and-white montages. 358 :param composites: dictionary of indices and RGB tuples for a composite. 359 :param crop_size: the square size of the image crop to get for this event. 360 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 361 :param input_path: the path to the input images. Defaults to None (uses the scan's path). 362 :param apply_gain: whether to apply scanner-calculated gain to the images, if 363 not already applied. If a list, matches the channels. 364 :param kwargs: montage options. See csi_images.make_montage() for more details. 365 :return: numpy array representing the montage. 366 """ 367 channels, order, composites = self.get_montage_channels(channels, composites) 368 images = self.get_crops(crop_size, in_pixels, input_path, channels, apply_gain) 369 return csi_images.make_montage(images, order, composites, **kwargs) 370 371 def save_montage( 372 self, 373 montage: np.ndarray, 374 output_path: str, 375 ocular_names: bool = False, 376 tag: str = "", 377 ): 378 """ 379 Save the montage as a JPEG image with a set name. 380 :param montage: the montage to save. 381 :param output_path: the folder to save the montage in. Wil make if needed. 382 :param ocular_names: whether to use the OCULAR naming convention. 383 :param tag: a tag to append to the file name. Ignored if ocular_names is True. 384 :return: None 385 """ 386 if csi_images is None or imageio is None: 387 raise ModuleNotFoundError( 388 "imageio libraries not installed! " 389 "run `pip install csi_images[imageio]` to resolve." 
            )

        montage = csi_images.scale_bit_depth(montage, np.uint8)

        if ocular_names:
            if "cell_id" not in self.metadata.index:
                raise ValueError(
                    "Event metadata must include 'cell_id' for OCULAR naming."
                )
            file = f"{self.tile.n}-{self.metadata['cell_id']}-{self.x}-{self.y}.jpeg"
        else:
            file = f"{self}{tag}.jpeg"

        os.makedirs(output_path, exist_ok=True)
        imageio.imwrite(os.path.join(output_path, file), montage, quality=80)

    def load_montage(self, input_path: str, tag: str = "") -> np.ndarray:
        """
        Loads the montage from a file saved by Event.save_montage.
        :param input_path: the path to the folder where the montage was saved.
        :param tag: a string to add to the file name, before the extension.
        :return: the montage as a numpy array.
        """
        file = f"{self}{tag}.jpeg"
        return imageio.imread(os.path.join(input_path, file))

    @classmethod
    def get_many_crops(
        cls,
        events: Sequence[Self],
        crop_size: int | Sequence[int] = 100,
        in_pixels: bool = True,
        input_path: str | Sequence[str] = None,
        channels: Sequence[int | str] = None,
        apply_gain: bool | Sequence[bool] = True,
    ) -> list[list[np.ndarray]]:
        """
        Get the crops for a list of events, ensuring that there is no wasteful reading
        of the same tile multiple times. This function is more efficient than calling
        get_crops() for each event.
        :param events: the events to get crops for.
        :param crop_size: the square size of the image crop to get for each event;
        may be a single size or one size per event. Defaults to 100 pixels.
        :param in_pixels: whether the crop size is in pixels or micrometers.
        Defaults to pixels.
        :param input_path: the path to the input images. Will only work for lists of events
        from the same scan. Defaults to None (uses the scan's path).
        :param channels: the channels to extract images for. Defaults to all channels.
        :param apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. Defaults to True.
        Can be supplied as a list to apply gain to individual channels.
        :return: a list of lists of cropped images for each event.
441 """ 442 if len(events) == 0: 443 return [] 444 # Adapt singular inputs to lists of appropriate length 445 if isinstance(crop_size, int): 446 crop_size = [crop_size] * len(events) 447 if input_path is None or isinstance(input_path, str): 448 input_path = [input_path] * len(events) 449 450 # Get the order of the events when sorted by slide/tile 451 order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__())) 452 453 # Allocate the list to size 454 crops = [[]] * len(events) 455 last_tile = None 456 images = None # Holds large numpy arrays, so expensive to compare 457 # Iterate through in slide/tile sorted order 458 for i in order: 459 if last_tile != events[i].tile: 460 # Gather the frame images, preserving them for the next event 461 frames = Frame.get_frames(events[i].tile, channels) 462 if isinstance(apply_gain, bool): 463 apply = [apply_gain] * len(frames) 464 else: 465 apply = apply_gain 466 images = [f.get_image(input_path[i], a) for f, a in zip(frames, apply)] 467 last_tile = events[i].tile 468 # Use the frame images to crop the event images 469 crops[i] = events[i].crop(images, crop_size[i], in_pixels) 470 return crops 471 472 @classmethod 473 def get_many_montages( 474 cls, 475 events: Sequence[Self], 476 channels: Sequence[int | str] = None, 477 composites: dict[int | str, tuple[float, float, float]] = None, 478 crop_size: int = 100, 479 in_pixels: bool = True, 480 input_path: str = None, 481 apply_gain: bool | Iterable[bool] = True, 482 **kwargs, 483 ) -> list[np.ndarray]: 484 """ 485 Convenience function for get_montage(), but for a list of events. More efficient 486 thank get_montage() when working with multiple events from the same tile. 487 :param events: a list of Event objects. 488 :param channels: the channels to extract images for. Defaults to all channels. 489 :param composites: dictionary of indices and RGB tuples for a composite. 490 :param crop_size: the square size of the image crop to get for this event. 491 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 492 :param input_path: the path to the input images. Defaults to None (uses the scan's path). 493 :param apply_gain: whether to apply scanner-calculated gain to the images, if 494 not already applied. If a list, matches the channels. 495 :param kwargs: montage options. See csi_images.make_montage() for more details. 496 :return: a list of numpy arrays representing the montages. 
497 """ 498 if len(events) == 0: 499 return [] 500 # Adapt singular inputs to lists of appropriate length 501 if isinstance(crop_size, int): 502 crop_size = [crop_size] * len(events) 503 if input_path is None or isinstance(input_path, str): 504 input_path = [input_path] * len(events) 505 506 # Get the order of the events when sorted by slide/tile 507 event_order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__())) 508 509 # Allocate the list to size 510 montages = [np.empty(0)] * len(events) 511 # Placeholder variables to avoid rereading the same tile 512 images = None # Holds large numpy arrays, so expensive to compare 513 order = None 514 rel_composites = None 515 last_tile = None 516 # Iterate through in slide/tile sorted order 517 for i in event_order: 518 if last_tile != events[i].tile: 519 channels_to_get, order, rel_composites = events[i].get_montage_channels( 520 channels, composites 521 ) 522 # Gather the frame images, preserving them for the next event 523 frames = Frame.get_frames(events[i].tile, channels_to_get) 524 if isinstance(apply_gain, bool): 525 apply = [apply_gain] * len(frames) 526 else: 527 apply = apply_gain 528 images = [f.get_image(input_path[i], a) for f, a in zip(frames, apply)] 529 last_tile = events[i].tile 530 # Use the frame images to crop the event images and make montages 531 crops = events[i].crop(images, crop_size[i], in_pixels) 532 montages[i] = csi_images.make_montage( 533 crops, order, rel_composites, **kwargs 534 ) 535 536 return montages 537 538 @classmethod 539 def get_and_save_many_crops( 540 cls, 541 events: list[Self], 542 output_path: str, 543 labels: Sequence[str], 544 ext: str = "auto", 545 additional_gain: Sequence[float] = None, 546 **kwargs, 547 ) -> None: 548 """ 549 Get and save the crops for a list of events, ensuring that there is no wasteful 550 reading and limiting the image data in memory to 1 tile at a time. This function 551 is more efficient that chaining get_crops() and save_crops() for each event or 552 get_many_crops() and then save_crops(). 553 :param events: list of events to get, crop, and save. 554 :param output_path: the folder to save the crops in. Will make if needed. 555 :param labels: the labels to save the crops with. See save_crops(). 556 :param ext: the file extension to save the crops as. See save_crops(). 557 :param additional_gain: additional gain to apply to the crops. If not None, must 558 match the length of the number of crop channels. 559 :param kwargs: see get_many_crops() for more parameters. 560 :return: 561 """ 562 unique_tiles = set([event.tile for event in events]) 563 564 for tile in unique_tiles: 565 # Get one tile's worth of event crops 566 tile_events = [e for e in events if e.tile == tile] 567 crops_list = cls.get_many_crops(tile_events, **kwargs) 568 for event, crops in zip(tile_events, crops_list): 569 # Apply any additional gains 570 if additional_gain is not None: 571 crops = [gain * crop for gain, crop in zip(additional_gain, crops)] 572 event.save_crops(crops, output_path, labels, ext) 573 574 @classmethod 575 def get_and_save_many_montages( 576 cls, 577 events: list[Self], 578 output_path: str, 579 ocular_names: bool = False, 580 tag: str = "", 581 **kwargs, 582 ) -> None: 583 """ 584 Save montages of the events to image files. 585 :param events: the events to get, montage, and save. 586 :param output_path: the folder to save the montages to. Will make if needed. 587 :param ocular_names: whether to use the OCULAR naming convention. 
        :param tag: a tag to append to the file name. Ignored if ocular_names is True.
        :param kwargs: see get_many_montages() for more parameters.
        """
        unique_tiles = set([event.tile for event in events])

        for tile in unique_tiles:
            # Get one tile's worth of event montages
            tile_events = [e for e in events if e.tile == tile]
            montages = cls.get_many_montages(tile_events, **kwargs)
            for event, montage in zip(tile_events, montages):
                event.save_montage(montage, output_path, ocular_names, tag)


class EventArray:
    """
    A class that holds a large number of events' data, making it easy to analyze and
    manipulate many events at once. A more separated version of the Event class.
    """

    INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y"]

    def __init__(
        self,
        info: pd.DataFrame = None,
        metadata: pd.DataFrame = None,
        features: pd.DataFrame = None,
    ):
        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y"
        if info is not None:
            # Special case: "roi" is often not required, so we'll fill it in if it's missing
            if "roi" not in info.columns:
                info["roi"] = 0
            if set(info.columns) != set(self.INFO_COLUMNS):
                raise ValueError(
                    f"EventArray.info must have columns: "
                    f"{self.INFO_COLUMNS}; had {list(info.columns)}"
                )
            # Copy first to avoid modifying the original
            info = info.copy()
            # Ensure that the columns are the right types
            info["slide_id"] = info["slide_id"].astype(str)
            info["tile"] = info["tile"].astype(np.uint16)
            info["roi"] = info["roi"].astype(np.uint8)
            info["x"] = info["x"].round().astype(np.uint16)
            info["y"] = info["y"].round().astype(np.uint16)
            # Ensure that the columns are in the right order
            info = info[self.INFO_COLUMNS]
        # All DataFrames must have the same number of rows
        if metadata is not None and (info is None or len(info) != len(metadata)):
            raise ValueError(
                "If EventArray.metadata is not None, it should match rows with .info"
            )
        if features is not None and (info is None or len(info) != len(features)):
            raise ValueError(
                "If EventArray.features is not None, it should match rows with .info"
            )
        # No columns named "metadata_", "features_", or "None"
        column_names = []
        if metadata is not None:
            column_names += metadata.columns.tolist()
        if features is not None:
            column_names += features.columns.tolist()
        if any([col.lower().startswith("metadata_") for col in column_names]):
            raise ValueError("EventArray column names cannot start with 'metadata_'")
        if any([col.lower().startswith("features_") for col in column_names]):
            raise ValueError("EventArray column names cannot start with 'features_'")
        if any([col.lower() == "none" for col in column_names]):
            raise ValueError("EventArray column names cannot be 'none'")

        self.info = info
        self.metadata = metadata
        self.features = features

    def __len__(self) -> int:
        # Convenience method to get the number of events
        if self.info is None:
            return 0
        else:
            return len(self.info)

    def __eq__(self, other):
        # Parse all possibilities for info
        if isinstance(self.info, pd.DataFrame):
            if isinstance(other.info, pd.DataFrame):
                if not self.info.equals(other.info):
                    return False
            else:
                return False
        elif self.info is None:
            if other.info is not None:
                return False

        # Parse all possibilities for metadata
        if isinstance(self.metadata, pd.DataFrame):
            if isinstance(other.metadata, pd.DataFrame):
                is_equal = self.metadata.equals(other.metadata)
                if not is_equal:
                    return False
            else:
                return False
        elif self.metadata is None:
            if other.metadata is not None:
                return False

        # Parse all possibilities for features
        if isinstance(self.features, pd.DataFrame):
            if isinstance(other.features, pd.DataFrame):
                is_equal = self.features.equals(other.features)
                if not is_equal:
                    return False
            else:
                return False
        elif self.features is None:
            if other.features is not None:
                return False

        return True

    def get_sort_order(
        self, by: Hashable | Sequence[Hashable], ascending: bool | Sequence[bool] = True
    ):
        """
        Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.
        :param by: name of the column(s) to sort by.
        :param ascending: whether to sort in ascending order; can be a list to match by
        :return: the order of the indices to sort by.
        """
        columns = self.get(by)
        return columns.sort_values(by=by, ascending=ascending).index

    def sort(
        self,
        by: Hashable | Sequence[Hashable],
        ascending: bool | Sequence[bool] = True,
    ) -> Self:
        """
        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
        :param by: name of the column(s) to sort by.
        :param ascending: whether to sort in ascending order; can be a list to match by
        :return: a new, sorted EventArray.
        """
        order = self.get_sort_order(by, ascending)
        info = self.info.loc[order].reset_index(drop=True)
        if self.metadata is not None:
            metadata = self.metadata.loc[order].reset_index(drop=True)
        else:
            metadata = None
        if self.features is not None:
            features = self.features.loc[order].reset_index(drop=True)
        else:
            features = None
        return EventArray(info, metadata, features)

    def get(self, column_names: Hashable | Sequence[Hashable]) -> pd.DataFrame:
        """
        Get a DataFrame with the specified columns from the EventArray, by value.
        :param column_names: the names of the columns to get.
        :return: a DataFrame with the specified columns.
        """
        if isinstance(column_names, Hashable):
            column_names = [column_names]  # Drop into a list for the loop
        columns = []
        for column_name in column_names:
            if column_name in self.info.columns:
                columns.append(self.info[column_name])
            elif self.metadata is not None and column_name in self.metadata.columns:
                columns.append(self.metadata[column_name])
            elif self.features is not None and column_name in self.features.columns:
                columns.append(self.features[column_name])
            else:
                raise ValueError(f"Column {column_name} not found in EventArray")
        return pd.concat(columns, axis=1)

    def rows(self, rows: Sequence[Hashable]) -> Self:
        """
        Get a subset of the EventArray rows based on a boolean or integer index, by value.
        :param rows: row labels, indices, or boolean mask; anything for .loc[]
        :return: a new EventArray with the subset of events.
766 """ 767 info = self.info.loc[rows].reset_index(drop=True) 768 if self.metadata is not None: 769 metadata = self.metadata.loc[rows].reset_index(drop=True) 770 else: 771 metadata = None 772 if self.features is not None: 773 features = self.features.loc[rows].reset_index(drop=True) 774 else: 775 features = None 776 return EventArray(info, metadata, features) 777 778 def copy(self) -> Self: 779 """ 780 Create a deep copy of the EventArray. 781 :return: a deep copy of the EventArray. 782 """ 783 return EventArray( 784 info=self.info.copy(), 785 metadata=None if self.metadata is None else self.metadata.copy(), 786 features=None if self.features is None else self.features.copy(), 787 ) 788 789 # TODO: add a "filter" convenience function that takes a column name and values to filter by 790 791 def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None: 792 """ 793 Add metadata to the EventArray. Removes the need to check if metadata is None. 794 Overwrites any existing metadata with the same column names as the new metadata. 795 :param new_metadata: the metadata to add. 796 """ 797 if len(self) != len(new_metadata): 798 raise ValueError("New metadata must match length of existing info") 799 800 if self.metadata is None: 801 self.metadata = new_metadata 802 else: 803 if isinstance(new_metadata, pd.Series): 804 self.metadata[new_metadata.name] = new_metadata 805 else: 806 # It's a DataFrame 807 self.metadata[new_metadata.columns] = new_metadata 808 809 def add_features(self, new_features: pd.Series | pd.DataFrame) -> None: 810 """ 811 Add features to the EventArray. Removes the need to check if features is None. 812 Overwrites any existing features with the same column names as the new features. 813 :param new_features: the features to add. 814 """ 815 if len(self) != len(new_features): 816 raise ValueError("New features must match length of existing info") 817 818 if self.features is None: 819 self.features = new_features 820 else: 821 if isinstance(new_features, pd.Series): 822 self.features[new_features.name] = new_features 823 else: 824 # It's a DataFrame 825 self.features[new_features.columns] = new_features 826 827 @classmethod 828 def merge(cls, events: Iterable[Self]) -> Self: 829 """ 830 Combine EventArrays in a list into a single EventArray. 831 :param events: the new list of events. 832 """ 833 all_info = [] 834 all_metadata = [] 835 all_features = [] 836 for event_array in events: 837 # Skip empty EventArrays 838 if event_array.info is not None: 839 all_info.append(event_array.info) 840 if event_array.metadata is not None: 841 all_metadata.append(event_array.metadata) 842 if event_array.features is not None: 843 all_features.append(event_array.features) 844 if len(all_info) == 0: 845 return EventArray() 846 else: 847 all_info = pd.concat(all_info, ignore_index=True) 848 if len(all_metadata) == 0: 849 all_metadata = None 850 else: 851 all_metadata = pd.concat(all_metadata, ignore_index=True) 852 if len(all_features) == 0: 853 all_features = None 854 else: 855 all_features = pd.concat(all_features, ignore_index=True) 856 857 return EventArray(all_info, all_metadata, all_features) 858 859 def to_events( 860 self, 861 scans: Scan | Iterable[Scan], 862 ignore_missing_scans=True, 863 ignore_metadata=False, 864 ignore_features=False, 865 ) -> list[Event]: 866 """ 867 Get the events in the EventArray as a list of events. 868 :param scans: the scans that the events belong to, auto-matched by slide_id. 869 Pass None if you don't care about scan metadata (pass ignore_missing_scans). 
        :param ignore_missing_scans: whether to create placeholder scans for events without scans.
        :param ignore_metadata: whether to ignore metadata or not
        :param ignore_features: whether to ignore features or not
        :return: a list of Event objects.
        """
        if isinstance(scans, Scan):
            scans = [scans]
        scans = {scan.slide_id: scan for scan in scans}
        events = []
        for i in range(len(self.info)):
            # Determine the associated scan
            slide_id = self.info["slide_id"][i]
            if slide_id not in scans:
                if ignore_missing_scans:
                    # Create a placeholder scan if the scan is missing
                    scan = Scan.make_placeholder(
                        slide_id,
                        self.info["tile"][i],
                        self.info["roi"][i],
                    )
                else:
                    raise ValueError(
                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
                    )
            else:
                scan = scans[slide_id]

            # Prepare the metadata and features
            if ignore_metadata or self.metadata is None:
                metadata = None
            else:
                # This Series creation method is less efficient,
                # but required for preserving dtypes
                metadata = pd.Series(
                    {col: self.metadata.loc[i, col] for col in self.metadata.columns},
                    dtype=object,
                )
            if ignore_features or self.features is None:
                features = None
            else:
                features = pd.Series(
                    {col: self.features.loc[i, col] for col in self.features.columns},
                    dtype=object,
                )
            # Create the event and append it to the list
            events.append(
                Event(
                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
                    self.info["x"][i],
                    self.info["y"][i],
                    metadata=metadata,
                    features=features,
                )
            )
        return events

    @classmethod
    def from_events(cls, events: Iterable[Event]) -> Self:
        """
        Create an EventArray from a list of events.
        :param events: the list of events to convert.
        """
        info = pd.DataFrame(
            {
                "slide_id": [event.tile.scan.slide_id for event in events],
                "tile": [event.tile.n for event in events],
                "roi": [event.tile.n_roi for event in events],
                "x": [event.x for event in events],
                "y": [event.y for event in events],
            }
        )
        metadata_list = [event.metadata for event in events]
        # Iterate through and ensure that all metadata is the same shape
        for metadata in metadata_list:
            if type(metadata) != type(metadata_list[0]):
                raise ValueError("All metadata must be the same type.")
            if metadata is not None and metadata.shape != metadata_list[0].shape:
                raise ValueError("All metadata must be the same shape.")
        if metadata_list[0] is None:
            metadata = None
        else:
            metadata = pd.DataFrame(metadata_list)
        features_list = [event.features for event in events]
        # Iterate through and ensure that all features are the same shape
        for features in features_list:
            if type(features) != type(features_list[0]):
                raise ValueError("All features must be the same type.")
            if features is not None and features.shape != features_list[0].shape:
                raise ValueError("All features must be the same shape.")
        if features_list[0] is None:
            features = None
        else:
            features = pd.DataFrame(features_list)
        return EventArray(info=info, metadata=metadata, features=features)

    def to_dataframe(self) -> pd.DataFrame:
        """
        Convert all the data in the EventArray to a single DataFrame.
        :return: a DataFrame with all the data in the EventArray.
969 """ 970 # Make a copy of the info DataFrame and prepend "info_" to the column names 971 output = self.info.copy() 972 # Combine with the metadata and prepend "metadata_" to the column names 973 if self.metadata is not None: 974 metadata = self.metadata.copy() 975 metadata.columns = [f"metadata_{col}" for col in metadata.columns] 976 output = pd.concat([output, metadata], axis=1) 977 # Combine with the features and prepend "features_" to the column names 978 if self.features is not None: 979 features = self.features.copy() 980 features.columns = [f"features_{col}" for col in features.columns] 981 output = pd.concat([output, features], axis=1) 982 return output 983 984 @classmethod 985 def from_dataframe( 986 cls, df, metadata_prefix: str = "metadata_", features_prefix: str = "features_" 987 ) -> Self: 988 """ 989 From a single, special DataFrame, create an EventArray. 990 :param df: the DataFrame to convert to an EventArray. 991 :param metadata_prefix: the prefix for metadata columns. 992 :param features_prefix: the prefix for features columns. 993 :return: a DataFrame with all the data in the EventArray. 994 """ 995 # Split the columns into info, metadata, and features and strip prefix 996 info = df[[col for col in df.columns if col in cls.INFO_COLUMNS]].copy() 997 if info.size == 0: 998 info = None 999 metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy() 1000 metadata.columns = [ 1001 col.replace(metadata_prefix, "") for col in metadata.columns 1002 ] 1003 if metadata.size == 0: 1004 metadata = None 1005 features = df[[col for col in df.columns if col.startswith("features_")]].copy() 1006 features.columns = [ 1007 col.replace(features_prefix, "") for col in features.columns 1008 ] 1009 if features.size == 0: 1010 features = None 1011 return cls(info=info, metadata=metadata, features=features) 1012 1013 @classmethod 1014 def from_mask( 1015 cls, 1016 mask: np.ndarray, 1017 slide_id: str, 1018 tile_n: int, 1019 n_roi: int = 0, 1020 include_cell_id: bool = True, 1021 images: list[np.ndarray] = None, 1022 image_labels: list[str] = None, 1023 properties: list[str] = None, 1024 ) -> Self: 1025 """ 1026 Extract events from a mask DataFrame, including metadata and features. 1027 :param mask: the mask to extract events from. 1028 :param slide_id: the slide ID the mask is from. 1029 :param tile_n: the tile number the mask is from. 1030 :param n_roi: the ROI number the mask is from. 1031 :param include_cell_id: whether to include the cell_id, or numerical 1032 mask label, as metadata in the EventArray. 1033 :param images: the intensity images to extract features from. 1034 :param image_labels: the labels for the intensity images. 1035 :param properties: list of properties to extract in addition to the defaults: 1036 :return: EventArray corresponding to the mask labels. 1037 """ 1038 if csi_images is None: 1039 raise ModuleNotFoundError( 1040 "imageio libraries not installed! " 1041 "run `pip install csi_images[imageio]` to resolve." 
            )
        # Gather mask_info
        if images is not None and image_labels is not None:
            if len(images) != len(image_labels):
                raise ValueError("Intensity images and labels must match lengths.")

        mask_info = csi_images.extract_mask_info(mask, images, image_labels, properties)

        if len(mask_info) == 0:
            return EventArray()

        # Combine provided info and mask info
        info = pd.DataFrame(
            {
                "slide_id": slide_id,
                "tile": tile_n,
                "roi": n_roi,
                "x": mask_info["x"],
                "y": mask_info["y"],
            },
        )
        # Extract a metadata column if desired
        if include_cell_id:
            metadata = pd.DataFrame({"cell_id": mask_info["id"]})
        else:
            metadata = None
        # If any additional properties were extracted, add them as features
        mask_info = mask_info.drop(columns=["id", "x", "y"], errors="ignore")
        if len(mask_info.columns) > 0:
            features = mask_info
        else:
            features = None
        return EventArray(info, metadata, features)

    def save_csv(self, output_path: str) -> bool:
        """
        Save the events to a CSV file, including metadata and features.
        :param output_path: the path to save the CSV file to.
        :return: True if the file was saved successfully.
        """
        if not output_path.endswith(".csv"):
            output_path += ".csv"
        self.to_dataframe().to_csv(output_path, index=False)
        return os.path.exists(output_path)

    @classmethod
    def load_csv(
        cls,
        input_path: str,
        metadata_prefix: str = "metadata_",
        features_prefix: str = "features_",
    ) -> Self:
        """
        Load the events from a CSV file, including metadata and features.
        :param input_path: the path to the CSV file.
        :param metadata_prefix: the prefix for metadata columns.
        :param features_prefix: the prefix for features columns.
        :return: an EventArray with the loaded data.
        """
        # Load the CSV file
        df = pd.read_csv(input_path)
        return cls.from_dataframe(df, metadata_prefix, features_prefix)

    def save_hdf5(self, output_path: str) -> bool:
        """
        Save the events to an HDF5 file, including metadata and features.
        Uses the pandas-provided HDF5 functions for ease, and external compatibility,
        though these files are slightly harder to view in HDFView or similar.
        :param output_path: the path to save the HDF5 file to.
        :return: True if the file was saved successfully.
        """
        if not output_path.endswith(".hdf5") and not output_path.endswith(".h5"):
            output_path += ".hdf5"
        # Open the output_path as an HDF5 file
        with pd.HDFStore(output_path) as store:
            # Store the dataframes in the HDF5 file
            if self.info is not None:
                store.put("info", self.info, index=False)
            if self.metadata is not None:
                store.put("metadata", self.metadata, index=False)
            if self.features is not None:
                store.put("features", self.features, index=False)
        return os.path.exists(output_path)

    @classmethod
    def load_hdf5(cls, input_path: str) -> Self:
        """
        Load the events from an HDF5 file, including metadata and features.
        :param input_path: the path to the HDF5 file.
        :return: an EventArray with the loaded data.
        """
        # Open the input_path as an HDF5 file
        with pd.HDFStore(input_path, "r") as store:
            # Load the dataframes from the HDF5 file
            info = store.get("info") if "info" in store else None
            metadata = store.get("metadata") if "metadata" in store else None
            features = store.get("features") if "features" in store else None
        return cls(info=info, metadata=metadata, features=features)

    def save_ocular(self, output_path: str, event_type: str = "cells"):
        """
        Save the events to an OCULAR file. Relies on the dataframe originating
        from an OCULAR file (same columns; duplicate metadata/info).
        :param output_path: the folder to save the OCULAR files in.
        :param event_type: "cells" or "others".
        :return: None
        """
        if pyreadr is None:
            raise ModuleNotFoundError(
                "pyreadr not installed! Install pyreadr directly "
                "or run `pip install csi-images[rds]` option to resolve."
            )
        if event_type == "cells":
            file_stub = "rc-final"
        elif event_type == "others":
            file_stub = "others-final"
        else:
            raise ValueError("Invalid event type. Must be cells or others.")

        # Ensure good metadata
        metadata = pd.DataFrame(
            {
                "slide_id": self.info["slide_id"],
                "frame_id": self.info["tile"],
                "cell_id": (
                    self.metadata["cell_id"]
                    if "cell_id" in self.metadata.columns
                    else range(len(self.info))
                ),
                "cellx": self.info["x"],
                "celly": self.info["y"],
            }
        )
        if self.metadata is not None:
            metadata[self.metadata.columns] = self.metadata.copy()

        # Check for the "ocular_interesting" column
        if event_type == "cells":
            if "ocular_interesting" in metadata.columns:
                interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool)
            elif "hcpc" in metadata.columns:
                # Interesting cells don't get an hcpc designation, leaving them as -1
                interesting_rows = (
                    metadata["hcpc"].to_numpy() == -1
                )  # interesting cells
            else:
                interesting_rows = []
            if sum(interesting_rows) > 0:
                # Split the metadata into interesting and regular
                interesting_events = self.rows(interesting_rows)
                interesting_df = pd.concat(
                    [interesting_events.features, interesting_events.metadata], axis=1
                )
                data_events = self.rows(~interesting_rows)
                data_df = pd.concat(
                    [data_events.features, data_events.metadata], axis=1
                )
                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")

                # Drop particular columns for "interesting"
                interesting_df = interesting_df.drop(
                    [
                        "clust",
                        "hcpc",
                        "frame_id",
                        "cell_id",
                        "unique_id",
                        "ocular_interesting",
                    ],
                    axis=1,
                    errors="ignore",
                )
                # Save both .csv and .rds
                interesting_stub = os.path.join(output_path, "ocular_interesting")
                interesting_df.to_csv(f"{interesting_stub}.csv")
                # Suppress pandas FutureWarning
                with warnings.catch_warnings():
                    warnings.simplefilter(action="ignore", category=FutureWarning)
                    pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df)
            else:
                data_df = pd.concat([self.features, metadata], axis=1)
        else:
            # Get all data and reset_index (will copy it)
            data_df = pd.concat([self.features, metadata], axis=1)

        # Split based on cluster number to conform to *-final[1-4].rds
        n_clusters = max(data_df["clust"]) + 1
        split_idx = [round(i * n_clusters / 4) for i in range(5)]
        for i in range(4):
            subset = (split_idx[i] <= data_df["clust"]) & (
                data_df["clust"] < split_idx[i + 1]
            )
            data_df.loc[subset, "hcpc"] = i + 1
            subset = data_df[subset].reset_index(drop=True)
            # Suppress pandas FutureWarning
            with warnings.catch_warnings():
                warnings.simplefilter(action="ignore", category=FutureWarning)
                pyreadr.write_rds(
                    os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
                )

        # Create new example cell strings
        data_df["example_cell_id"] = (
            data_df["slide_id"]
            + " "
            + data_df["frame_id"].astype(str)
            + " "
1249 + data_df["cell_id"].astype(str) 1250 + " " 1251 + data_df["cellx"].astype(int).astype(str) 1252 + " " 1253 + data_df["celly"].astype(int).astype(str) 1254 ) 1255 # Find averagable data columns 1256 if "cellcluster_id" in data_df.columns: 1257 end_idx = data_df.columns.get_loc("cellcluster_id") 1258 else: 1259 end_idx = data_df.columns.get_loc("slide_id") 1260 avg_cols = data_df.columns[:end_idx].tolist() 1261 # Group by cluster and average 1262 data_df = data_df.groupby("clust").agg( 1263 **{col: (col, "mean") for col in avg_cols}, 1264 count=("clust", "size"), # count rows in each cluster 1265 example_cells=("example_cell_id", lambda x: ",".join(x)), 1266 hcpc=("hcpc", lambda x: x.iloc[0]), 1267 ) 1268 data_df = data_df.reset_index() # Do NOT drop, index is "clust" 1269 # Create new columns 1270 metadata = pd.DataFrame( 1271 { 1272 "count": data_df["count"], 1273 "example_cells": data_df["example_cells"], 1274 "clust": data_df["clust"].astype(int), 1275 "hcpc": data_df["hcpc"].astype(int), 1276 "id": data_df["clust"].astype(int).astype(str), 1277 "cccluster": "0", # Dummy value 1278 "ccdistance": 0.0, # Dummy value 1279 "rownum": list(range(len(data_df))), 1280 "framegroup": 0, # Dummy value 1281 } 1282 ) 1283 # Need to pad the features to 761 columns, as per OCULAR report needs 1284 additional_columns = range(len(avg_cols), 761) 1285 if len(additional_columns) > 0: 1286 padding = pd.DataFrame( 1287 np.zeros((len(data_df), len(additional_columns))), 1288 columns=[f"pad{i}" for i in additional_columns], 1289 ) 1290 data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1) 1291 else: 1292 data_df = pd.concat([data_df[avg_cols], metadata], axis=1) 1293 1294 # Save the cluster data 1295 data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv")) 1296 # Suppress pandas FutureWarning 1297 with warnings.catch_warnings(): 1298 warnings.simplefilter(action="ignore", category=FutureWarning) 1299 pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df) 1300 1301 @classmethod 1302 def load_ocular( 1303 cls, 1304 input_path: str, 1305 event_type="cells", 1306 cell_data_files=( 1307 "rc-final1.rds", 1308 "rc-final2.rds", 1309 "rc-final3.rds", 1310 "rc-final4.rds", 1311 "ocular_interesting.rds", 1312 ), 1313 others_data_files=( 1314 "others-final1.rds", 1315 "others-final2.rds", 1316 "others-final3.rds", 1317 "others-final4.rds", 1318 ), 1319 atlas_data_files=( 1320 "ocular_interesting.rds", 1321 "ocular_not_interesting.rds", 1322 ), 1323 drop_common_events=True, 1324 ) -> Self: 1325 """ 1326 1327 :param input_path: 1328 :param event_type: 1329 :param cell_data_files: 1330 :param others_data_files: 1331 :param atlas_data_files: 1332 :param drop_common_events: 1333 :return: 1334 """ 1335 if pyreadr is None: 1336 raise ModuleNotFoundError( 1337 "pyreadr not installed! Install pyreadr directly " 1338 "or run `pip install csi-images[rds]` option to resolve." 
            )
        # Check if the input path is a directory or a file
        if os.path.isfile(input_path):
            data_files = [os.path.basename(input_path)]
            input_path = os.path.dirname(input_path)
        if event_type == "cells":
            data_files = cell_data_files
        elif event_type == "others":
            data_files = others_data_files
        else:
            raise ValueError("Invalid event type.")

        # Load the data from the OCULAR files
        file_data = {}
        for file in data_files:
            file_path = os.path.join(input_path, file)
            if not os.path.isfile(file_path):
                warnings.warn(f"{file} not found in {input_path}")
                continue
            file_data[file] = pyreadr.read_r(file_path)
            # Get the DataFrame associated with None (pyreadr dict quirk)
            file_data[file] = file_data[file][None]
            if len(file_data[file]) == 0:
                # File gets dropped from the dict
                file_data.pop(file)
                warnings.warn(f"{file} has no cells")
                continue

            # Drop common cells if requested and in this file
            if (
                file in atlas_data_files
                and drop_common_events
                and "catalogue_classification" in file_data[file]
            ):
                common_cell_indices = (
                    file_data[file]["catalogue_classification"] == "common_cell"
                )
                file_data[file] = file_data[file][common_cell_indices == False]

            if len(file_data[file]) == 0:
                # File gets dropped from the dict
                file_data.pop(file)
                warnings.warn(f"{file} has no cells after dropping common cells")
                continue

            # Extract frame_id and cell_id
            # DAPI- events already have frame_id cell_id outside rowname
            if event_type == "cells" and "frame_id" not in file_data[file].columns:
                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
                # get frame_id cell_id from rownames column and split into two columns
                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
                if len(split_res.columns) != 2:
                    warnings.warn(
                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
                    )
                # then assign it back to the dataframe
                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
            # Ensure frame_id and cell_id are integers
            file_data[file]["frame_id"] = file_data[file]["frame_id"].astype("int")
            file_data[file]["cell_id"] = file_data[file]["cell_id"].astype("int")
            # reset indexes since they can cause NaN values in concat
            file_data[file] = file_data[file].reset_index(drop=True)

        # Merge the data from all files
        if len(file_data) == 0:
            return EventArray()
        elif len(file_data) == 1:
            data = [file_data[file] for file in file_data.keys()][0]
        else:
            data = pd.concat(file_data.values())

        # Others is missing the "slide_id". Insert it right before the "frame_id" column
Insert it right before "frame_id" column 1411 if event_type == "others" and "slide_id" not in data.columns: 1412 if os.path.basename(input_path) == "ocular": 1413 slide_id = os.path.basename(os.path.dirname(input_path)) 1414 else: 1415 slide_id = "UNKNOWN" 1416 data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id) 1417 1418 # Sort according to ascending cell_id to keep the original, which is in manual_df 1419 data = data.sort_values(by=["cell_id"], ascending=True) 1420 # Filter out duplicates by x & y 1421 data = data.assign( 1422 unique_id=data["slide_id"] 1423 + "_" 1424 + data["frame_id"].astype(str) 1425 + "_" 1426 + data["cellx"].astype(int).astype(str) 1427 + "_" 1428 + data["celly"].astype(int).astype(str) 1429 ) 1430 data = data.drop_duplicates(subset=["unique_id"], keep="first") 1431 # Normal unique_id is with cell_id 1432 data = data.assign( 1433 unique_id=data["slide_id"] 1434 + "_" 1435 + data["frame_id"].astype(str) 1436 + "_" 1437 + data["cell_id"].astype(str) 1438 ) 1439 data = data.reset_index(drop=True) 1440 # All columns up to "slide_id" are features; drop the "slide_id" 1441 features = data.loc[:, :"slide_id"].iloc[:, :-1] 1442 data = data.loc[:, "slide_id":] 1443 # Grab the info columns 1444 info = data[["slide_id", "frame_id", "cellx", "celly"]] 1445 info.columns = ["slide_id", "tile", "x", "y"] 1446 info = info.assign(roi=0) # OCULAR only works on 1 ROI, as far as known 1447 info = info[["slide_id", "tile", "roi", "x", "y"]] 1448 # Metadata has duplicate columns for later convenience 1449 metadata = data 1450 # Certain columns tend to be problematic with mixed data formats... 1451 for col in ["TRITC", "CY5", "FITC"]: 1452 if col in metadata: 1453 labels = { 1454 "False": False, 1455 "True": True, 1456 "FALSE": False, 1457 "TRUE": True, 1458 False: False, 1459 True: True, 1460 } 1461 metadata[col] = metadata[col].map(labels).astype(bool) 1462 for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]: 1463 if col in metadata: 1464 metadata[col] = metadata[col].fillna(-1).astype(int) 1465 return EventArray(info, metadata, features)
287 """ 288 crops = {} 289 for file in glob.glob(os.path.join(input_path, f"{self}-*")): 290 label = os.path.splitext(os.path.basename(file))[0].split("-")[-1] 291 # Skip if we have labels to target 292 if labels is not None and label not in labels: 293 continue 294 crops[label] = imageio.imread(file) 295 return crops 296 297 def get_montage_channels( 298 self, 299 channels: Sequence[int | str] | None, 300 composites: dict[int | str, tuple[float, float, float]] | None, 301 ) -> tuple[list[int], list[int], dict[int, tuple[float, float, float]]]: 302 """ 303 Get the channel names for the montage from the event's tile. 304 :param channels: channel indices or names for grayscale channels 305 :param composites: dictionary of channel indices or names and RGB values 306 :return: (1) channel indices to retrieve, 307 (2) relative grayscale channel indices, and 308 (3) composite channel indices and RGB values. 309 """ 310 if channels is None: 311 channels = list(range(len(self.tile.scan.channels))) 312 if (len(channels) == 0) and (composites is None or len(composites) == 0): 313 raise ValueError("Must provide at least one channel type to montage") 314 315 channels_to_get = [] 316 317 # Build the list of channels to retrieve 318 if channels is not None: 319 if isinstance(channels[0], str): 320 channels = self.tile.scan.get_channel_indices(channels) 321 channels_to_get += channels 322 order = list(range(len(channels))) # Always the first n channels 323 else: 324 order = None 325 326 if composites is not None: 327 relative_composites = {} # Relative indices for retrieved channels 328 # Convert to scan indices 329 rgb_channels = list(composites.keys()) 330 if isinstance(rgb_channels[0], str): 331 rgb_channels = self.tile.scan.get_channel_indices(rgb_channels) 332 # Find the index or add to the end 333 for channel, rgb in zip(rgb_channels, composites.values()): 334 if channel not in channels_to_get: 335 channels_to_get.append(channel) 336 relative_composites[channel] = rgb 337 else: 338 relative_composites[channels_to_get.index(channel)] = rgb 339 else: 340 relative_composites = None 341 342 return channels_to_get, order, relative_composites 343 344 def get_montage( 345 self, 346 channels: Sequence[int | str] = None, 347 composites: dict[int | str, tuple[float, float, float]] = None, 348 crop_size: int = 100, 349 in_pixels: bool = True, 350 input_path: str = None, 351 apply_gain: bool = True, 352 **kwargs, 353 ) -> np.ndarray: 354 """ 355 Convenience function for getting frame images and creating a montage. Mirrors 356 csi_images.make_montage(). Convenient for a single event's montage, but less 357 efficient when for multiple events from the same tile. 358 :param channels: the channels to use for black-and-white montages. 359 :param composites: dictionary of indices and RGB tuples for a composite. 360 :param crop_size: the square size of the image crop to get for this event. 361 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 362 :param input_path: the path to the input images. Defaults to None (uses the scan's path). 363 :param apply_gain: whether to apply scanner-calculated gain to the images, if 364 not already applied. If a list, matches the channels. 365 :param kwargs: montage options. See csi_images.make_montage() for more details. 366 :return: numpy array representing the montage. 
367 """ 368 channels, order, composites = self.get_montage_channels(channels, composites) 369 images = self.get_crops(crop_size, in_pixels, input_path, channels, apply_gain) 370 return csi_images.make_montage(images, order, composites, **kwargs) 371 372 def save_montage( 373 self, 374 montage: np.ndarray, 375 output_path: str, 376 ocular_names: bool = False, 377 tag: str = "", 378 ): 379 """ 380 Save the montage as a JPEG image with a set name. 381 :param montage: the montage to save. 382 :param output_path: the folder to save the montage in. Wil make if needed. 383 :param ocular_names: whether to use the OCULAR naming convention. 384 :param tag: a tag to append to the file name. Ignored if ocular_names is True. 385 :return: None 386 """ 387 if csi_images is None or imageio is None: 388 raise ModuleNotFoundError( 389 "imageio libraries not installed! " 390 "run `pip install csi_images[imageio]` to resolve." 391 ) 392 393 montage = csi_images.scale_bit_depth(montage, np.uint8) 394 395 if ocular_names: 396 if "cell_id" not in self.metadata.index: 397 raise ValueError( 398 "Event metadata must include 'cell_id' for OCULAR naming." 399 ) 400 file = f"{self.tile.n}-{self.metadata['cell_id']}-{self.x}-{self.y}.jpeg" 401 else: 402 file = f"{self}{tag}.jpeg" 403 404 os.makedirs(output_path, exist_ok=True) 405 imageio.imwrite(os.path.join(output_path, file), montage, quality=80) 406 407 def load_montage(self, input_path: str, tag: str = "") -> np.ndarray: 408 """ 409 Loads the montage from a file saved by Event.save_montage. 410 :param input_path: the path to the folder where the montage was saved. 411 :param tag: a string to add to the file name, before the extension. 412 :return: 413 """ 414 file = f"{self}{tag}.jpeg" 415 return imageio.imread(os.path.join(input_path, file)) 416 417 @classmethod 418 def get_many_crops( 419 cls, 420 events: Sequence[Self], 421 crop_size: int | Sequence[int] = 100, 422 in_pixels: bool = True, 423 input_path: str | Sequence[str] = None, 424 channels: Sequence[int | str] = None, 425 apply_gain: bool | Sequence[bool] = True, 426 ) -> list[list[np.ndarray]]: 427 """ 428 Get the crops for a list of events, ensuring that there is no wasteful reading 429 of the same tile multiple times. This function is more efficient than calling 430 get_crops() for each event. 431 :param events: the events to get crops for. 432 :param crop_size: the square size of the image crop to get for this event. 433 Defaults to four times the size of the event. 434 :param in_pixels: whether the crop size is in pixels or micrometers. 435 Defaults to pixels, and is ignored if crop_size is None. 436 :param input_path: the path to the input images. Will only work for lists of events 437 from the same scan. Defaults to None (uses the scan's path). 438 :param channels: the channels to extract images for. Defaults to all channels. 439 :param apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. Defaults to True. 440 Can be supplied as a list to apply gain to individual channels. 441 :return: a list of lists of cropped images for each event. 
442 """ 443 if len(events) == 0: 444 return [] 445 # Adapt singular inputs to lists of appropriate length 446 if isinstance(crop_size, int): 447 crop_size = [crop_size] * len(events) 448 if input_path is None or isinstance(input_path, str): 449 input_path = [input_path] * len(events) 450 451 # Get the order of the events when sorted by slide/tile 452 order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__())) 453 454 # Allocate the list to size 455 crops = [[]] * len(events) 456 last_tile = None 457 images = None # Holds large numpy arrays, so expensive to compare 458 # Iterate through in slide/tile sorted order 459 for i in order: 460 if last_tile != events[i].tile: 461 # Gather the frame images, preserving them for the next event 462 frames = Frame.get_frames(events[i].tile, channels) 463 if isinstance(apply_gain, bool): 464 apply = [apply_gain] * len(frames) 465 else: 466 apply = apply_gain 467 images = [f.get_image(input_path[i], a) for f, a in zip(frames, apply)] 468 last_tile = events[i].tile 469 # Use the frame images to crop the event images 470 crops[i] = events[i].crop(images, crop_size[i], in_pixels) 471 return crops 472 473 @classmethod 474 def get_many_montages( 475 cls, 476 events: Sequence[Self], 477 channels: Sequence[int | str] = None, 478 composites: dict[int | str, tuple[float, float, float]] = None, 479 crop_size: int = 100, 480 in_pixels: bool = True, 481 input_path: str = None, 482 apply_gain: bool | Iterable[bool] = True, 483 **kwargs, 484 ) -> list[np.ndarray]: 485 """ 486 Convenience function for get_montage(), but for a list of events. More efficient 487 thank get_montage() when working with multiple events from the same tile. 488 :param events: a list of Event objects. 489 :param channels: the channels to extract images for. Defaults to all channels. 490 :param composites: dictionary of indices and RGB tuples for a composite. 491 :param crop_size: the square size of the image crop to get for this event. 492 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 493 :param input_path: the path to the input images. Defaults to None (uses the scan's path). 494 :param apply_gain: whether to apply scanner-calculated gain to the images, if 495 not already applied. If a list, matches the channels. 496 :param kwargs: montage options. See csi_images.make_montage() for more details. 497 :return: a list of numpy arrays representing the montages. 
498 """ 499 if len(events) == 0: 500 return [] 501 # Adapt singular inputs to lists of appropriate length 502 if isinstance(crop_size, int): 503 crop_size = [crop_size] * len(events) 504 if input_path is None or isinstance(input_path, str): 505 input_path = [input_path] * len(events) 506 507 # Get the order of the events when sorted by slide/tile 508 event_order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__())) 509 510 # Allocate the list to size 511 montages = [np.empty(0)] * len(events) 512 # Placeholder variables to avoid rereading the same tile 513 images = None # Holds large numpy arrays, so expensive to compare 514 order = None 515 rel_composites = None 516 last_tile = None 517 # Iterate through in slide/tile sorted order 518 for i in event_order: 519 if last_tile != events[i].tile: 520 channels_to_get, order, rel_composites = events[i].get_montage_channels( 521 channels, composites 522 ) 523 # Gather the frame images, preserving them for the next event 524 frames = Frame.get_frames(events[i].tile, channels_to_get) 525 if isinstance(apply_gain, bool): 526 apply = [apply_gain] * len(frames) 527 else: 528 apply = apply_gain 529 images = [f.get_image(input_path[i], a) for f, a in zip(frames, apply)] 530 last_tile = events[i].tile 531 # Use the frame images to crop the event images and make montages 532 crops = events[i].crop(images, crop_size[i], in_pixels) 533 montages[i] = csi_images.make_montage( 534 crops, order, rel_composites, **kwargs 535 ) 536 537 return montages 538 539 @classmethod 540 def get_and_save_many_crops( 541 cls, 542 events: list[Self], 543 output_path: str, 544 labels: Sequence[str], 545 ext: str = "auto", 546 additional_gain: Sequence[float] = None, 547 **kwargs, 548 ) -> None: 549 """ 550 Get and save the crops for a list of events, ensuring that there is no wasteful 551 reading and limiting the image data in memory to 1 tile at a time. This function 552 is more efficient that chaining get_crops() and save_crops() for each event or 553 get_many_crops() and then save_crops(). 554 :param events: list of events to get, crop, and save. 555 :param output_path: the folder to save the crops in. Will make if needed. 556 :param labels: the labels to save the crops with. See save_crops(). 557 :param ext: the file extension to save the crops as. See save_crops(). 558 :param additional_gain: additional gain to apply to the crops. If not None, must 559 match the length of the number of crop channels. 560 :param kwargs: see get_many_crops() for more parameters. 561 :return: 562 """ 563 unique_tiles = set([event.tile for event in events]) 564 565 for tile in unique_tiles: 566 # Get one tile's worth of event crops 567 tile_events = [e for e in events if e.tile == tile] 568 crops_list = cls.get_many_crops(tile_events, **kwargs) 569 for event, crops in zip(tile_events, crops_list): 570 # Apply any additional gains 571 if additional_gain is not None: 572 crops = [gain * crop for gain, crop in zip(additional_gain, crops)] 573 event.save_crops(crops, output_path, labels, ext) 574 575 @classmethod 576 def get_and_save_many_montages( 577 cls, 578 events: list[Self], 579 output_path: str, 580 ocular_names: bool = False, 581 tag: str = "", 582 **kwargs, 583 ) -> None: 584 """ 585 Save montages of the events to image files. 586 :param events: the events to get, montage, and save. 587 :param output_path: the folder to save the montages to. Will make if needed. 588 :param ocular_names: whether to use the OCULAR naming convention. 
589 :param tag: a tag to append to the file name. Ignored if ocular_names is True. 590 :param kwargs: see get_many_montages() for more parameters. 591 """ 592 unique_tiles = set([event.tile for event in events]) 593 594 for tile in unique_tiles: 595 # Get one tile's worth of event crops 596 tile_events = [e for e in events if e.tile == tile] 597 montages = cls.get_many_montages(tile_events, **kwargs) 598 for event, montage in zip(tile_events, montages): 599 event.save_montage(montage, output_path, ocular_names, tag)
A class that represents a single event in a scan, making it easy to evaluate singular events. Required metadata is exposed as attributes, and optional metadata and features are stored as DataFrames.
Homogeneous transformation matrices for converting between scanner and slide coordinates. The matrices are 3x3, with the final column representing the translation in micrometers (um). For more information, see affine transformations.
Transformations are nominal, and accuracy is not guaranteed; this is due to imperfections in slides and alignment in the scanners. Units are in micrometers.
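As a rough sketch of how these homogeneous matrices are applied (using the nominal Axioscan 7 entries from the table above; the input coordinates are made up for illustration):

import numpy as np

# Nominal Axioscan 7 scan-to-slide transform from the table above
transform = np.array([
    [1, 0, 75000],
    [0, 1, 0],
    [0, 0, 1],
])
# A scan-frame position in micrometers, as a homogeneous column vector
scan_xy = np.array([[12000.0], [3500.0], [1.0]])
slide_xy = transform @ scan_xy
# slide_xy[0][0] == 87000.0, slide_xy[1][0] == 3500.0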
101 def get_scan_position(self) -> tuple[float, float]: 102 """ 103 Get the position of the event in the scanner's coordinate frame. 104 :return: the scan position of the event in micrometers (um). 105 """ 106 # Get overall pixel position 107 real_tile_height, real_tile_width = self.tile.scan.get_image_size() 108 pixel_x = self.x + (real_tile_width * self.tile.x) 109 pixel_y = self.y + (real_tile_height * self.tile.y) 110 # Convert to micrometers 111 x_um = pixel_x * self.tile.scan.pixel_size_um 112 y_um = pixel_y * self.tile.scan.pixel_size_um 113 # Add the scan's origin in the scanner frame 114 x_um += self.tile.scan.roi[self.tile.n_roi].origin_x_um 115 y_um += self.tile.scan.roi[self.tile.n_roi].origin_y_um 116 return x_um, y_um
Get the position of the event in the scanner's coordinate frame.
Returns
the scan position of the event in micrometers (um).
118 def get_slide_position(self) -> tuple[float, float]: 119 """ 120 Get the slide position of the event in micrometers (um). 121 :return: the slide position of the event. 122 """ 123 # Turn scan_position into a 3x1 vector 124 scan_position = self.get_scan_position() 125 scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]]) 126 127 # Multiply by the appropriate homogeneous matrix 128 if self.tile.scan.scanner_id.startswith(self.tile.scan.Type.AXIOSCAN7.value): 129 transform = self.SCAN_TO_SLIDE_TRANSFORM[self.tile.scan.Type.AXIOSCAN7] 130 elif self.tile.scan.scanner_id.startswith(self.tile.scan.Type.BZSCANNER.value): 131 transform = self.SCAN_TO_SLIDE_TRANSFORM[self.tile.scan.Type.BZSCANNER] 132 else: 133 raise ValueError(f"Scanner type {self.tile.scan.scanner_id} not supported.") 134 slide_position = np.matmul(transform, scan_position) 135 return float(slide_position[0][0]), float(slide_position[1][0])
Get the slide position of the event in micrometers (um).
Returns
the slide position of the event.
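A minimal usage sketch, assuming a scan object already loaded via the csi_scans utilities and that tile 42 exists in ROI 0 (the tile number and pixel coordinates are illustrative):

from csi_images.csi_tiles import Tile
from csi_images.csi_events import Event

tile = Tile(scan, 42, 0)           # scan is assumed to be an existing Scan object
event = Event(tile, x=310, y=620)  # pixel position within the tile's frame
x_um, y_um = event.get_scan_position()     # scanner coordinate frame
sx_um, sy_um = event.get_slide_position()  # slide coordinate frame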
137 def crop( 138 self, images: Iterable[np.ndarray], crop_size: int = 100, in_pixels: bool = True 139 ) -> list[np.ndarray]: 140 """ 141 Crop the event from the provided frame images. Use if you have already gotten 142 frame images; useful for cropping multiple events from the same frame image. 143 :param images: the frame images. 144 :param crop_size: the square size of the image crop to get for this event. 145 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 146 :return: image_size x image_size crops of the event in the provided frames. If 147 the event is too close to the edge, the crop will be smaller and not centered. 148 """ 149 # Convert a crop size in micrometers to pixels 150 if not in_pixels: 151 crop_size = round(crop_size / self.tile.scan.pixel_size_um) 152 image_height, image_width = 0, 0 153 for image in images: 154 if image_height == 0 and image_width == 0: 155 image_height, image_width = image.shape 156 else: 157 if image_height != image.shape[0] or image_width != image.shape[1]: 158 raise ValueError("All images must be the same size") 159 if image_height == 0 or image_width == 0: 160 raise ValueError("No images provided") 161 162 # Find the crop bounds 163 bounds = [ 164 self.x - (crop_size // 2) + 1, 165 self.y - (crop_size // 2) + 1, 166 self.x + math.ceil(crop_size / 2) + 1, 167 self.y + math.ceil(crop_size / 2) + 1, 168 ] 169 # Determine how much the bounds violate the image size 170 displacements = [ 171 max(0, -bounds[0]), 172 max(0, -bounds[1]), 173 max(0, bounds[2] - image_width), 174 max(0, bounds[3] - image_height), 175 ] 176 # Cap off the bounds 177 bounds = [ 178 max(0, bounds[0]), 179 max(0, bounds[1]), 180 min(image_width, bounds[2]), 181 min(image_height, bounds[3]), 182 ] 183 184 # Crop the images 185 crops = [] 186 for image in images: 187 # Create a blank image of the right size 188 crop = np.zeros((crop_size, crop_size), dtype=image.dtype) 189 190 # Insert the cropped image into the blank image, leaving a black buffer 191 # around the edges if the crop would go beyond the original image bounds 192 crop[ 193 displacements[1] : crop_size - displacements[3], 194 displacements[0] : crop_size - displacements[2], 195 ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]] 196 crops.append(crop) 197 return crops
Crop the event from the provided frame images. Use if you have already gotten frame images; useful for cropping multiple events from the same frame image.
Parameters
- images: the frame images.
- crop_size: the square size of the image crop to get for this event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns
crop_size x crop_size crops of the event in the provided frames. If the event is too close to the edge, the out-of-frame region is filled with black (zeros) and the event will not be centered in the crop.
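For example, a sketch that crops several events from one tile using frame images read only once (the event list, the default input path, and the gain flag are assumptions):

from csi_images.csi_frames import Frame

# events: a list of Event objects that all share the same tile
frames = Frame.get_frames(events[0].tile, None)              # None -> all channels
images = [frame.get_image(None, True) for frame in frames]   # default path, apply gain
crops_per_event = [event.crop(images, crop_size=50) for event in events]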
199 def get_crops( 200 self, 201 crop_size: int = 100, 202 in_pixels: bool = True, 203 input_path: str = None, 204 channels: Iterable[int | str] = None, 205 apply_gain: bool | Iterable[bool] = True, 206 ) -> list[np.ndarray]: 207 """ 208 Gets the frame images for this event and then crops the event from the images. 209 Convenient for retrieving a single event's crops, but less efficient when 210 retrieving multiple events from the same tile as it will reread the images. 211 :param crop_size: the square size of the image crop to get for this event. 212 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 213 :param input_path: the path to the input images. Defaults to None (uses the scan's path). 214 :param channels: the channels to extract images for. Defaults to all channels. 215 :param apply_gain: whether to apply scanner-calculated gain to the images, if 216 not already applied. If a list, matches the channels. 217 :return: a list of cropped images from the scan in the order of the channels. 218 """ 219 # This function validates channels 220 frames = Frame.get_frames(self.tile, channels) 221 # Convert individual inputs to lists of appropriate length 222 if isinstance(apply_gain, bool): 223 apply_gain = [apply_gain] * len(frames) 224 images = [f.get_image(input_path, a) for f, a in zip(frames, apply_gain)] 225 return self.crop(images, crop_size, in_pixels)
Gets the frame images for this event and then crops the event from the images. Convenient for retrieving a single event's crops, but less efficient when retrieving multiple events from the same tile as it will reread the images.
Parameters
- crop_size: the square size of the image crop to get for this event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
- input_path: the path to the input images. Defaults to None (uses the scan's path).
- channels: the channels to extract images for. Defaults to all channels.
- apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. If a list, matches the channels.
Returns
a list of cropped images from the scan in the order of the channels.
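A minimal sketch (the channel names are assumptions; use the names defined in your scan):

crops = event.get_crops(crop_size=100, channels=["DAPI", "CY5"])
# crops[0] and crops[1] are 100x100 arrays, in the order of the requested channels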
227 def save_crops( 228 self, 229 crops: Sequence[np.ndarray], 230 output_path: str, 231 labels: Sequence[str], 232 ext: str = "auto", 233 ): 234 """ 235 Save the crops to image files. 236 :param crops: the crops to save. Will save as RGB if 3 channel [h, w, 3] or 237 grayscale if 1 channel [h, w] or [h, w, 1]. 238 :param labels: the labels to append to the file name, usually the channel names 239 associated with each crop. 240 :param output_path: the folder to save the crops to. Will make if needed. 241 :param ext: the file extension to save the crops as. Defaults to "auto", which 242 will save as .tif for grayscale images and .jpg for RGB images. 243 :return: None 244 """ 245 if len(crops) != len(labels): 246 raise ValueError("Crops and labels must be the same length") 247 248 if csi_images is None or imageio is None: 249 raise ModuleNotFoundError( 250 "imageio libraries not installed! " 251 "run `pip install csi_images[imageio]` to resolve." 252 ) 253 254 os.makedirs(output_path, exist_ok=True) 255 256 for crop, label in zip(crops, labels): 257 if ext == "auto": 258 if len(crop.shape) == 2 or crop.shape[2] == 1: 259 file_extension = ".tif" 260 elif crop.shape[2] == 3: 261 file_extension = ".jpg" 262 else: 263 warnings.warn( 264 f"Image shape {crop.shape} not recognized; saving as .tif" 265 ) 266 file_extension = ".tif" 267 else: 268 file_extension = ext 269 file = os.path.join(output_path, f"{self}-{label}{file_extension}") 270 # TODO: add more file types here 271 if file_extension == ".tif": 272 imageio.imwrite(file, crop, compression="deflate") 273 elif file_extension in [".jpg", ".jpeg"]: 274 crop = csi_images.scale_bit_depth(crop, np.uint8) 275 imageio.imwrite(file, crop, quality=80) 276 else: 277 imageio.imwrite(file, crop)
Save the crops to image files.
Parameters
- crops: the crops to save. Will save as RGB if 3 channel [h, w, 3] or grayscale if 1 channel [h, w] or [h, w, 1].
- labels: the labels to append to the file name, usually the channel names associated with each crop.
- output_path: the folder to save the crops to. Will make if needed.
- ext: the file extension to save the crops as. Defaults to "auto", which will save as .tif for grayscale images and .jpg for RGB images.
Returns
None
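A sketch pairing get_crops() with save_crops() (channel names and the output folder are assumptions; requires the csi_images[imageio] extra):

labels = ["DAPI", "CY5"]
crops = event.get_crops(channels=labels)
event.save_crops(crops, "crops_out", labels)  # writes e.g. <event repr>-DAPI.tif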
279 def load_crops( 280 self, input_path: str, labels: list[str] = None 281 ) -> dict[str, np.ndarray]: 282 """ 283 Loads previously saved crop files from a folder. 284 :param input_path: folder containing crop files. 285 :param labels: optional label filter, will only return crops with these labels. 286 :return: a tuple of lists containing the crops and their labels. 287 """ 288 crops = {} 289 for file in glob.glob(os.path.join(input_path, f"{self}-*")): 290 label = os.path.splitext(os.path.basename(file))[0].split("-")[-1] 291 # Skip if we have labels to target 292 if labels is not None and label not in labels: 293 continue 294 crops[label] = imageio.imread(file) 295 return crops
Loads previously saved crop files from a folder.
Parameters
- input_path: folder containing crop files.
- labels: optional label filter, will only return crops with these labels.
Returns
a dictionary mapping each crop's label to its image array.
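A sketch reloading crops written by save_crops() (the folder and label are assumptions):

crops = event.load_crops("crops_out", labels=["DAPI"])
dapi = crops["DAPI"]  # numpy array keyed by its label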
297 def get_montage_channels( 298 self, 299 channels: Sequence[int | str] | None, 300 composites: dict[int | str, tuple[float, float, float]] | None, 301 ) -> tuple[list[int], list[int], dict[int, tuple[float, float, float]]]: 302 """ 303 Get the channel names for the montage from the event's tile. 304 :param channels: channel indices or names for grayscale channels 305 :param composites: dictionary of channel indices or names and RGB values 306 :return: (1) channel indices to retrieve, 307 (2) relative grayscale channel indices, and 308 (3) composite channel indices and RGB values. 309 """ 310 if channels is None: 311 channels = list(range(len(self.tile.scan.channels))) 312 if (len(channels) == 0) and (composites is None or len(composites) == 0): 313 raise ValueError("Must provide at least one channel type to montage") 314 315 channels_to_get = [] 316 317 # Build the list of channels to retrieve 318 if channels is not None: 319 if isinstance(channels[0], str): 320 channels = self.tile.scan.get_channel_indices(channels) 321 channels_to_get += channels 322 order = list(range(len(channels))) # Always the first n channels 323 else: 324 order = None 325 326 if composites is not None: 327 relative_composites = {} # Relative indices for retrieved channels 328 # Convert to scan indices 329 rgb_channels = list(composites.keys()) 330 if isinstance(rgb_channels[0], str): 331 rgb_channels = self.tile.scan.get_channel_indices(rgb_channels) 332 # Find the index or add to the end 333 for channel, rgb in zip(rgb_channels, composites.values()): 334 if channel not in channels_to_get: 335 channels_to_get.append(channel) 336 relative_composites[channel] = rgb 337 else: 338 relative_composites[channels_to_get.index(channel)] = rgb 339 else: 340 relative_composites = None 341 342 return channels_to_get, order, relative_composites
Resolve the montage's grayscale and composite channels into scan channel indices for the event's tile.
Parameters
- channels: channel indices or names for grayscale channels
- composites: dictionary of channel indices or names and RGB values
Returns
(1) channel indices to retrieve, (2) relative grayscale channel indices, and (3) composite channel indices and RGB values.
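A sketch of the resolution step (the channel names and the red tint are assumptions):

channel_indices, gray_order, rgb_map = event.get_montage_channels(
    channels=["DAPI"],
    composites={"CY5": (1.0, 0.0, 0.0)},
)
# channel_indices: scan channel indices to read; gray_order: positions of the
# grayscale panels among them; rgb_map: composite channel indices with RGB tints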
344 def get_montage( 345 self, 346 channels: Sequence[int | str] = None, 347 composites: dict[int | str, tuple[float, float, float]] = None, 348 crop_size: int = 100, 349 in_pixels: bool = True, 350 input_path: str = None, 351 apply_gain: bool = True, 352 **kwargs, 353 ) -> np.ndarray: 354 """ 355 Convenience function for getting frame images and creating a montage. Mirrors 356 csi_images.make_montage(). Convenient for a single event's montage, but less 357 efficient when for multiple events from the same tile. 358 :param channels: the channels to use for black-and-white montages. 359 :param composites: dictionary of indices and RGB tuples for a composite. 360 :param crop_size: the square size of the image crop to get for this event. 361 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 362 :param input_path: the path to the input images. Defaults to None (uses the scan's path). 363 :param apply_gain: whether to apply scanner-calculated gain to the images, if 364 not already applied. If a list, matches the channels. 365 :param kwargs: montage options. See csi_images.make_montage() for more details. 366 :return: numpy array representing the montage. 367 """ 368 channels, order, composites = self.get_montage_channels(channels, composites) 369 images = self.get_crops(crop_size, in_pixels, input_path, channels, apply_gain) 370 return csi_images.make_montage(images, order, composites, **kwargs)
Convenience function for getting frame images and creating a montage. Mirrors csi_images.make_montage(). Convenient for a single event's montage, but less efficient when retrieving multiple events from the same tile.
Parameters
- channels: the channels to use for black-and-white montages.
- composites: dictionary of indices and RGB tuples for a composite.
- crop_size: the square size of the image crop to get for this event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
- input_path: the path to the input images. Defaults to None (uses the scan's path).
- apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. If a list, matches the channels.
- kwargs: montage options. See csi_images.make_montage() for more details.
Returns
numpy array representing the montage.
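A minimal sketch (the channel names and RGB tints are assumptions):

montage = event.get_montage(
    channels=["DAPI", "CY5"],
    composites={"DAPI": (0.0, 0.0, 1.0), "CY5": (1.0, 0.0, 0.0)},
    crop_size=100,
)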
372 def save_montage( 373 self, 374 montage: np.ndarray, 375 output_path: str, 376 ocular_names: bool = False, 377 tag: str = "", 378 ): 379 """ 380 Save the montage as a JPEG image with a set name. 381 :param montage: the montage to save. 382 :param output_path: the folder to save the montage in. Wil make if needed. 383 :param ocular_names: whether to use the OCULAR naming convention. 384 :param tag: a tag to append to the file name. Ignored if ocular_names is True. 385 :return: None 386 """ 387 if csi_images is None or imageio is None: 388 raise ModuleNotFoundError( 389 "imageio libraries not installed! " 390 "run `pip install csi_images[imageio]` to resolve." 391 ) 392 393 montage = csi_images.scale_bit_depth(montage, np.uint8) 394 395 if ocular_names: 396 if "cell_id" not in self.metadata.index: 397 raise ValueError( 398 "Event metadata must include 'cell_id' for OCULAR naming." 399 ) 400 file = f"{self.tile.n}-{self.metadata['cell_id']}-{self.x}-{self.y}.jpeg" 401 else: 402 file = f"{self}{tag}.jpeg" 403 404 os.makedirs(output_path, exist_ok=True) 405 imageio.imwrite(os.path.join(output_path, file), montage, quality=80)
Save the montage as a JPEG image with a set name.
Parameters
- montage: the montage to save.
- output_path: the folder to save the montage in. Will make if needed.
- ocular_names: whether to use the OCULAR naming convention.
- tag: a tag to append to the file name. Ignored if ocular_names is True.
Returns
None
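A sketch (the output folder and tag are assumptions; requires the csi_images[imageio] extra):

event.save_montage(montage, "montages_out", tag="-gallery")
# or, with OCULAR-style file names (requires 'cell_id' in event.metadata):
# event.save_montage(montage, "montages_out", ocular_names=True)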
407 def load_montage(self, input_path: str, tag: str = "") -> np.ndarray: 408 """ 409 Loads the montage from a file saved by Event.save_montage. 410 :param input_path: the path to the folder where the montage was saved. 411 :param tag: a string to add to the file name, before the extension. 412 :return: 413 """ 414 file = f"{self}{tag}.jpeg" 415 return imageio.imread(os.path.join(input_path, file))
Loads the montage from a file saved by Event.save_montage.
Parameters
- input_path: the path to the folder where the montage was saved.
- tag: a string to add to the file name, before the extension.
Returns
the montage image as a numpy array.
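A sketch matching the save_montage() example above (the folder and tag are assumptions):

montage = event.load_montage("montages_out", tag="-gallery")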
417 @classmethod 418 def get_many_crops( 419 cls, 420 events: Sequence[Self], 421 crop_size: int | Sequence[int] = 100, 422 in_pixels: bool = True, 423 input_path: str | Sequence[str] = None, 424 channels: Sequence[int | str] = None, 425 apply_gain: bool | Sequence[bool] = True, 426 ) -> list[list[np.ndarray]]: 427 """ 428 Get the crops for a list of events, ensuring that there is no wasteful reading 429 of the same tile multiple times. This function is more efficient than calling 430 get_crops() for each event. 431 :param events: the events to get crops for. 432 :param crop_size: the square size of the image crop to get for this event. 433 Defaults to four times the size of the event. 434 :param in_pixels: whether the crop size is in pixels or micrometers. 435 Defaults to pixels, and is ignored if crop_size is None. 436 :param input_path: the path to the input images. Will only work for lists of events 437 from the same scan. Defaults to None (uses the scan's path). 438 :param channels: the channels to extract images for. Defaults to all channels. 439 :param apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. Defaults to True. 440 Can be supplied as a list to apply gain to individual channels. 441 :return: a list of lists of cropped images for each event. 442 """ 443 if len(events) == 0: 444 return [] 445 # Adapt singular inputs to lists of appropriate length 446 if isinstance(crop_size, int): 447 crop_size = [crop_size] * len(events) 448 if input_path is None or isinstance(input_path, str): 449 input_path = [input_path] * len(events) 450 451 # Get the order of the events when sorted by slide/tile 452 order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__())) 453 454 # Allocate the list to size 455 crops = [[]] * len(events) 456 last_tile = None 457 images = None # Holds large numpy arrays, so expensive to compare 458 # Iterate through in slide/tile sorted order 459 for i in order: 460 if last_tile != events[i].tile: 461 # Gather the frame images, preserving them for the next event 462 frames = Frame.get_frames(events[i].tile, channels) 463 if isinstance(apply_gain, bool): 464 apply = [apply_gain] * len(frames) 465 else: 466 apply = apply_gain 467 images = [f.get_image(input_path[i], a) for f, a in zip(frames, apply)] 468 last_tile = events[i].tile 469 # Use the frame images to crop the event images 470 crops[i] = events[i].crop(images, crop_size[i], in_pixels) 471 return crops
Get the crops for a list of events, ensuring that there is no wasteful reading of the same tile multiple times. This function is more efficient than calling get_crops() for each event.
Parameters
- events: the events to get crops for.
- crop_size: the square size of the image crop to get for each event; a single value or one per event. Defaults to 100.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
- input_path: the path to the input images. Will only work for lists of events from the same scan. Defaults to None (uses the scan's path).
- channels: the channels to extract images for. Defaults to all channels.
- apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. Defaults to True. Can be supplied as a list to apply gain to individual channels.
Returns
a list of lists of cropped images for each event.
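A sketch (the event list and channel name are assumptions):

all_crops = Event.get_many_crops(events, crop_size=75, channels=["DAPI"])
# all_crops[i] is the list of per-channel crops for events[i]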
473 @classmethod 474 def get_many_montages( 475 cls, 476 events: Sequence[Self], 477 channels: Sequence[int | str] = None, 478 composites: dict[int | str, tuple[float, float, float]] = None, 479 crop_size: int = 100, 480 in_pixels: bool = True, 481 input_path: str = None, 482 apply_gain: bool | Iterable[bool] = True, 483 **kwargs, 484 ) -> list[np.ndarray]: 485 """ 486 Convenience function for get_montage(), but for a list of events. More efficient 487 thank get_montage() when working with multiple events from the same tile. 488 :param events: a list of Event objects. 489 :param channels: the channels to extract images for. Defaults to all channels. 490 :param composites: dictionary of indices and RGB tuples for a composite. 491 :param crop_size: the square size of the image crop to get for this event. 492 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 493 :param input_path: the path to the input images. Defaults to None (uses the scan's path). 494 :param apply_gain: whether to apply scanner-calculated gain to the images, if 495 not already applied. If a list, matches the channels. 496 :param kwargs: montage options. See csi_images.make_montage() for more details. 497 :return: a list of numpy arrays representing the montages. 498 """ 499 if len(events) == 0: 500 return [] 501 # Adapt singular inputs to lists of appropriate length 502 if isinstance(crop_size, int): 503 crop_size = [crop_size] * len(events) 504 if input_path is None or isinstance(input_path, str): 505 input_path = [input_path] * len(events) 506 507 # Get the order of the events when sorted by slide/tile 508 event_order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__())) 509 510 # Allocate the list to size 511 montages = [np.empty(0)] * len(events) 512 # Placeholder variables to avoid rereading the same tile 513 images = None # Holds large numpy arrays, so expensive to compare 514 order = None 515 rel_composites = None 516 last_tile = None 517 # Iterate through in slide/tile sorted order 518 for i in event_order: 519 if last_tile != events[i].tile: 520 channels_to_get, order, rel_composites = events[i].get_montage_channels( 521 channels, composites 522 ) 523 # Gather the frame images, preserving them for the next event 524 frames = Frame.get_frames(events[i].tile, channels_to_get) 525 if isinstance(apply_gain, bool): 526 apply = [apply_gain] * len(frames) 527 else: 528 apply = apply_gain 529 images = [f.get_image(input_path[i], a) for f, a in zip(frames, apply)] 530 last_tile = events[i].tile 531 # Use the frame images to crop the event images and make montages 532 crops = events[i].crop(images, crop_size[i], in_pixels) 533 montages[i] = csi_images.make_montage( 534 crops, order, rel_composites, **kwargs 535 ) 536 537 return montages
Convenience function for get_montage(), but for a list of events. More efficient than get_montage() when working with multiple events from the same tile.
Parameters
- events: a list of Event objects.
- channels: the channels to extract images for. Defaults to all channels.
- composites: dictionary of indices and RGB tuples for a composite.
- crop_size: the square size of the image crop to get for this event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
- input_path: the path to the input images. Defaults to None (uses the scan's path).
- apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. If a list, matches the channels.
- kwargs: montage options. See csi_images.make_montage() for more details.
Returns
a list of numpy arrays representing the montages.
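A sketch (the channel names and tint are assumptions):

montages = Event.get_many_montages(
    events,
    channels=["DAPI"],
    composites={"CY5": (1.0, 0.0, 0.0)},
    crop_size=100,
)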
539 @classmethod 540 def get_and_save_many_crops( 541 cls, 542 events: list[Self], 543 output_path: str, 544 labels: Sequence[str], 545 ext: str = "auto", 546 additional_gain: Sequence[float] = None, 547 **kwargs, 548 ) -> None: 549 """ 550 Get and save the crops for a list of events, ensuring that there is no wasteful 551 reading and limiting the image data in memory to 1 tile at a time. This function 552 is more efficient that chaining get_crops() and save_crops() for each event or 553 get_many_crops() and then save_crops(). 554 :param events: list of events to get, crop, and save. 555 :param output_path: the folder to save the crops in. Will make if needed. 556 :param labels: the labels to save the crops with. See save_crops(). 557 :param ext: the file extension to save the crops as. See save_crops(). 558 :param additional_gain: additional gain to apply to the crops. If not None, must 559 match the length of the number of crop channels. 560 :param kwargs: see get_many_crops() for more parameters. 561 :return: 562 """ 563 unique_tiles = set([event.tile for event in events]) 564 565 for tile in unique_tiles: 566 # Get one tile's worth of event crops 567 tile_events = [e for e in events if e.tile == tile] 568 crops_list = cls.get_many_crops(tile_events, **kwargs) 569 for event, crops in zip(tile_events, crops_list): 570 # Apply any additional gains 571 if additional_gain is not None: 572 crops = [gain * crop for gain, crop in zip(additional_gain, crops)] 573 event.save_crops(crops, output_path, labels, ext)
Get and save the crops for a list of events, ensuring that there is no wasteful reading and limiting the image data in memory to 1 tile at a time. This function is more efficient than chaining get_crops() and save_crops() for each event or get_many_crops() and then save_crops().
Parameters
- events: list of events to get, crop, and save.
- output_path: the folder to save the crops in. Will make if needed.
- labels: the labels to save the crops with. See save_crops().
- ext: the file extension to save the crops as. See save_crops().
- additional_gain: additional gain to apply to the crops. If not None, must match the length of the number of crop channels.
- kwargs: see get_many_crops() for more parameters.
Returns
None
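A sketch (the labels, channels, and output folder are assumptions; extra keyword arguments pass through to get_many_crops()):

Event.get_and_save_many_crops(
    events,
    output_path="crops_out",
    labels=["DAPI", "CY5"],
    channels=["DAPI", "CY5"],
    crop_size=100,
)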
575 @classmethod 576 def get_and_save_many_montages( 577 cls, 578 events: list[Self], 579 output_path: str, 580 ocular_names: bool = False, 581 tag: str = "", 582 **kwargs, 583 ) -> None: 584 """ 585 Save montages of the events to image files. 586 :param events: the events to get, montage, and save. 587 :param output_path: the folder to save the montages to. Will make if needed. 588 :param ocular_names: whether to use the OCULAR naming convention. 589 :param tag: a tag to append to the file name. Ignored if ocular_names is True. 590 :param kwargs: see get_many_montages() for more parameters. 591 """ 592 unique_tiles = set([event.tile for event in events]) 593 594 for tile in unique_tiles: 595 # Get one tile's worth of event crops 596 tile_events = [e for e in events if e.tile == tile] 597 montages = cls.get_many_montages(tile_events, **kwargs) 598 for event, montage in zip(tile_events, montages): 599 event.save_montage(montage, output_path, ocular_names, tag)
Save montages of the events to image files.
Parameters
- events: the events to get, montage, and save.
- output_path: the folder to save the montages to. Will make if needed.
- ocular_names: whether to use the OCULAR naming convention.
- tag: a tag to append to the file name. Ignored if ocular_names is True.
- kwargs: see get_many_montages() for more parameters.
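A sketch (the channel names, tint, tag, and output folder are assumptions; extra keyword arguments pass through to get_many_montages()):

Event.get_and_save_many_montages(
    events,
    output_path="montages_out",
    tag="-gallery",
    channels=["DAPI"],
    composites={"CY5": (1.0, 0.0, 0.0)},
)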
602class EventArray: 603 """ 604 A class that holds a large number of events' data, making it easy to analyze and 605 manipulate many events at once. A more separated version of the Event class. 606 """ 607 608 INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y"] 609 610 def __init__( 611 self, 612 info: pd.DataFrame = None, 613 metadata: pd.DataFrame = None, 614 features: pd.DataFrame = None, 615 ): 616 # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y" 617 if info is not None: 618 # Special case: "roi" is often not required, so we'll fill in if its missing 619 if "roi" not in info.columns: 620 info["roi"] = 0 621 if set(info.columns) != set(self.INFO_COLUMNS): 622 raise ValueError( 623 f"EventArray.info must have columns:" 624 f"{self.INFO_COLUMNS}; had {list(info.columns)}" 625 ) 626 # Copy first to avoid modifying the original 627 info = info.copy() 628 # Ensure that the columns are the right types 629 info["slide_id"] = info["slide_id"].astype(str) 630 info["tile"] = info["tile"].astype(np.uint16) 631 info["roi"] = info["roi"].astype(np.uint8) 632 info["x"] = info["x"].round().astype(np.uint16) 633 info["y"] = info["y"].round().astype(np.uint16) 634 # Ensure that the columns are in the right order 635 info = info[self.INFO_COLUMNS] 636 # All DataFrames must all have the same number of rows 637 if metadata is not None and (info is None or len(info) != len(metadata)): 638 raise ValueError( 639 "If EventArray.metadata is not None, it should match rows with .info" 640 ) 641 if features is not None and (info is None or len(info) != len(features)): 642 raise ValueError( 643 "If EventArray.features is not None, it should match rows with .info" 644 ) 645 # No columns named "metadata_", "features_", or "None" 646 column_names = [] 647 if metadata is not None: 648 column_names += metadata.columns.tolist() 649 if features is not None: 650 column_names += features.columns.tolist() 651 if any([col.lower().startswith("metadata_") for col in column_names]): 652 raise ValueError("EventArray column names cannot start with 'metadata_'") 653 if any([col.lower().startswith("features_") for col in column_names]): 654 raise ValueError("EventArray column names cannot start with 'features_'") 655 if any([col.lower() == "none" for col in column_names]): 656 raise ValueError("EventArray column names cannot be 'none'") 657 658 self.info = info 659 self.metadata = metadata 660 self.features = features 661 662 def __len__(self) -> int: 663 # Convenience method to get the number of events 664 if self.info is None: 665 return 0 666 else: 667 return len(self.info) 668 669 def __eq__(self, other): 670 # Parse all possibilities for info 671 if isinstance(self.info, pd.DataFrame): 672 if isinstance(other.info, pd.DataFrame): 673 if not self.info.equals(other.info): 674 return False 675 else: 676 return False 677 elif self.info is None: 678 if other.info is not None: 679 return False 680 681 # Parse all possibilities for metadata 682 if isinstance(self.metadata, pd.DataFrame): 683 if isinstance(other.metadata, pd.DataFrame): 684 is_equal = self.metadata.equals(other.metadata) 685 if not is_equal: 686 return False 687 else: 688 return False 689 elif self.metadata is None: 690 if other.metadata is not None: 691 return False 692 693 # Parse all possibilities for features 694 if isinstance(self.features, pd.DataFrame): 695 if isinstance(other.features, pd.DataFrame): 696 is_equal = self.features.equals(other.features) 697 if not is_equal: 698 return False 699 else: 700 return False 701 elif 
self.features is None: 702 if other.features is not None: 703 return False 704 705 return is_equal 706 707 def get_sort_order( 708 self, by: Hashable | Sequence[Hashable], ascending: bool | Sequence[bool] = True 709 ): 710 """ 711 Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames. 712 :param by: name of the column(s) to sort by. 713 :param ascending: whether to sort in ascending order; can be a list to match by 714 :return: the order of the indices to sort by. 715 """ 716 columns = self.get(by) 717 return columns.sort_values(by=by, ascending=ascending).index 718 719 def sort( 720 self, 721 by: Hashable | Sequence[Hashable], 722 ascending: bool | Sequence[bool] = True, 723 ) -> Self: 724 """ 725 Sort the EventArray by column(s) in the info, metadata, or features DataFrames. 726 :param by: name of the column(s) to sort by. 727 :param ascending: whether to sort in ascending order; can be a list to match by 728 :return: a new, sorted EventArray. 729 """ 730 order = self.get_sort_order(by, ascending) 731 info = self.info.loc[order].reset_index(drop=True) 732 if self.metadata is not None: 733 metadata = self.metadata.loc[order].reset_index(drop=True) 734 else: 735 metadata = None 736 if self.features is not None: 737 features = self.features.loc[order].reset_index(drop=True) 738 else: 739 features = None 740 return EventArray(info, metadata, features) 741 742 def get(self, column_names: Hashable | Sequence[Hashable]) -> pd.DataFrame: 743 """ 744 Get a DataFrame with the specified columns from the EventArray, by value. 745 :param column_names: the names of the columns to get. 746 :return: a DataFrame with the specified columns. 747 """ 748 if isinstance(column_names, Hashable): 749 column_names = [column_names] # Drop into a list for the loop 750 columns = [] 751 for column_name in column_names: 752 if column_name in self.info.columns: 753 columns.append(self.info[column_name]) 754 elif self.metadata is not None and column_name in self.metadata.columns: 755 columns.append(self.metadata[column_name]) 756 elif self.features is not None and column_name in self.features.columns: 757 columns.append(self.features[column_name]) 758 else: 759 raise ValueError(f"Column {column_name} not found in EventArray") 760 return pd.concat(columns, axis=1) 761 762 def rows(self, rows: Sequence[Hashable]) -> Self: 763 """ 764 Get a subset of the EventArray rows based on a boolean or integer index, by value. 765 :param rows: row labels, indices, or boolean mask; anything for .loc[] 766 :return: a new EventArray with the subset of events. 767 """ 768 info = self.info.loc[rows].reset_index(drop=True) 769 if self.metadata is not None: 770 metadata = self.metadata.loc[rows].reset_index(drop=True) 771 else: 772 metadata = None 773 if self.features is not None: 774 features = self.features.loc[rows].reset_index(drop=True) 775 else: 776 features = None 777 return EventArray(info, metadata, features) 778 779 def copy(self) -> Self: 780 """ 781 Create a deep copy of the EventArray. 782 :return: a deep copy of the EventArray. 783 """ 784 return EventArray( 785 info=self.info.copy(), 786 metadata=None if self.metadata is None else self.metadata.copy(), 787 features=None if self.features is None else self.features.copy(), 788 ) 789 790 # TODO: add a "filter" convenience function that takes a column name and values to filter by 791 792 def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None: 793 """ 794 Add metadata to the EventArray. 
Removes the need to check if metadata is None. 795 Overwrites any existing metadata with the same column names as the new metadata. 796 :param new_metadata: the metadata to add. 797 """ 798 if len(self) != len(new_metadata): 799 raise ValueError("New metadata must match length of existing info") 800 801 if self.metadata is None: 802 self.metadata = new_metadata 803 else: 804 if isinstance(new_metadata, pd.Series): 805 self.metadata[new_metadata.name] = new_metadata 806 else: 807 # It's a DataFrame 808 self.metadata[new_metadata.columns] = new_metadata 809 810 def add_features(self, new_features: pd.Series | pd.DataFrame) -> None: 811 """ 812 Add features to the EventArray. Removes the need to check if features is None. 813 Overwrites any existing features with the same column names as the new features. 814 :param new_features: the features to add. 815 """ 816 if len(self) != len(new_features): 817 raise ValueError("New features must match length of existing info") 818 819 if self.features is None: 820 self.features = new_features 821 else: 822 if isinstance(new_features, pd.Series): 823 self.features[new_features.name] = new_features 824 else: 825 # It's a DataFrame 826 self.features[new_features.columns] = new_features 827 828 @classmethod 829 def merge(cls, events: Iterable[Self]) -> Self: 830 """ 831 Combine EventArrays in a list into a single EventArray. 832 :param events: the new list of events. 833 """ 834 all_info = [] 835 all_metadata = [] 836 all_features = [] 837 for event_array in events: 838 # Skip empty EventArrays 839 if event_array.info is not None: 840 all_info.append(event_array.info) 841 if event_array.metadata is not None: 842 all_metadata.append(event_array.metadata) 843 if event_array.features is not None: 844 all_features.append(event_array.features) 845 if len(all_info) == 0: 846 return EventArray() 847 else: 848 all_info = pd.concat(all_info, ignore_index=True) 849 if len(all_metadata) == 0: 850 all_metadata = None 851 else: 852 all_metadata = pd.concat(all_metadata, ignore_index=True) 853 if len(all_features) == 0: 854 all_features = None 855 else: 856 all_features = pd.concat(all_features, ignore_index=True) 857 858 return EventArray(all_info, all_metadata, all_features) 859 860 def to_events( 861 self, 862 scans: Scan | Iterable[Scan], 863 ignore_missing_scans=True, 864 ignore_metadata=False, 865 ignore_features=False, 866 ) -> list[Event]: 867 """ 868 Get the events in the EventArray as a list of events. 869 :param scans: the scans that the events belong to, auto-matched by slide_id. 870 Pass None if you don't care about scan metadata (pass ignore_missing_scans). 871 :param ignore_missing_scans: whether to create blank scans for events without scans. 872 :param ignore_metadata: whether to ignore metadata or not 873 :param ignore_features: whether to ignore features or not 874 :return: 875 """ 876 if isinstance(scans, Scan): 877 scans = [scans] 878 scans = {scan.slide_id: scan for scan in scans} 879 events = [] 880 for i in range(len(self.info)): 881 # Determine the associated scan 882 slide_id = self.info["slide_id"][i] 883 if slide_id not in scans: 884 if ignore_missing_scans: 885 # Create a placeholder scan if the scan is missing 886 scan = Scan.make_placeholder( 887 slide_id, 888 self.info["tile"][i], 889 self.info["roi"][i], 890 ) 891 else: 892 raise ValueError( 893 f"Scan {self.info['slide_id'][i]} not found for event {i}." 
894 ) 895 else: 896 scan = scans[slide_id] 897 898 # Prepare the metadata and features 899 if ignore_metadata or self.metadata is None: 900 metadata = None 901 else: 902 # This Series creation method is less efficient, 903 # but required for preserving dtypes 904 metadata = pd.Series( 905 {col: self.metadata.loc[i, col] for col in self.metadata.columns}, 906 dtype=object, 907 ) 908 if ignore_features or self.features is None: 909 features = None 910 else: 911 features = pd.Series( 912 {col: self.features.loc[i, col] for col in self.features.columns}, 913 dtype=object, 914 ) 915 # Create the event and append it to the list 916 events.append( 917 Event( 918 Tile(scan, self.info["tile"][i], self.info["roi"][i]), 919 self.info["x"][i], 920 self.info["y"][i], 921 metadata=metadata, 922 features=features, 923 ) 924 ) 925 return events 926 927 @classmethod 928 def from_events(cls, events: Iterable[Event]) -> Self: 929 """ 930 Set the events in the EventArray to a new list of events. 931 :param events: the new list of events. 932 """ 933 info = pd.DataFrame( 934 { 935 "slide_id": [event.tile.scan.slide_id for event in events], 936 "tile": [event.tile.n for event in events], 937 "roi": [event.tile.n_roi for event in events], 938 "x": [event.x for event in events], 939 "y": [event.y for event in events], 940 } 941 ) 942 metadata_list = [event.metadata for event in events] 943 # Iterate through and ensure that all metadata is the same shape 944 for metadata in metadata_list: 945 if type(metadata) != type(metadata_list[0]): 946 raise ValueError("All metadata must be the same type.") 947 if metadata is not None and metadata.shape != metadata_list[0].shape: 948 raise ValueError("All metadata must be the same shape.") 949 if metadata_list[0] is None: 950 metadata = None 951 else: 952 metadata = pd.DataFrame(metadata_list) 953 features_list = [event.features for event in events] 954 # Iterate through and ensure that all features are the same shape 955 for features in features_list: 956 if type(features) != type(features_list[0]): 957 raise ValueError("All features must be the same type.") 958 if features is not None and features.shape != features_list[0].shape: 959 raise ValueError("All features must be the same shape.") 960 if features_list[0] is None: 961 features = None 962 else: 963 features = pd.DataFrame(features_list) 964 return EventArray(info=info, metadata=metadata, features=features) 965 966 def to_dataframe(self) -> pd.DataFrame: 967 """ 968 Convert all the data in the EventArray to a single DataFrame. 969 :return: a DataFrame with all the data in the EventArray. 970 """ 971 # Make a copy of the info DataFrame and prepend "info_" to the column names 972 output = self.info.copy() 973 # Combine with the metadata and prepend "metadata_" to the column names 974 if self.metadata is not None: 975 metadata = self.metadata.copy() 976 metadata.columns = [f"metadata_{col}" for col in metadata.columns] 977 output = pd.concat([output, metadata], axis=1) 978 # Combine with the features and prepend "features_" to the column names 979 if self.features is not None: 980 features = self.features.copy() 981 features.columns = [f"features_{col}" for col in features.columns] 982 output = pd.concat([output, features], axis=1) 983 return output 984 985 @classmethod 986 def from_dataframe( 987 cls, df, metadata_prefix: str = "metadata_", features_prefix: str = "features_" 988 ) -> Self: 989 """ 990 From a single, special DataFrame, create an EventArray. 991 :param df: the DataFrame to convert to an EventArray. 
992 :param metadata_prefix: the prefix for metadata columns. 993 :param features_prefix: the prefix for features columns. 994 :return: a DataFrame with all the data in the EventArray. 995 """ 996 # Split the columns into info, metadata, and features and strip prefix 997 info = df[[col for col in df.columns if col in cls.INFO_COLUMNS]].copy() 998 if info.size == 0: 999 info = None 1000 metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy() 1001 metadata.columns = [ 1002 col.replace(metadata_prefix, "") for col in metadata.columns 1003 ] 1004 if metadata.size == 0: 1005 metadata = None 1006 features = df[[col for col in df.columns if col.startswith("features_")]].copy() 1007 features.columns = [ 1008 col.replace(features_prefix, "") for col in features.columns 1009 ] 1010 if features.size == 0: 1011 features = None 1012 return cls(info=info, metadata=metadata, features=features) 1013 1014 @classmethod 1015 def from_mask( 1016 cls, 1017 mask: np.ndarray, 1018 slide_id: str, 1019 tile_n: int, 1020 n_roi: int = 0, 1021 include_cell_id: bool = True, 1022 images: list[np.ndarray] = None, 1023 image_labels: list[str] = None, 1024 properties: list[str] = None, 1025 ) -> Self: 1026 """ 1027 Extract events from a mask DataFrame, including metadata and features. 1028 :param mask: the mask to extract events from. 1029 :param slide_id: the slide ID the mask is from. 1030 :param tile_n: the tile number the mask is from. 1031 :param n_roi: the ROI number the mask is from. 1032 :param include_cell_id: whether to include the cell_id, or numerical 1033 mask label, as metadata in the EventArray. 1034 :param images: the intensity images to extract features from. 1035 :param image_labels: the labels for the intensity images. 1036 :param properties: list of properties to extract in addition to the defaults: 1037 :return: EventArray corresponding to the mask labels. 1038 """ 1039 if csi_images is None: 1040 raise ModuleNotFoundError( 1041 "imageio libraries not installed! " 1042 "run `pip install csi_images[imageio]` to resolve." 1043 ) 1044 # Gather mask_info 1045 if images is not None and image_labels is not None: 1046 if len(images) != len(image_labels): 1047 raise ValueError("Intensity images and labels must match lengths.") 1048 1049 mask_info = csi_images.extract_mask_info(mask, images, image_labels, properties) 1050 1051 if len(mask_info) == 0: 1052 return EventArray() 1053 1054 # Combine provided info and mask info 1055 info = pd.DataFrame( 1056 { 1057 "slide_id": slide_id, 1058 "tile": tile_n, 1059 "roi": n_roi, 1060 "x": mask_info["x"], 1061 "y": mask_info["y"], 1062 }, 1063 ) 1064 # Extract a metadata column if desired 1065 if include_cell_id: 1066 metadata = pd.DataFrame({"cell_id": mask_info["id"]}) 1067 else: 1068 metadata = None 1069 # If any additional properties were extracted, add them as features 1070 mask_info = mask_info.drop(columns=["id", "x", "y"], errors="ignore") 1071 if len(mask_info.columns) > 0: 1072 features = mask_info 1073 else: 1074 features = None 1075 return EventArray(info, metadata, features) 1076 1077 def save_csv(self, output_path: str) -> bool: 1078 """ 1079 Save the events to an CSV file, including metadata and features. 
1080 :param output_path: 1081 :return: 1082 """ 1083 if not output_path.endswith(".csv"): 1084 output_path += ".csv" 1085 self.to_dataframe().to_csv(output_path, index=False) 1086 return os.path.exists(output_path) 1087 1088 @classmethod 1089 def load_csv( 1090 cls, 1091 input_path: str, 1092 metadata_prefix: str = "metadata_", 1093 features_prefix: str = "features_", 1094 ) -> Self: 1095 """ 1096 Load the events from an CSV file, including metadata and features. 1097 :param input_path: 1098 :param metadata_prefix: 1099 :param features_prefix: 1100 :return: 1101 """ 1102 # Load the CSV file 1103 df = pd.read_csv(input_path) 1104 return cls.from_dataframe(df, metadata_prefix, features_prefix) 1105 1106 def save_hdf5(self, output_path: str) -> bool: 1107 """ 1108 Save the events to an HDF5 file, including metadata and features. 1109 Uses the pandas-provided HDF5 functions for ease, and external compatibility, 1110 though these files are slightly harder to view in HDFView or similar. 1111 :param output_path: 1112 :return: 1113 """ 1114 if not output_path.endswith(".hdf5") and not output_path.endswith(".h5"): 1115 output_path += ".hdf5" 1116 # Open the output_path as an HDF5 file 1117 with pd.HDFStore(output_path) as store: 1118 # Store the dataframes in the HDF5 file 1119 if self.info is not None: 1120 store.put("info", self.info, index=False) 1121 if self.metadata is not None: 1122 store.put("metadata", self.metadata, index=False) 1123 if self.features is not None: 1124 store.put("features", self.features, index=False) 1125 return os.path.exists(output_path) 1126 1127 @classmethod 1128 def load_hdf5(cls, input_path: str) -> Self: 1129 """ 1130 Load the events from an HDF5 file, including metadata and features. 1131 :param input_path: 1132 :return: 1133 """ 1134 # Open the input_path as an HDF5 file 1135 with pd.HDFStore(input_path, "r") as store: 1136 # Load the dataframes from the HDF5 file 1137 info = store.get("info") if "info" in store else None 1138 metadata = store.get("metadata") if "metadata" in store else None 1139 features = store.get("features") if "features" in store else None 1140 return cls(info=info, metadata=metadata, features=features) 1141 1142 def save_ocular(self, output_path: str, event_type: str = "cells"): 1143 """ 1144 Save the events to an OCULAR file. Relies on the dataframe originating 1145 from an OCULAR file (same columns; duplicate metadata/info). 1146 :param output_path: 1147 :param event_type: 1148 :return: 1149 """ 1150 if pyreadr is None: 1151 raise ModuleNotFoundError( 1152 "pyreadr not installed! Install pyreadr directly " 1153 "or run `pip install csi-images[rds]` option to resolve." 1154 ) 1155 if event_type == "cells": 1156 file_stub = "rc-final" 1157 elif event_type == "others": 1158 file_stub = "others-final" 1159 else: 1160 raise ValueError("Invalid event type. 
Must be cells or others.") 1161 1162 # Ensure good metadata 1163 metadata = pd.DataFrame( 1164 { 1165 "slide_id": self.info["slide_id"], 1166 "frame_id": self.info["tile"], 1167 "cell_id": ( 1168 self.metadata["cell_id"] 1169 if "cell_id" in self.metadata.columns 1170 else range(len(self.info)) 1171 ), 1172 "cellx": self.info["x"], 1173 "celly": self.info["y"], 1174 } 1175 ) 1176 if self.metadata is not None: 1177 metadata[self.metadata.columns] = self.metadata.copy() 1178 1179 # Check for the "ocular_interesting" column 1180 if event_type == "cells": 1181 if "ocular_interesting" in metadata.columns: 1182 interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool) 1183 elif "hcpc" in metadata.columns: 1184 # Interesting cells don't get an hcpc designation, leaving them as -1 1185 interesting_rows = ( 1186 metadata["hcpc"].to_numpy() == -1 1187 ) # interesting cells 1188 else: 1189 interesting_rows = [] 1190 if sum(interesting_rows) > 0: 1191 # Split the metadata into interesting and regular 1192 interesting_events = self.rows(interesting_rows) 1193 interesting_df = pd.concat( 1194 [interesting_events.features, interesting_events.metadata], axis=1 1195 ) 1196 data_events = self.rows(~interesting_rows) 1197 data_df = pd.concat( 1198 [data_events.features, data_events.metadata], axis=1 1199 ) 1200 data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore") 1201 1202 # Drop particular columns for "interesting" 1203 interesting_df = interesting_df.drop( 1204 [ 1205 "clust", 1206 "hcpc", 1207 "frame_id", 1208 "cell_id", 1209 "unique_id", 1210 "ocular_interesting", 1211 ], 1212 axis=1, 1213 errors="ignore", 1214 ) 1215 # Save both .csv and .rds 1216 interesting_stub = os.path.join(output_path, "ocular_interesting") 1217 interesting_df.to_csv(f"{interesting_stub}.csv") 1218 # Suppress pandas FutureWarning 1219 with warnings.catch_warnings(): 1220 warnings.simplefilter(action="ignore", category=FutureWarning) 1221 pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df) 1222 else: 1223 data_df = pd.concat([self.features, metadata], axis=1) 1224 else: 1225 # Get all data and reset_index (will copy it) 1226 data_df = pd.concat([self.features, metadata], axis=1) 1227 1228 # Split based on cluster number to conform to *-final[1-4].rds 1229 n_clusters = max(data_df["clust"]) + 1 1230 split_idx = [round(i * n_clusters / 4) for i in range(5)] 1231 for i in range(4): 1232 subset = (split_idx[i] <= data_df["clust"]) & ( 1233 data_df["clust"] < split_idx[i + 1] 1234 ) 1235 data_df.loc[subset, "hcpc"] = i + 1 1236 subset = data_df[subset].reset_index(drop=True) 1237 # Suppress pandas FutureWarning 1238 with warnings.catch_warnings(): 1239 warnings.simplefilter(action="ignore", category=FutureWarning) 1240 pyreadr.write_rds( 1241 os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset 1242 ) 1243 1244 # Create new example cell strings 1245 data_df["example_cell_id"] = ( 1246 data_df["slide_id"] 1247 + " " 1248 + data_df["frame_id"].astype(str) 1249 + " " 1250 + data_df["cell_id"].astype(str) 1251 + " " 1252 + data_df["cellx"].astype(int).astype(str) 1253 + " " 1254 + data_df["celly"].astype(int).astype(str) 1255 ) 1256 # Find averagable data columns 1257 if "cellcluster_id" in data_df.columns: 1258 end_idx = data_df.columns.get_loc("cellcluster_id") 1259 else: 1260 end_idx = data_df.columns.get_loc("slide_id") 1261 avg_cols = data_df.columns[:end_idx].tolist() 1262 # Group by cluster and average 1263 data_df = data_df.groupby("clust").agg( 1264 **{col: (col, "mean") for col in 
avg_cols}, 1265 count=("clust", "size"), # count rows in each cluster 1266 example_cells=("example_cell_id", lambda x: ",".join(x)), 1267 hcpc=("hcpc", lambda x: x.iloc[0]), 1268 ) 1269 data_df = data_df.reset_index() # Do NOT drop, index is "clust" 1270 # Create new columns 1271 metadata = pd.DataFrame( 1272 { 1273 "count": data_df["count"], 1274 "example_cells": data_df["example_cells"], 1275 "clust": data_df["clust"].astype(int), 1276 "hcpc": data_df["hcpc"].astype(int), 1277 "id": data_df["clust"].astype(int).astype(str), 1278 "cccluster": "0", # Dummy value 1279 "ccdistance": 0.0, # Dummy value 1280 "rownum": list(range(len(data_df))), 1281 "framegroup": 0, # Dummy value 1282 } 1283 ) 1284 # Need to pad the features to 761 columns, as per OCULAR report needs 1285 additional_columns = range(len(avg_cols), 761) 1286 if len(additional_columns) > 0: 1287 padding = pd.DataFrame( 1288 np.zeros((len(data_df), len(additional_columns))), 1289 columns=[f"pad{i}" for i in additional_columns], 1290 ) 1291 data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1) 1292 else: 1293 data_df = pd.concat([data_df[avg_cols], metadata], axis=1) 1294 1295 # Save the cluster data 1296 data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv")) 1297 # Suppress pandas FutureWarning 1298 with warnings.catch_warnings(): 1299 warnings.simplefilter(action="ignore", category=FutureWarning) 1300 pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df) 1301 1302 @classmethod 1303 def load_ocular( 1304 cls, 1305 input_path: str, 1306 event_type="cells", 1307 cell_data_files=( 1308 "rc-final1.rds", 1309 "rc-final2.rds", 1310 "rc-final3.rds", 1311 "rc-final4.rds", 1312 "ocular_interesting.rds", 1313 ), 1314 others_data_files=( 1315 "others-final1.rds", 1316 "others-final2.rds", 1317 "others-final3.rds", 1318 "others-final4.rds", 1319 ), 1320 atlas_data_files=( 1321 "ocular_interesting.rds", 1322 "ocular_not_interesting.rds", 1323 ), 1324 drop_common_events=True, 1325 ) -> Self: 1326 """ 1327 1328 :param input_path: 1329 :param event_type: 1330 :param cell_data_files: 1331 :param others_data_files: 1332 :param atlas_data_files: 1333 :param drop_common_events: 1334 :return: 1335 """ 1336 if pyreadr is None: 1337 raise ModuleNotFoundError( 1338 "pyreadr not installed! Install pyreadr directly " 1339 "or run `pip install csi-images[rds]` option to resolve." 
1340 ) 1341 # Check if the input path is a directory or a file 1342 if os.path.isfile(input_path): 1343 data_files = [os.path.basename(input_path)] 1344 input_path = os.path.dirname(input_path) 1345 if event_type == "cells": 1346 data_files = cell_data_files 1347 elif event_type == "others": 1348 data_files = others_data_files 1349 else: 1350 raise ValueError("Invalid event type.") 1351 1352 # Load the data from the OCULAR files 1353 file_data = {} 1354 for file in data_files: 1355 file_path = os.path.join(input_path, file) 1356 if not os.path.isfile(file_path): 1357 warnings.warn(f"{file} not found for in {input_path}") 1358 continue 1359 file_data[file] = pyreadr.read_r(file_path) 1360 # Get the DataFrame associated with None (pyreadr dict quirk) 1361 file_data[file] = file_data[file][None] 1362 if len(file_data[file]) == 0: 1363 # File gets dropped from the dict 1364 file_data.pop(file) 1365 warnings.warn(f"{file} has no cells") 1366 continue 1367 1368 # Drop common cells if requested and in this file 1369 if ( 1370 file in atlas_data_files 1371 and drop_common_events 1372 and "catalogue_classification" in file_data[file] 1373 ): 1374 common_cell_indices = ( 1375 file_data[file]["catalogue_classification"] == "common_cell" 1376 ) 1377 file_data[file] = file_data[file][common_cell_indices == False] 1378 1379 if len(file_data[file]) == 0: 1380 # File gets dropped from the dict 1381 file_data.pop(file) 1382 warnings.warn(f"{file} has no cells after dropping common cells") 1383 continue 1384 1385 # Extract frame_id and cell_id 1386 # DAPI- events already have frame_id cell_id outside rowname 1387 if event_type == "cells" and "frame_id" not in file_data[file].columns: 1388 file_data[file]["rowname"] = file_data[file]["rowname"].astype("str") 1389 # get frame_id cell_id from rownames column and split into two columns 1390 split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True) 1391 if len(split_res.columns) != 2: 1392 warnings.warn( 1393 f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}' 1394 ) 1395 # then assign it back to the dataframe 1396 file_data[file][["frame_id", "cell_id"]] = split_res.astype("int") 1397 # Ensure frame_id and cell_id are integers 1398 file_data[file]["frame_id"] = file_data[file]["frame_id"].astype("int") 1399 file_data[file]["cell_id"] = file_data[file]["cell_id"].astype("int") 1400 # reset indexes since they can cause NaN values in concat 1401 file_data[file] = file_data[file].reset_index(drop=True) 1402 1403 # Merge the data from all files 1404 if len(file_data) == 0: 1405 return EventArray() 1406 elif len(file_data) == 1: 1407 data = [file_data[file] for file in file_data.keys()][0] 1408 else: 1409 data = pd.concat(file_data.values()) 1410 1411 # Others is missing the "slide_id". 
Insert it right before "frame_id" column 1412 if event_type == "others" and "slide_id" not in data.columns: 1413 if os.path.basename(input_path) == "ocular": 1414 slide_id = os.path.basename(os.path.dirname(input_path)) 1415 else: 1416 slide_id = "UNKNOWN" 1417 data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id) 1418 1419 # Sort according to ascending cell_id to keep the original, which is in manual_df 1420 data = data.sort_values(by=["cell_id"], ascending=True) 1421 # Filter out duplicates by x & y 1422 data = data.assign( 1423 unique_id=data["slide_id"] 1424 + "_" 1425 + data["frame_id"].astype(str) 1426 + "_" 1427 + data["cellx"].astype(int).astype(str) 1428 + "_" 1429 + data["celly"].astype(int).astype(str) 1430 ) 1431 data = data.drop_duplicates(subset=["unique_id"], keep="first") 1432 # Normal unique_id is with cell_id 1433 data = data.assign( 1434 unique_id=data["slide_id"] 1435 + "_" 1436 + data["frame_id"].astype(str) 1437 + "_" 1438 + data["cell_id"].astype(str) 1439 ) 1440 data = data.reset_index(drop=True) 1441 # All columns up to "slide_id" are features; drop the "slide_id" 1442 features = data.loc[:, :"slide_id"].iloc[:, :-1] 1443 data = data.loc[:, "slide_id":] 1444 # Grab the info columns 1445 info = data[["slide_id", "frame_id", "cellx", "celly"]] 1446 info.columns = ["slide_id", "tile", "x", "y"] 1447 info = info.assign(roi=0) # OCULAR only works on 1 ROI, as far as known 1448 info = info[["slide_id", "tile", "roi", "x", "y"]] 1449 # Metadata has duplicate columns for later convenience 1450 metadata = data 1451 # Certain columns tend to be problematic with mixed data formats... 1452 for col in ["TRITC", "CY5", "FITC"]: 1453 if col in metadata: 1454 labels = { 1455 "False": False, 1456 "True": True, 1457 "FALSE": False, 1458 "TRUE": True, 1459 False: False, 1460 True: True, 1461 } 1462 metadata[col] = metadata[col].map(labels).astype(bool) 1463 for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]: 1464 if col in metadata: 1465 metadata[col] = metadata[col].fillna(-1).astype(int) 1466 return EventArray(info, metadata, features)
A class that holds data for many events at once, making it easy to analyze and manipulate them in bulk. It stores the same information as a list of Event objects, but split into separate info, metadata, and features DataFrames.
610 def __init__( 611 self, 612 info: pd.DataFrame = None, 613 metadata: pd.DataFrame = None, 614 features: pd.DataFrame = None, 615 ): 616 # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y" 617 if info is not None: 618 # Special case: "roi" is often not required, so we'll fill in if its missing 619 if "roi" not in info.columns: 620 info["roi"] = 0 621 if set(info.columns) != set(self.INFO_COLUMNS): 622 raise ValueError( 623 f"EventArray.info must have columns:" 624 f"{self.INFO_COLUMNS}; had {list(info.columns)}" 625 ) 626 # Copy first to avoid modifying the original 627 info = info.copy() 628 # Ensure that the columns are the right types 629 info["slide_id"] = info["slide_id"].astype(str) 630 info["tile"] = info["tile"].astype(np.uint16) 631 info["roi"] = info["roi"].astype(np.uint8) 632 info["x"] = info["x"].round().astype(np.uint16) 633 info["y"] = info["y"].round().astype(np.uint16) 634 # Ensure that the columns are in the right order 635 info = info[self.INFO_COLUMNS] 636 # All DataFrames must all have the same number of rows 637 if metadata is not None and (info is None or len(info) != len(metadata)): 638 raise ValueError( 639 "If EventArray.metadata is not None, it should match rows with .info" 640 ) 641 if features is not None and (info is None or len(info) != len(features)): 642 raise ValueError( 643 "If EventArray.features is not None, it should match rows with .info" 644 ) 645 # No columns named "metadata_", "features_", or "None" 646 column_names = [] 647 if metadata is not None: 648 column_names += metadata.columns.tolist() 649 if features is not None: 650 column_names += features.columns.tolist() 651 if any([col.lower().startswith("metadata_") for col in column_names]): 652 raise ValueError("EventArray column names cannot start with 'metadata_'") 653 if any([col.lower().startswith("features_") for col in column_names]): 654 raise ValueError("EventArray column names cannot start with 'features_'") 655 if any([col.lower() == "none" for col in column_names]): 656 raise ValueError("EventArray column names cannot be 'none'") 657 658 self.info = info 659 self.metadata = metadata 660 self.features = features
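As a quick orientation before the individual methods below, here is a minimal construction sketch (the slide, tile, and feature values are made up). The info DataFrame must contain exactly the slide_id, tile, roi, x, and y columns; roi may be omitted, in which case the constructor fills it with 0.

```python
import pandas as pd

from csi_images.csi_events import EventArray

# Two hypothetical detections on tile 42 of slide "EXAMPLE01"
info = pd.DataFrame(
    {
        "slide_id": ["EXAMPLE01", "EXAMPLE01"],
        "tile": [42, 42],
        "x": [100, 250],  # frame-relative pixel positions
        "y": [200, 300],
        # "roi" omitted on purpose; the constructor fills it with 0
    }
)
metadata = pd.DataFrame({"cell_id": [1, 2]})
features = pd.DataFrame({"dapi_mean": [1234.5, 987.6]})

events = EventArray(info, metadata, features)
```

Later sketches on this page reuse this hypothetical `events` object.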
707 def get_sort_order( 708 self, by: Hashable | Sequence[Hashable], ascending: bool | Sequence[bool] = True 709 ): 710 """ 711 Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames. 712 :param by: name of the column(s) to sort by. 713 :param ascending: whether to sort in ascending order; can be a list to match by 714 :return: the order of the indices to sort by. 715 """ 716 columns = self.get(by) 717 return columns.sort_values(by=by, ascending=ascending).index
Get the sort order for the EventArray by column(s) in the info, metadata, or features DataFrames.
Parameters
- by: name of the column(s) to sort by.
- ascending: whether to sort in ascending order; may be a list with one entry per column in by.
Returns
the row indices of the EventArray, in sorted order.
719 def sort( 720 self, 721 by: Hashable | Sequence[Hashable], 722 ascending: bool | Sequence[bool] = True, 723 ) -> Self: 724 """ 725 Sort the EventArray by column(s) in the info, metadata, or features DataFrames. 726 :param by: name of the column(s) to sort by. 727 :param ascending: whether to sort in ascending order; can be a list to match by 728 :return: a new, sorted EventArray. 729 """ 730 order = self.get_sort_order(by, ascending) 731 info = self.info.loc[order].reset_index(drop=True) 732 if self.metadata is not None: 733 metadata = self.metadata.loc[order].reset_index(drop=True) 734 else: 735 metadata = None 736 if self.features is not None: 737 features = self.features.loc[order].reset_index(drop=True) 738 else: 739 features = None 740 return EventArray(info, metadata, features)
Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
Parameters
- by: name of the column(s) to sort by.
- ascending: whether to sort in ascending order; may be a list with one entry per column in by.
Returns
a new, sorted EventArray.
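For example, reusing the `events` object from the construction sketch above (dapi_mean is a hypothetical feature column):

```python
# Sort by a feature column, brightest first
by_brightness = events.sort("dapi_mean", ascending=False)

# Sorting can mix info, metadata, and feature columns
by_tile_then_brightness = events.sort(["tile", "dapi_mean"], ascending=[True, False])

# get_sort_order() returns only the row order, if you want to inspect or reuse it
order = events.get_sort_order("dapi_mean", ascending=False)
```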
742 def get(self, column_names: Hashable | Sequence[Hashable]) -> pd.DataFrame: 743 """ 744 Get a DataFrame with the specified columns from the EventArray, by value. 745 :param column_names: the names of the columns to get. 746 :return: a DataFrame with the specified columns. 747 """ 748 if isinstance(column_names, Hashable): 749 column_names = [column_names] # Drop into a list for the loop 750 columns = [] 751 for column_name in column_names: 752 if column_name in self.info.columns: 753 columns.append(self.info[column_name]) 754 elif self.metadata is not None and column_name in self.metadata.columns: 755 columns.append(self.metadata[column_name]) 756 elif self.features is not None and column_name in self.features.columns: 757 columns.append(self.features[column_name]) 758 else: 759 raise ValueError(f"Column {column_name} not found in EventArray") 760 return pd.concat(columns, axis=1)
Get a DataFrame with the specified columns from the EventArray, by value.
Parameters
- column_names: the names of the columns to get.
Returns
a DataFrame with the specified columns.
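A short sketch, again reusing the hypothetical `events` object; because the columns are returned by value, editing the result does not change the EventArray:

```python
# Pull columns from info, metadata, and features into a single DataFrame
subset = events.get(["slide_id", "cell_id", "dapi_mean"])
print(subset.columns.tolist())  # ['slide_id', 'cell_id', 'dapi_mean']
```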
762 def rows(self, rows: Sequence[Hashable]) -> Self: 763 """ 764 Get a subset of the EventArray rows based on a boolean or integer index, by value. 765 :param rows: row labels, indices, or boolean mask; anything for .loc[] 766 :return: a new EventArray with the subset of events. 767 """ 768 info = self.info.loc[rows].reset_index(drop=True) 769 if self.metadata is not None: 770 metadata = self.metadata.loc[rows].reset_index(drop=True) 771 else: 772 metadata = None 773 if self.features is not None: 774 features = self.features.loc[rows].reset_index(drop=True) 775 else: 776 features = None 777 return EventArray(info, metadata, features)
Get a subset of the EventArray rows, selected by label, integer position (on the default 0..n-1 index), or boolean mask; returned by value.
Parameters
- rows: row labels, indices, or a boolean mask; anything accepted by .loc[].
Returns
a new EventArray with the subset of events.
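For instance, filtering the hypothetical `events` object with a boolean mask built from a feature column (the threshold is made up):

```python
# Keep only events above a hypothetical brightness threshold
bright_mask = events.get("dapi_mean")["dapi_mean"] > 1000
bright_events = events.rows(bright_mask)

# Positional labels also work, since the internal DataFrames use a 0..n-1 index
first_event_only = events.rows([0])
```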
779 def copy(self) -> Self: 780 """ 781 Create a deep copy of the EventArray. 782 :return: a deep copy of the EventArray. 783 """ 784 return EventArray( 785 info=self.info.copy(), 786 metadata=None if self.metadata is None else self.metadata.copy(), 787 features=None if self.features is None else self.features.copy(), 788 )
Create a deep copy of the EventArray.
Returns
a deep copy of the EventArray.
792 def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None: 793 """ 794 Add metadata to the EventArray. Removes the need to check if metadata is None. 795 Overwrites any existing metadata with the same column names as the new metadata. 796 :param new_metadata: the metadata to add. 797 """ 798 if len(self) != len(new_metadata): 799 raise ValueError("New metadata must match length of existing info") 800 801 if self.metadata is None: 802 self.metadata = new_metadata 803 else: 804 if isinstance(new_metadata, pd.Series): 805 self.metadata[new_metadata.name] = new_metadata 806 else: 807 # It's a DataFrame 808 self.metadata[new_metadata.columns] = new_metadata
Add metadata columns to the EventArray, so callers do not need to check whether metadata is None first. Overwrites any existing metadata columns with the same names as the new metadata.
Parameters
- new_metadata: the metadata to add; must have the same number of rows as the EventArray.
810 def add_features(self, new_features: pd.Series | pd.DataFrame) -> None: 811 """ 812 Add features to the EventArray. Removes the need to check if features is None. 813 Overwrites any existing features with the same column names as the new features. 814 :param new_features: the features to add. 815 """ 816 if len(self) != len(new_features): 817 raise ValueError("New features must match length of existing info") 818 819 if self.features is None: 820 self.features = new_features 821 else: 822 if isinstance(new_features, pd.Series): 823 self.features[new_features.name] = new_features 824 else: 825 # It's a DataFrame 826 self.features[new_features.columns] = new_features
Add feature columns to the EventArray, so callers do not need to check whether features is None first. Overwrites any existing feature columns with the same names as the new features.
Parameters
- new_features: the features to add; must have the same number of rows as the EventArray.
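A combined sketch of add_metadata() and add_features(), continuing with the hypothetical `events` object; the new column names are made up:

```python
import pandas as pd

# A single metadata column as a named Series; an existing column with the
# same name would be overwritten
events.add_metadata(pd.Series([True, False], name="manually_reviewed"))

# Several feature columns at once as a DataFrame
events.add_features(pd.DataFrame({"area_px": [350, 420], "eccentricity": [0.12, 0.45]}))
```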
828 @classmethod 829 def merge(cls, events: Iterable[Self]) -> Self: 830 """ 831 Combine EventArrays in a list into a single EventArray. 832 :param events: the new list of events. 833 """ 834 all_info = [] 835 all_metadata = [] 836 all_features = [] 837 for event_array in events: 838 # Skip empty EventArrays 839 if event_array.info is not None: 840 all_info.append(event_array.info) 841 if event_array.metadata is not None: 842 all_metadata.append(event_array.metadata) 843 if event_array.features is not None: 844 all_features.append(event_array.features) 845 if len(all_info) == 0: 846 return EventArray() 847 else: 848 all_info = pd.concat(all_info, ignore_index=True) 849 if len(all_metadata) == 0: 850 all_metadata = None 851 else: 852 all_metadata = pd.concat(all_metadata, ignore_index=True) 853 if len(all_features) == 0: 854 all_features = None 855 else: 856 all_features = pd.concat(all_features, ignore_index=True) 857 858 return EventArray(all_info, all_metadata, all_features)
Combine EventArrays in a list into a single EventArray.
Parameters
- events: the EventArrays to combine; empty ones are skipped.
Returns
a single EventArray containing all rows from the inputs.
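For example, when events are extracted tile by tile, the per-tile results can be combined at the end; extract_tile_events() below is a hypothetical stand-in for whatever produces an EventArray for a single tile:

```python
# Hypothetical per-tile extraction loop
tile_results = [extract_tile_events(n) for n in range(10)]
combined = EventArray.merge(tile_results)
```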
860 def to_events( 861 self, 862 scans: Scan | Iterable[Scan], 863 ignore_missing_scans=True, 864 ignore_metadata=False, 865 ignore_features=False, 866 ) -> list[Event]: 867 """ 868 Get the events in the EventArray as a list of events. 869 :param scans: the scans that the events belong to, auto-matched by slide_id. 870 Pass None if you don't care about scan metadata (pass ignore_missing_scans). 871 :param ignore_missing_scans: whether to create blank scans for events without scans. 872 :param ignore_metadata: whether to ignore metadata or not 873 :param ignore_features: whether to ignore features or not 874 :return: 875 """ 876 if isinstance(scans, Scan): 877 scans = [scans] 878 scans = {scan.slide_id: scan for scan in scans} 879 events = [] 880 for i in range(len(self.info)): 881 # Determine the associated scan 882 slide_id = self.info["slide_id"][i] 883 if slide_id not in scans: 884 if ignore_missing_scans: 885 # Create a placeholder scan if the scan is missing 886 scan = Scan.make_placeholder( 887 slide_id, 888 self.info["tile"][i], 889 self.info["roi"][i], 890 ) 891 else: 892 raise ValueError( 893 f"Scan {self.info['slide_id'][i]} not found for event {i}." 894 ) 895 else: 896 scan = scans[slide_id] 897 898 # Prepare the metadata and features 899 if ignore_metadata or self.metadata is None: 900 metadata = None 901 else: 902 # This Series creation method is less efficient, 903 # but required for preserving dtypes 904 metadata = pd.Series( 905 {col: self.metadata.loc[i, col] for col in self.metadata.columns}, 906 dtype=object, 907 ) 908 if ignore_features or self.features is None: 909 features = None 910 else: 911 features = pd.Series( 912 {col: self.features.loc[i, col] for col in self.features.columns}, 913 dtype=object, 914 ) 915 # Create the event and append it to the list 916 events.append( 917 Event( 918 Tile(scan, self.info["tile"][i], self.info["roi"][i]), 919 self.info["x"][i], 920 self.info["y"][i], 921 metadata=metadata, 922 features=features, 923 ) 924 ) 925 return events
Get the events in the EventArray as a list of Event objects.
Parameters
- scans: the scans that the events belong to, auto-matched by slide_id. If you don't care about scan metadata, pass an empty iterable and leave ignore_missing_scans=True so placeholder scans are created.
- ignore_missing_scans: whether to create placeholder scans for events whose scan was not provided, instead of raising an error.
- ignore_metadata: whether to skip attaching metadata to each Event.
- ignore_features: whether to skip attaching features to each Event.
Returns
a list of Event objects, one per row of the EventArray.
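A minimal sketch: if scan metadata is not needed, an empty list of scans can be passed and placeholder scans are created for every event (ignore_missing_scans defaults to True):

```python
event_list = events.to_events([])
first = event_list[0]
print(first.x, first.y)  # frame-relative pixel position of the first event
```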
927 @classmethod 928 def from_events(cls, events: Iterable[Event]) -> Self: 929 """ 930 Set the events in the EventArray to a new list of events. 931 :param events: the new list of events. 932 """ 933 info = pd.DataFrame( 934 { 935 "slide_id": [event.tile.scan.slide_id for event in events], 936 "tile": [event.tile.n for event in events], 937 "roi": [event.tile.n_roi for event in events], 938 "x": [event.x for event in events], 939 "y": [event.y for event in events], 940 } 941 ) 942 metadata_list = [event.metadata for event in events] 943 # Iterate through and ensure that all metadata is the same shape 944 for metadata in metadata_list: 945 if type(metadata) != type(metadata_list[0]): 946 raise ValueError("All metadata must be the same type.") 947 if metadata is not None and metadata.shape != metadata_list[0].shape: 948 raise ValueError("All metadata must be the same shape.") 949 if metadata_list[0] is None: 950 metadata = None 951 else: 952 metadata = pd.DataFrame(metadata_list) 953 features_list = [event.features for event in events] 954 # Iterate through and ensure that all features are the same shape 955 for features in features_list: 956 if type(features) != type(features_list[0]): 957 raise ValueError("All features must be the same type.") 958 if features is not None and features.shape != features_list[0].shape: 959 raise ValueError("All features must be the same shape.") 960 if features_list[0] is None: 961 features = None 962 else: 963 features = pd.DataFrame(features_list) 964 return EventArray(info=info, metadata=metadata, features=features)
Create an EventArray from a list of Event objects.
Parameters
- events: the events to convert; all events must have the same type and shape of metadata and features.
Returns
a new EventArray holding the events' positions, metadata, and features.
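Continuing the sketch above, the list of Event objects converts straight back into an EventArray:

```python
round_trip = EventArray.from_events(event_list)
print(len(round_trip) == len(events))  # True
```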
966 def to_dataframe(self) -> pd.DataFrame: 967 """ 968 Convert all the data in the EventArray to a single DataFrame. 969 :return: a DataFrame with all the data in the EventArray. 970 """ 971 # Make a copy of the info DataFrame and prepend "info_" to the column names 972 output = self.info.copy() 973 # Combine with the metadata and prepend "metadata_" to the column names 974 if self.metadata is not None: 975 metadata = self.metadata.copy() 976 metadata.columns = [f"metadata_{col}" for col in metadata.columns] 977 output = pd.concat([output, metadata], axis=1) 978 # Combine with the features and prepend "features_" to the column names 979 if self.features is not None: 980 features = self.features.copy() 981 features.columns = [f"features_{col}" for col in features.columns] 982 output = pd.concat([output, features], axis=1) 983 return output
Convert all the data in the EventArray to a single DataFrame.
Returns
a DataFrame with all the data in the EventArray.
985 @classmethod 986 def from_dataframe( 987 cls, df, metadata_prefix: str = "metadata_", features_prefix: str = "features_" 988 ) -> Self: 989 """ 990 From a single, special DataFrame, create an EventArray. 991 :param df: the DataFrame to convert to an EventArray. 992 :param metadata_prefix: the prefix for metadata columns. 993 :param features_prefix: the prefix for features columns. 994 :return: a DataFrame with all the data in the EventArray. 995 """ 996 # Split the columns into info, metadata, and features and strip prefix 997 info = df[[col for col in df.columns if col in cls.INFO_COLUMNS]].copy() 998 if info.size == 0: 999 info = None 1000 metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy() 1001 metadata.columns = [ 1002 col.replace(metadata_prefix, "") for col in metadata.columns 1003 ] 1004 if metadata.size == 0: 1005 metadata = None 1006 features = df[[col for col in df.columns if col.startswith("features_")]].copy() 1007 features.columns = [ 1008 col.replace(features_prefix, "") for col in features.columns 1009 ] 1010 if features.size == 0: 1011 features = None 1012 return cls(info=info, metadata=metadata, features=features)
Create an EventArray from a single combined DataFrame, such as one produced by to_dataframe().
Parameters
- df: the DataFrame to convert to an EventArray.
- metadata_prefix: the prefix marking metadata columns.
- features_prefix: the prefix marking feature columns.
Returns
an EventArray with the DataFrame's data split into info, metadata, and features.
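A round-trip sketch with to_dataframe(): the combined frame carries the metadata_/features_ prefixes, and from_dataframe() strips them again:

```python
flat = events.to_dataframe()
print([c for c in flat.columns if c.startswith("features_")])  # e.g. ['features_dapi_mean', ...]

restored = EventArray.from_dataframe(flat)
print(restored.features.columns.tolist())  # prefixes removed
```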
1014 @classmethod 1015 def from_mask( 1016 cls, 1017 mask: np.ndarray, 1018 slide_id: str, 1019 tile_n: int, 1020 n_roi: int = 0, 1021 include_cell_id: bool = True, 1022 images: list[np.ndarray] = None, 1023 image_labels: list[str] = None, 1024 properties: list[str] = None, 1025 ) -> Self: 1026 """ 1027 Extract events from a mask DataFrame, including metadata and features. 1028 :param mask: the mask to extract events from. 1029 :param slide_id: the slide ID the mask is from. 1030 :param tile_n: the tile number the mask is from. 1031 :param n_roi: the ROI number the mask is from. 1032 :param include_cell_id: whether to include the cell_id, or numerical 1033 mask label, as metadata in the EventArray. 1034 :param images: the intensity images to extract features from. 1035 :param image_labels: the labels for the intensity images. 1036 :param properties: list of properties to extract in addition to the defaults: 1037 :return: EventArray corresponding to the mask labels. 1038 """ 1039 if csi_images is None: 1040 raise ModuleNotFoundError( 1041 "imageio libraries not installed! " 1042 "run `pip install csi_images[imageio]` to resolve." 1043 ) 1044 # Gather mask_info 1045 if images is not None and image_labels is not None: 1046 if len(images) != len(image_labels): 1047 raise ValueError("Intensity images and labels must match lengths.") 1048 1049 mask_info = csi_images.extract_mask_info(mask, images, image_labels, properties) 1050 1051 if len(mask_info) == 0: 1052 return EventArray() 1053 1054 # Combine provided info and mask info 1055 info = pd.DataFrame( 1056 { 1057 "slide_id": slide_id, 1058 "tile": tile_n, 1059 "roi": n_roi, 1060 "x": mask_info["x"], 1061 "y": mask_info["y"], 1062 }, 1063 ) 1064 # Extract a metadata column if desired 1065 if include_cell_id: 1066 metadata = pd.DataFrame({"cell_id": mask_info["id"]}) 1067 else: 1068 metadata = None 1069 # If any additional properties were extracted, add them as features 1070 mask_info = mask_info.drop(columns=["id", "x", "y"], errors="ignore") 1071 if len(mask_info.columns) > 0: 1072 features = mask_info 1073 else: 1074 features = None 1075 return EventArray(info, metadata, features)
Extract events from a labeled mask image, including metadata and features.
Parameters
- mask: the labeled mask (np.ndarray) to extract events from.
- slide_id: the slide ID the mask is from.
- tile_n: the tile number the mask is from.
- n_roi: the ROI number the mask is from.
- include_cell_id: whether to include the cell_id, i.e. the numerical mask label, as metadata in the EventArray.
- images: the intensity images to extract features from.
- image_labels: the labels for the intensity images.
- properties: list of properties to extract in addition to the defaults.
Returns
an EventArray corresponding to the mask labels.
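A sketch with a toy labeled mask (two square "cells" on a 100x100 tile). This requires the optional image dependencies (`pip install csi_images[imageio]`); passing intensity images with matching image_labels would additionally extract intensity features:

```python
import numpy as np

from csi_images.csi_events import EventArray

# Toy segmentation mask: background is 0, each cell has its own integer label
mask = np.zeros((100, 100), dtype=np.uint16)
mask[10:20, 10:20] = 1
mask[50:70, 60:80] = 2

mask_events = EventArray.from_mask(mask, slide_id="EXAMPLE01", tile_n=42)
print(len(mask_events))                          # expected: 2, one event per mask label
print(mask_events.metadata["cell_id"].tolist())  # expected: [1, 2]
```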
1077 def save_csv(self, output_path: str) -> bool: 1078 """ 1079 Save the events to an CSV file, including metadata and features. 1080 :param output_path: 1081 :return: 1082 """ 1083 if not output_path.endswith(".csv"): 1084 output_path += ".csv" 1085 self.to_dataframe().to_csv(output_path, index=False) 1086 return os.path.exists(output_path)
Save the events to a CSV file, including metadata and features.
Parameters
- output_path: path for the output CSV file; ".csv" is appended if missing.
Returns
whether the output file exists after writing.
1088 @classmethod 1089 def load_csv( 1090 cls, 1091 input_path: str, 1092 metadata_prefix: str = "metadata_", 1093 features_prefix: str = "features_", 1094 ) -> Self: 1095 """ 1096 Load the events from an CSV file, including metadata and features. 1097 :param input_path: 1098 :param metadata_prefix: 1099 :param features_prefix: 1100 :return: 1101 """ 1102 # Load the CSV file 1103 df = pd.read_csv(input_path) 1104 return cls.from_dataframe(df, metadata_prefix, features_prefix)
Load the events from a CSV file, including metadata and features.
Parameters
- input_path: path to the CSV file to load.
- metadata_prefix: the prefix marking metadata columns.
- features_prefix: the prefix marking feature columns.
Returns
an EventArray with the file's data.
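A round-trip sketch for the CSV helpers (the file path is hypothetical):

```python
events.save_csv("example_events.csv")  # ".csv" is appended if missing
reloaded = EventArray.load_csv("example_events.csv")
print(len(reloaded) == len(events))    # True
```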
1106 def save_hdf5(self, output_path: str) -> bool: 1107 """ 1108 Save the events to an HDF5 file, including metadata and features. 1109 Uses the pandas-provided HDF5 functions for ease, and external compatibility, 1110 though these files are slightly harder to view in HDFView or similar. 1111 :param output_path: 1112 :return: 1113 """ 1114 if not output_path.endswith(".hdf5") and not output_path.endswith(".h5"): 1115 output_path += ".hdf5" 1116 # Open the output_path as an HDF5 file 1117 with pd.HDFStore(output_path) as store: 1118 # Store the dataframes in the HDF5 file 1119 if self.info is not None: 1120 store.put("info", self.info, index=False) 1121 if self.metadata is not None: 1122 store.put("metadata", self.metadata, index=False) 1123 if self.features is not None: 1124 store.put("features", self.features, index=False) 1125 return os.path.exists(output_path)
Save the events to an HDF5 file, including metadata and features. Uses the pandas-provided HDF5 functions for ease of use and external compatibility, though these files are slightly harder to view in HDFView or similar tools.
Parameters
- output_path: path for the output HDF5 file; ".hdf5" is appended unless the path already ends in .hdf5 or .h5.
Returns
whether the output file exists after writing.
1127 @classmethod 1128 def load_hdf5(cls, input_path: str) -> Self: 1129 """ 1130 Load the events from an HDF5 file, including metadata and features. 1131 :param input_path: 1132 :return: 1133 """ 1134 # Open the input_path as an HDF5 file 1135 with pd.HDFStore(input_path, "r") as store: 1136 # Load the dataframes from the HDF5 file 1137 info = store.get("info") if "info" in store else None 1138 metadata = store.get("metadata") if "metadata" in store else None 1139 features = store.get("features") if "features" in store else None 1140 return cls(info=info, metadata=metadata, features=features)
Load the events from an HDF5 file, including metadata and features.
Parameters
- input_path: path to the HDF5 file to load.
Returns
an EventArray with the file's data.
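The HDF5 helpers work the same way (hypothetical path). Note that pandas' HDFStore requires the PyTables package (`tables`) to be installed:

```python
events.save_hdf5("example_events.h5")  # ".hdf5" is appended unless the path ends in .hdf5/.h5
reloaded = EventArray.load_hdf5("example_events.h5")
```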
1142 def save_ocular(self, output_path: str, event_type: str = "cells"): 1143 """ 1144 Save the events to an OCULAR file. Relies on the dataframe originating 1145 from an OCULAR file (same columns; duplicate metadata/info). 1146 :param output_path: 1147 :param event_type: 1148 :return: 1149 """ 1150 if pyreadr is None: 1151 raise ModuleNotFoundError( 1152 "pyreadr not installed! Install pyreadr directly " 1153 "or run `pip install csi-images[rds]` option to resolve." 1154 ) 1155 if event_type == "cells": 1156 file_stub = "rc-final" 1157 elif event_type == "others": 1158 file_stub = "others-final" 1159 else: 1160 raise ValueError("Invalid event type. Must be cells or others.") 1161 1162 # Ensure good metadata 1163 metadata = pd.DataFrame( 1164 { 1165 "slide_id": self.info["slide_id"], 1166 "frame_id": self.info["tile"], 1167 "cell_id": ( 1168 self.metadata["cell_id"] 1169 if "cell_id" in self.metadata.columns 1170 else range(len(self.info)) 1171 ), 1172 "cellx": self.info["x"], 1173 "celly": self.info["y"], 1174 } 1175 ) 1176 if self.metadata is not None: 1177 metadata[self.metadata.columns] = self.metadata.copy() 1178 1179 # Check for the "ocular_interesting" column 1180 if event_type == "cells": 1181 if "ocular_interesting" in metadata.columns: 1182 interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool) 1183 elif "hcpc" in metadata.columns: 1184 # Interesting cells don't get an hcpc designation, leaving them as -1 1185 interesting_rows = ( 1186 metadata["hcpc"].to_numpy() == -1 1187 ) # interesting cells 1188 else: 1189 interesting_rows = [] 1190 if sum(interesting_rows) > 0: 1191 # Split the metadata into interesting and regular 1192 interesting_events = self.rows(interesting_rows) 1193 interesting_df = pd.concat( 1194 [interesting_events.features, interesting_events.metadata], axis=1 1195 ) 1196 data_events = self.rows(~interesting_rows) 1197 data_df = pd.concat( 1198 [data_events.features, data_events.metadata], axis=1 1199 ) 1200 data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore") 1201 1202 # Drop particular columns for "interesting" 1203 interesting_df = interesting_df.drop( 1204 [ 1205 "clust", 1206 "hcpc", 1207 "frame_id", 1208 "cell_id", 1209 "unique_id", 1210 "ocular_interesting", 1211 ], 1212 axis=1, 1213 errors="ignore", 1214 ) 1215 # Save both .csv and .rds 1216 interesting_stub = os.path.join(output_path, "ocular_interesting") 1217 interesting_df.to_csv(f"{interesting_stub}.csv") 1218 # Suppress pandas FutureWarning 1219 with warnings.catch_warnings(): 1220 warnings.simplefilter(action="ignore", category=FutureWarning) 1221 pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df) 1222 else: 1223 data_df = pd.concat([self.features, metadata], axis=1) 1224 else: 1225 # Get all data and reset_index (will copy it) 1226 data_df = pd.concat([self.features, metadata], axis=1) 1227 1228 # Split based on cluster number to conform to *-final[1-4].rds 1229 n_clusters = max(data_df["clust"]) + 1 1230 split_idx = [round(i * n_clusters / 4) for i in range(5)] 1231 for i in range(4): 1232 subset = (split_idx[i] <= data_df["clust"]) & ( 1233 data_df["clust"] < split_idx[i + 1] 1234 ) 1235 data_df.loc[subset, "hcpc"] = i + 1 1236 subset = data_df[subset].reset_index(drop=True) 1237 # Suppress pandas FutureWarning 1238 with warnings.catch_warnings(): 1239 warnings.simplefilter(action="ignore", category=FutureWarning) 1240 pyreadr.write_rds( 1241 os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset 1242 ) 1243 1244 # Create new example cell strings 
1245 data_df["example_cell_id"] = ( 1246 data_df["slide_id"] 1247 + " " 1248 + data_df["frame_id"].astype(str) 1249 + " " 1250 + data_df["cell_id"].astype(str) 1251 + " " 1252 + data_df["cellx"].astype(int).astype(str) 1253 + " " 1254 + data_df["celly"].astype(int).astype(str) 1255 ) 1256 # Find averagable data columns 1257 if "cellcluster_id" in data_df.columns: 1258 end_idx = data_df.columns.get_loc("cellcluster_id") 1259 else: 1260 end_idx = data_df.columns.get_loc("slide_id") 1261 avg_cols = data_df.columns[:end_idx].tolist() 1262 # Group by cluster and average 1263 data_df = data_df.groupby("clust").agg( 1264 **{col: (col, "mean") for col in avg_cols}, 1265 count=("clust", "size"), # count rows in each cluster 1266 example_cells=("example_cell_id", lambda x: ",".join(x)), 1267 hcpc=("hcpc", lambda x: x.iloc[0]), 1268 ) 1269 data_df = data_df.reset_index() # Do NOT drop, index is "clust" 1270 # Create new columns 1271 metadata = pd.DataFrame( 1272 { 1273 "count": data_df["count"], 1274 "example_cells": data_df["example_cells"], 1275 "clust": data_df["clust"].astype(int), 1276 "hcpc": data_df["hcpc"].astype(int), 1277 "id": data_df["clust"].astype(int).astype(str), 1278 "cccluster": "0", # Dummy value 1279 "ccdistance": 0.0, # Dummy value 1280 "rownum": list(range(len(data_df))), 1281 "framegroup": 0, # Dummy value 1282 } 1283 ) 1284 # Need to pad the features to 761 columns, as per OCULAR report needs 1285 additional_columns = range(len(avg_cols), 761) 1286 if len(additional_columns) > 0: 1287 padding = pd.DataFrame( 1288 np.zeros((len(data_df), len(additional_columns))), 1289 columns=[f"pad{i}" for i in additional_columns], 1290 ) 1291 data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1) 1292 else: 1293 data_df = pd.concat([data_df[avg_cols], metadata], axis=1) 1294 1295 # Save the cluster data 1296 data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv")) 1297 # Suppress pandas FutureWarning 1298 with warnings.catch_warnings(): 1299 warnings.simplefilter(action="ignore", category=FutureWarning) 1300 pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)
Save the events to OCULAR files (.csv and .rds). Relies on the EventArray originating from OCULAR files (same columns; duplicate metadata/info).
Parameters
- output_path: the directory to write the OCULAR files into.
- event_type: "cells" or "others".
Returns
None; the files are written into output_path.
1302 @classmethod 1303 def load_ocular( 1304 cls, 1305 input_path: str, 1306 event_type="cells", 1307 cell_data_files=( 1308 "rc-final1.rds", 1309 "rc-final2.rds", 1310 "rc-final3.rds", 1311 "rc-final4.rds", 1312 "ocular_interesting.rds", 1313 ), 1314 others_data_files=( 1315 "others-final1.rds", 1316 "others-final2.rds", 1317 "others-final3.rds", 1318 "others-final4.rds", 1319 ), 1320 atlas_data_files=( 1321 "ocular_interesting.rds", 1322 "ocular_not_interesting.rds", 1323 ), 1324 drop_common_events=True, 1325 ) -> Self: 1326 """ 1327 1328 :param input_path: 1329 :param event_type: 1330 :param cell_data_files: 1331 :param others_data_files: 1332 :param atlas_data_files: 1333 :param drop_common_events: 1334 :return: 1335 """ 1336 if pyreadr is None: 1337 raise ModuleNotFoundError( 1338 "pyreadr not installed! Install pyreadr directly " 1339 "or run `pip install csi-images[rds]` option to resolve." 1340 ) 1341 # Check if the input path is a directory or a file 1342 if os.path.isfile(input_path): 1343 data_files = [os.path.basename(input_path)] 1344 input_path = os.path.dirname(input_path) 1345 if event_type == "cells": 1346 data_files = cell_data_files 1347 elif event_type == "others": 1348 data_files = others_data_files 1349 else: 1350 raise ValueError("Invalid event type.") 1351 1352 # Load the data from the OCULAR files 1353 file_data = {} 1354 for file in data_files: 1355 file_path = os.path.join(input_path, file) 1356 if not os.path.isfile(file_path): 1357 warnings.warn(f"{file} not found for in {input_path}") 1358 continue 1359 file_data[file] = pyreadr.read_r(file_path) 1360 # Get the DataFrame associated with None (pyreadr dict quirk) 1361 file_data[file] = file_data[file][None] 1362 if len(file_data[file]) == 0: 1363 # File gets dropped from the dict 1364 file_data.pop(file) 1365 warnings.warn(f"{file} has no cells") 1366 continue 1367 1368 # Drop common cells if requested and in this file 1369 if ( 1370 file in atlas_data_files 1371 and drop_common_events 1372 and "catalogue_classification" in file_data[file] 1373 ): 1374 common_cell_indices = ( 1375 file_data[file]["catalogue_classification"] == "common_cell" 1376 ) 1377 file_data[file] = file_data[file][common_cell_indices == False] 1378 1379 if len(file_data[file]) == 0: 1380 # File gets dropped from the dict 1381 file_data.pop(file) 1382 warnings.warn(f"{file} has no cells after dropping common cells") 1383 continue 1384 1385 # Extract frame_id and cell_id 1386 # DAPI- events already have frame_id cell_id outside rowname 1387 if event_type == "cells" and "frame_id" not in file_data[file].columns: 1388 file_data[file]["rowname"] = file_data[file]["rowname"].astype("str") 1389 # get frame_id cell_id from rownames column and split into two columns 1390 split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True) 1391 if len(split_res.columns) != 2: 1392 warnings.warn( 1393 f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}' 1394 ) 1395 # then assign it back to the dataframe 1396 file_data[file][["frame_id", "cell_id"]] = split_res.astype("int") 1397 # Ensure frame_id and cell_id are integers 1398 file_data[file]["frame_id"] = file_data[file]["frame_id"].astype("int") 1399 file_data[file]["cell_id"] = file_data[file]["cell_id"].astype("int") 1400 # reset indexes since they can cause NaN values in concat 1401 file_data[file] = file_data[file].reset_index(drop=True) 1402 1403 # Merge the data from all files 1404 if len(file_data) == 0: 1405 return EventArray() 1406 elif len(file_data) == 1: 1407 data = 
[file_data[file] for file in file_data.keys()][0] 1408 else: 1409 data = pd.concat(file_data.values()) 1410 1411 # Others is missing the "slide_id". Insert it right before "frame_id" column 1412 if event_type == "others" and "slide_id" not in data.columns: 1413 if os.path.basename(input_path) == "ocular": 1414 slide_id = os.path.basename(os.path.dirname(input_path)) 1415 else: 1416 slide_id = "UNKNOWN" 1417 data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id) 1418 1419 # Sort according to ascending cell_id to keep the original, which is in manual_df 1420 data = data.sort_values(by=["cell_id"], ascending=True) 1421 # Filter out duplicates by x & y 1422 data = data.assign( 1423 unique_id=data["slide_id"] 1424 + "_" 1425 + data["frame_id"].astype(str) 1426 + "_" 1427 + data["cellx"].astype(int).astype(str) 1428 + "_" 1429 + data["celly"].astype(int).astype(str) 1430 ) 1431 data = data.drop_duplicates(subset=["unique_id"], keep="first") 1432 # Normal unique_id is with cell_id 1433 data = data.assign( 1434 unique_id=data["slide_id"] 1435 + "_" 1436 + data["frame_id"].astype(str) 1437 + "_" 1438 + data["cell_id"].astype(str) 1439 ) 1440 data = data.reset_index(drop=True) 1441 # All columns up to "slide_id" are features; drop the "slide_id" 1442 features = data.loc[:, :"slide_id"].iloc[:, :-1] 1443 data = data.loc[:, "slide_id":] 1444 # Grab the info columns 1445 info = data[["slide_id", "frame_id", "cellx", "celly"]] 1446 info.columns = ["slide_id", "tile", "x", "y"] 1447 info = info.assign(roi=0) # OCULAR only works on 1 ROI, as far as known 1448 info = info[["slide_id", "tile", "roi", "x", "y"]] 1449 # Metadata has duplicate columns for later convenience 1450 metadata = data 1451 # Certain columns tend to be problematic with mixed data formats... 1452 for col in ["TRITC", "CY5", "FITC"]: 1453 if col in metadata: 1454 labels = { 1455 "False": False, 1456 "True": True, 1457 "FALSE": False, 1458 "TRUE": True, 1459 False: False, 1460 True: True, 1461 } 1462 metadata[col] = metadata[col].map(labels).astype(bool) 1463 for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]: 1464 if col in metadata: 1465 metadata[col] = metadata[col].fillna(-1).astype(int) 1466 return EventArray(info, metadata, features)
Load events from OCULAR .rds files, including metadata and features.
Parameters
- input_path: path to an OCULAR results directory, or to a single .rds file within one.
- event_type: "cells" or "others"; selects which default file set to load when input_path is a directory.
- cell_data_files: the file names loaded for "cells".
- others_data_files: the file names loaded for "others".
- atlas_data_files: atlas file names in which events classified as "common_cell" may be dropped.
- drop_common_events: whether to drop events classified as "common_cell" in the atlas files.
Returns
an EventArray with the loaded events.
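A sketch of the OCULAR helpers, assuming `/path/to/scan/ocular` is an existing OCULAR output directory (hypothetical path) and pyreadr is installed (`pip install csi-images[rds]`):

```python
from csi_images.csi_events import EventArray

cells = EventArray.load_ocular("/path/to/scan/ocular")                        # rc-final*.rds and ocular_interesting.rds
others = EventArray.load_ocular("/path/to/scan/ocular", event_type="others")  # others-final*.rds

# Saving back out expects OCULAR-style columns (e.g. clust) to still be present
cells.save_ocular("/path/to/output/ocular", event_type="cells")
```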