csi_images.csi_events
Contains the Event class, which represents a single event in a scan. The Event class optionally holds metadata and features. Lists of events with similar metadata or features can be combined into DataFrames for analysis.
The Event class holds the position of the event in the frame, which can be converted to scanner or slide coordinates. See the csi_images.csi_scans documentation page for more information on the coordinate systems.
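As a quick orientation, here is a minimal sketch of converting an event's in-frame position into scanner and slide coordinates. The slide ID, tile number, and pixel position are illustrative; Scan.make_placeholder and Tile are called the same way they are used inside EventArray.to_events in the source below, and whether a placeholder scan carries real geometry depends on how make_placeholder fills in the scan metadata.

from csi_images.csi_scans import Scan
from csi_images.csi_tiles import Tile
from csi_images.csi_events import Event

# Placeholder scan and tile with illustrative values (slide ID, tile 100, ROI 0)
scan = Scan.make_placeholder("EXAMPLE_SLIDE", 100, 0)
tile = Tile(scan, 100, 0)
event = Event(tile, x=250, y=300)  # pixel position within the tile's frame

print(event.get_scan_position())   # (x_um, y_um) in the scanner's coordinate frame
print(event.get_slide_position())  # (x_um, y_um) in the slide's coordinate frame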
1""" 2Contains the Event class, which represents a single event in a scan. 3The Event class optionally holds metadata and features. Lists of events with 4similar metadata or features can be combined into DataFrames for analysis. 5 6The Event class holds the position of the event in the frame, which can be converted 7to the position in the scanner or slide coordinate positions. See the 8csi_utils.csi_scans documentation page for more information on the coordinate systems. 9""" 10 11import os 12import glob 13import math 14import warnings 15from typing import Self, Iterable, Hashable, Sequence 16 17import numpy as np 18import pandas as pd 19 20from .csi_scans import Scan 21from .csi_tiles import Tile 22from .csi_frames import Frame 23 24# Optional dependencies; will raise errors in particular functions if not installed 25try: 26 from . import csi_images 27except ImportError: 28 csi_images = None 29try: 30 import imageio.v3 as imageio 31except ImportError: 32 imageio = None 33try: 34 import pyreadr 35except ImportError: 36 pyreadr = None 37 38 39class Event: 40 """ 41 A class that represents a single event in a scan, making it easy to evaluate 42 singular events. Required metadata is exposed as attributes, and optional 43 metadata and features are stored as DataFrames. 44 """ 45 46 SCAN_TO_SLIDE_TRANSFORM = { 47 # Axioscan zero is in the top-right corner instead of top-left 48 Scan.Type.AXIOSCAN7: np.array( 49 [ 50 [1, 0, 75000], 51 [0, 1, 0], 52 [0, 0, 1], 53 ] 54 ), 55 # BZScanner coordinates are a special kind of messed up: 56 # - The slide is upside-down. 57 # - The slide is oriented vertically, with the barcode at the bottom. 58 # - Tiles are numbered from the top-right 59 Scan.Type.BZSCANNER: np.array( 60 [ 61 [0, -1, 75000], 62 [-1, 0, 25000], 63 [0, 0, 1], 64 ] 65 ), 66 } 67 """ 68 Homogeneous transformation matrices for converting between scanner and slide 69 coordinates. The matrices are 3x3, with the final column representing the 70 translation in micrometers (um). For more information, see 71 [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations). 72 73 Transformations are nominal, and accuracy is not guaranteed; this is due to 74 imperfections in slides and alignment in the scanners. Units are in micrometers. 75 """ 76 77 def __init__( 78 self, 79 tile: Tile, 80 x: int, 81 y: int, 82 metadata: pd.Series = None, 83 features: pd.Series = None, 84 ): 85 self.tile = tile 86 self.x = int(x) 87 self.y = int(y) 88 self.metadata = metadata 89 self.features = features 90 91 def __repr__(self) -> str: 92 return f"{self.tile}-{self.x}-{self.y}" 93 94 def __eq__(self, other) -> bool: 95 return self.__repr__() == other.__repr__() 96 97 def __lt__(self, other): 98 return self.__repr__() < other.__repr__() 99 100 def get_scan_position(self) -> tuple[float, float]: 101 """ 102 Get the position of the event in the scanner's coordinate frame. 103 :return: the scan position of the event in micrometers (um). 
104 """ 105 # Get overall pixel position 106 real_tile_height, real_tile_width = self.tile.scan.get_image_size() 107 pixel_x = self.x + (real_tile_width * self.tile.x) 108 pixel_y = self.y + (real_tile_height * self.tile.y) 109 # Convert to micrometers 110 x_um = pixel_x * self.tile.scan.pixel_size_um 111 y_um = pixel_y * self.tile.scan.pixel_size_um 112 # Add the scan's origin in the scanner frame 113 x_um += self.tile.scan.roi[self.tile.n_roi].origin_x_um 114 y_um += self.tile.scan.roi[self.tile.n_roi].origin_y_um 115 return x_um, y_um 116 117 def get_slide_position(self) -> tuple[float, float]: 118 """ 119 Get the slide position of the event in micrometers (um). 120 :return: the slide position of the event. 121 """ 122 # Turn scan_position into a 3x1 vector 123 scan_position = self.get_scan_position() 124 scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]]) 125 126 # Multiply by the appropriate homogeneous matrix 127 if self.tile.scan.scanner_id.startswith(self.tile.scan.Type.AXIOSCAN7.value): 128 transform = self.SCAN_TO_SLIDE_TRANSFORM[self.tile.scan.Type.AXIOSCAN7] 129 elif self.tile.scan.scanner_id.startswith(self.tile.scan.Type.BZSCANNER.value): 130 transform = self.SCAN_TO_SLIDE_TRANSFORM[self.tile.scan.Type.BZSCANNER] 131 else: 132 raise ValueError(f"Scanner type {self.tile.scan.scanner_id} not supported.") 133 slide_position = np.matmul(transform, scan_position) 134 return float(slide_position[0][0]), float(slide_position[1][0]) 135 136 def crop( 137 self, images: Iterable[np.ndarray], crop_size: int = 100, in_pixels: bool = True 138 ) -> list[np.ndarray]: 139 """ 140 Crop the event from the provided frame images. Use if you have already gotten 141 frame images; useful for cropping multiple events from the same frame image. 142 :param images: the frame images. 143 :param crop_size: the square size of the image crop to get for this event. 144 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 145 :return: image_size x image_size crops of the event in the provided frames. If 146 the event is too close to the edge, the crop will be smaller and not centered. 
147 """ 148 # Convert a crop size in micrometers to pixels 149 if not in_pixels: 150 crop_size = round(crop_size / self.tile.scan.pixel_size_um) 151 image_height, image_width = 0, 0 152 for image in images: 153 if image_height == 0 and image_width == 0: 154 image_height, image_width = image.shape 155 else: 156 if image_height != image.shape[0] or image_width != image.shape[1]: 157 raise ValueError("All images must be the same size") 158 if image_height == 0 or image_width == 0: 159 raise ValueError("No images provided") 160 161 # Find the crop bounds 162 bounds = [ 163 self.x - (crop_size // 2) + 1, 164 self.y - (crop_size // 2) + 1, 165 self.x + math.ceil(crop_size / 2) + 1, 166 self.y + math.ceil(crop_size / 2) + 1, 167 ] 168 # Determine how much the bounds violate the image size 169 displacements = [ 170 max(0, -bounds[0]), 171 max(0, -bounds[1]), 172 max(0, bounds[2] - image_width), 173 max(0, bounds[3] - image_height), 174 ] 175 # Cap off the bounds 176 bounds = [ 177 max(0, bounds[0]), 178 max(0, bounds[1]), 179 min(image_width, bounds[2]), 180 min(image_height, bounds[3]), 181 ] 182 183 # Crop the images 184 crops = [] 185 for image in images: 186 # Create a blank image of the right size 187 crop = np.zeros((crop_size, crop_size), dtype=image.dtype) 188 189 # Insert the cropped image into the blank image, leaving a black buffer 190 # around the edges if the crop would go beyond the original image bounds 191 crop[ 192 displacements[1] : crop_size - displacements[3], 193 displacements[0] : crop_size - displacements[2], 194 ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]] 195 crops.append(crop) 196 return crops 197 198 def get_crops( 199 self, 200 crop_size: int = 100, 201 in_pixels: bool = True, 202 input_path: str = None, 203 channels: Iterable[int | str] = None, 204 apply_gain: bool | Iterable[bool] = True, 205 ) -> list[np.ndarray]: 206 """ 207 Gets the frame images for this event and then crops the event from the images. 208 Convenient for retrieving a single event's crops, but less efficient when 209 retrieving multiple events from the same tile as it will reread the images. 210 :param crop_size: the square size of the image crop to get for this event. 211 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 212 :param input_path: the path to the input images. Defaults to None (uses the scan's path). 213 :param channels: the channels to extract images for. Defaults to all channels. 214 :param apply_gain: whether to apply scanner-calculated gain to the images, if 215 not already applied. If a list, matches the channels. 216 :return: a list of cropped images from the scan in the order of the channels. 217 """ 218 # This function validates channels 219 frames = Frame.get_frames(self.tile, channels) 220 # Convert individual inputs to lists of appropriate length 221 if isinstance(apply_gain, bool): 222 apply_gain = [apply_gain] * len(frames) 223 images = [f.get_image(input_path, a) for f, a in zip(frames, apply_gain)] 224 return self.crop(images, crop_size, in_pixels) 225 226 def save_crops( 227 self, 228 crops: Sequence[np.ndarray], 229 output_path: str, 230 labels: Sequence[str], 231 ext: str = "auto", 232 ): 233 """ 234 Save the crops to image files. 235 :param crops: the crops to save. Will save as RGB if 3 channel [h, w, 3] or 236 grayscale if 1 channel [h, w] or [h, w, 1]. 237 :param labels: the labels to append to the file name, usually the channel names 238 associated with each crop. 
        :param output_path: the folder to save the crops to. Will make if needed.
        :param ext: the file extension to save the crops as. Defaults to "auto", which
        will save as .tif for grayscale images and .jpg for RGB images.
        :return: None
        """
        if len(crops) != len(labels):
            raise ValueError("Crops and labels must be the same length")

        if csi_images is None or imageio is None:
            raise ModuleNotFoundError(
                "imageio libraries not installed! "
                "run `pip install csi_images[imageio]` to resolve."
            )

        os.makedirs(output_path, exist_ok=True)

        for crop, label in zip(crops, labels):
            if ext == "auto":
                if len(crop.shape) == 2 or crop.shape[2] == 1:
                    file_extension = ".tif"
                elif crop.shape[2] == 3:
                    file_extension = ".jpg"
                else:
                    warnings.warn(
                        f"Image shape {crop.shape} not recognized; saving as .tif"
                    )
                    file_extension = ".tif"
            else:
                file_extension = ext
            file = os.path.join(output_path, f"{self}-{label}{file_extension}")
            # TODO: add more file types here
            if file_extension == ".tif":
                imageio.imwrite(file, crop, compression="deflate")
            elif file_extension in [".jpg", ".jpeg"]:
                crop = csi_images.scale_bit_depth(crop, np.uint8)
                imageio.imwrite(file, crop, quality=80)
            else:
                imageio.imwrite(file, crop)

    def load_crops(
        self, input_path: str, labels: list[str] = None
    ) -> dict[str, np.ndarray]:
        """
        Loads previously saved crop files from a folder.
        :param input_path: folder containing crop files.
        :param labels: optional label filter, will only return crops with these labels.
        :return: a dict mapping labels to the loaded crops.
        """
        crops = {}
        for file in glob.glob(os.path.join(input_path, f"{self}-*")):
            label = os.path.splitext(os.path.basename(file))[0].split("-")[-1]
            # Skip if we have labels to target
            if labels is not None and label not in labels:
                continue
            crops[label] = imageio.imread(file)
        return crops

    def get_montage_channels(
        self,
        channels: Sequence[int | str] | None = None,
        composites: dict[int | str, tuple[float, float, float]] | None = None,
    ) -> tuple[list[int], list[int], dict[int, tuple[float, float, float]]]:
        """
        Get the channel names for the montage from the event's tile.
        :param channels: channel indices or names for grayscale channels
        :param composites: dictionary of channel indices or names and RGB values
        :return: (1) channel indices to retrieve,
        (2) relative grayscale channel indices, and
        (3) composite channel indices and RGB values.
308 """ 309 if channels is None: 310 channels = list(range(len(self.tile.scan.channels))) 311 if (len(channels) == 0) and (composites is None or len(composites) == 0): 312 raise ValueError("Must provide at least one channel type to montage") 313 314 channels_to_get = [] 315 316 # Build the list of channels to retrieve 317 if channels is not None: 318 if isinstance(channels[0], str): 319 channels = self.tile.scan.get_channel_indices(channels) 320 channels_to_get += channels 321 order = list(range(len(channels))) # Always the first n channels 322 else: 323 order = None 324 325 if composites is not None: 326 relative_composites = {} # Relative indices for retrieved channels 327 # Convert to scan indices 328 rgb_channels = list(composites.keys()) 329 if isinstance(rgb_channels[0], str): 330 rgb_channels = self.tile.scan.get_channel_indices(rgb_channels) 331 # Find the index or add to the end 332 for channel, rgb in zip(rgb_channels, composites.values()): 333 if channel not in channels_to_get: 334 channels_to_get.append(channel) 335 relative_composites[channel] = rgb 336 else: 337 relative_composites[channels_to_get.index(channel)] = rgb 338 else: 339 relative_composites = None 340 341 return channels_to_get, order, relative_composites 342 343 def get_montage( 344 self, 345 channels: Sequence[int | str] = None, 346 composites: dict[int | str, tuple[float, float, float]] = None, 347 mask: np.ndarray[np.uint8] = None, 348 labels: Sequence[str] = None, 349 crop_size: int = 100, 350 in_pixels: bool = True, 351 input_path: str = None, 352 apply_gain: bool = True, 353 **kwargs, 354 ) -> np.ndarray: 355 """ 356 Convenience function for getting frame images and creating a montage. Mirrors 357 csi_images.make_montage(). Convenient for a single event's montage, but less 358 efficient when for multiple events from the same tile. 359 :param channels: the channels to use for black-and-white montages. 360 :param composites: dictionary of indices and RGB tuples for a composite. 361 :param mask: a mask to apply to the montage. Must be the same size as the crop. 362 :param labels: the labels to subtitle montage images, usually the channel names 363 :param crop_size: the square size of the image crop to get for this event. 364 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 365 :param input_path: the path to the input images. Defaults to None (uses the scan's path). 366 :param apply_gain: whether to apply scanner-calculated gain to the images, if 367 not already applied. If a list, matches the channels. 368 :param kwargs: montage options. See csi_images.make_montage() for more details. 369 :return: numpy array representing the montage. 370 """ 371 channels, order, composites = self.get_montage_channels(channels, composites) 372 images = self.get_crops(crop_size, in_pixels, input_path, channels, apply_gain) 373 return csi_images.make_montage( 374 images, order, composites, mask, labels, **kwargs 375 ) 376 377 def save_montage( 378 self, 379 montage: np.ndarray, 380 output_path: str, 381 ocular_names: bool = False, 382 tag: str = "", 383 file_extension: str = ".jpeg", 384 **kwargs, 385 ): 386 """ 387 Save the montage as a JPEG image with a set name. 388 :param montage: the montage to save. 389 :param output_path: the folder to save the montage in. Will make if needed. 390 :param ocular_names: whether to use the OCULAR naming convention. 391 :param tag: a tag to append to the file name. Ignored if ocular_names is True. 
        :param file_extension: the file extension to save the montage as. Defaults to .jpeg.
        :param kwargs: additional arguments to pass to imageio.imwrite().
        :return: None
        """
        if csi_images is None or imageio is None:
            raise ModuleNotFoundError(
                "imageio libraries not installed! "
                "run `pip install csi_images[imageio]` to resolve."
            )

        montage = csi_images.scale_bit_depth(montage, np.uint8)

        if not file_extension.startswith("."):
            file_extension = f".{file_extension}"

        if ocular_names:
            if "cell_id" not in self.metadata.index:
                raise ValueError(
                    "Event metadata must include 'cell_id' for OCULAR naming."
                )
            file = f"{self.tile.n}-{self.metadata['cell_id']}-{self.x}-{self.y}{file_extension}"
        else:
            file = f"{self}{tag}{file_extension}"

        os.makedirs(output_path, exist_ok=True)
        imageio.imwrite(os.path.join(output_path, file), montage, **kwargs)

    def load_montage(self, input_path: str, tag: str = "") -> np.ndarray:
        """
        Loads the montage from a file saved by Event.save_montage.
        :param input_path: the path to the folder where the montage was saved.
        :param tag: a string to add to the file name, before the extension.
        :return: the montage image as a numpy array.
        """
        file = f"{self}{tag}.jpeg"
        return imageio.imread(os.path.join(input_path, file))

    @classmethod
    def get_many_crops(
        cls,
        events: Sequence[Self],
        crop_size: int | Sequence[int] = 100,
        in_pixels: bool = True,
        input_path: str | Sequence[str] = None,
        channels: Sequence[int | str] = None,
        apply_gain: bool | Sequence[bool] = True,
    ) -> list[list[np.ndarray]]:
        """
        Get the crops for a list of events, ensuring that there is no wasteful reading
        of the same tile multiple times. This function is more efficient than calling
        get_crops() for each event.
        :param events: the events to get crops for.
        :param crop_size: the square size of the image crop to get for this event.
        Defaults to 100 pixels; can be a sequence to vary per event.
        :param in_pixels: whether the crop size is in pixels or micrometers.
        Defaults to pixels.
        :param input_path: the path to the input images. Will only work for lists of events
        from the same scan. Defaults to None (uses the scan's path).
        :param channels: the channels to extract images for. Defaults to all channels.
        :param apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. Defaults to True.
        Can be supplied as a list to apply gain to individual channels.
        :return: a list of lists of cropped images for each event.
454 """ 455 if len(events) == 0: 456 return [] 457 # Adapt singular inputs to lists of appropriate length 458 if isinstance(crop_size, int): 459 crop_size = [crop_size] * len(events) 460 if input_path is None or isinstance(input_path, str): 461 input_path = [input_path] * len(events) 462 463 # Get the order of the events when sorted by slide/tile 464 order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__())) 465 466 # Allocate the list to size 467 crops = [[]] * len(events) 468 last_tile = None 469 images = None # Holds large numpy arrays, so expensive to compare 470 # Iterate through in slide/tile sorted order 471 for i in order: 472 if last_tile != events[i].tile: 473 # Gather the frame images, preserving them for the next event 474 frames = Frame.get_frames(events[i].tile, channels) 475 if isinstance(apply_gain, bool): 476 apply = [apply_gain] * len(frames) 477 else: 478 apply = apply_gain 479 images = [f.get_image(input_path[i], a) for f, a in zip(frames, apply)] 480 last_tile = events[i].tile 481 # Use the frame images to crop the event images 482 crops[i] = events[i].crop(images, crop_size[i], in_pixels) 483 return crops 484 485 @classmethod 486 def get_many_montages( 487 cls, 488 events: Sequence[Self], 489 channels: Sequence[int | str] = None, 490 composites: dict[int | str, tuple[float, float, float]] = None, 491 masks: Sequence[np.ndarray[np.uint8]] = None, 492 labels: Sequence[str] = None, 493 crop_size: int = 100, 494 in_pixels: bool = True, 495 input_path: str = None, 496 apply_gain: bool | Iterable[bool] = True, 497 **kwargs, 498 ) -> list[np.ndarray]: 499 """ 500 Convenience function for get_montage(), but for a list of events. More efficient 501 than get_montage() when working with multiple events from the same tile. 502 :param events: a list of Event objects. 503 :param channels: the channels to extract images for. Defaults to all channels. 504 :param composites: dictionary of indices and RGB tuples for a composite. 505 :param masks: a list of masks to apply to the montages. Must be the same size as the crops. 506 :param labels: the labels to subtitle montage images, usually the channel names 507 :param crop_size: the square size of the image crop to get for this event. 508 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 509 :param input_path: the path to the input images. Defaults to None (uses the scan's path). 510 :param apply_gain: whether to apply scanner-calculated gain to the images, if 511 not already applied. If a list, matches the channels. 512 :param kwargs: montage options. See csi_images.make_montage() for more details. 513 :return: a list of numpy arrays representing the montages. 
514 """ 515 if len(events) == 0: 516 return [] 517 # Adapt singular inputs to lists of appropriate length 518 if isinstance(crop_size, int): 519 crop_size = [crop_size] * len(events) 520 if input_path is None or isinstance(input_path, str): 521 input_path = [input_path] * len(events) 522 if masks is None or isinstance(masks, np.ndarray): 523 masks = [masks] * len(events) 524 525 # Get the order of the events when sorted by slide/tile 526 event_order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__())) 527 528 # Allocate the list to size 529 montages = [np.empty(0)] * len(events) 530 # Placeholder variables to avoid rereading the same tile 531 images = None # Holds large numpy arrays, so expensive to compare 532 order = None 533 rel_composites = None 534 last_tile = None 535 # Iterate through in slide/tile sorted order 536 for i in event_order: 537 if last_tile != events[i].tile: 538 channels_to_get, order, rel_composites = events[i].get_montage_channels( 539 channels, composites 540 ) 541 # Gather the frame images, preserving them for the next event 542 frames = Frame.get_frames(events[i].tile, channels_to_get) 543 if isinstance(apply_gain, bool): 544 apply = [apply_gain] * len(frames) 545 else: 546 apply = apply_gain 547 images = [f.get_image(input_path[i], a) for f, a in zip(frames, apply)] 548 last_tile = events[i].tile 549 # Use the frame images to crop the event images and make montages 550 crops = events[i].crop(images, crop_size[i], in_pixels) 551 montages[i] = csi_images.make_montage( 552 crops, order, rel_composites, masks[i], labels, **kwargs 553 ) 554 555 return montages 556 557 @classmethod 558 def get_and_save_many_crops( 559 cls, 560 events: list[Self], 561 output_path: str, 562 labels: Sequence[str], 563 ext: str = "auto", 564 additional_gain: Sequence[float] = None, 565 **kwargs, 566 ) -> None: 567 """ 568 Get and save the crops for a list of events, ensuring that there is no wasteful 569 reading and limiting the image data in memory to 1 tile at a time. This function 570 is more efficient that chaining get_crops() and save_crops() for each event or 571 get_many_crops() and then save_crops(). 572 :param events: list of events to get, crop, and save. 573 :param output_path: the folder to save the crops in. Will make if needed. 574 :param labels: the labels to save the crops with. See save_crops(). 575 :param ext: the file extension to save the crops as. See save_crops(). 576 :param additional_gain: additional gain to apply to the crops. If not None, must 577 match the length of the number of crop channels. 578 :param kwargs: see get_many_crops() for more parameters. 579 :return: 580 """ 581 unique_tiles = set([event.tile for event in events]) 582 583 for tile in unique_tiles: 584 # Get one tile's worth of event crops 585 tile_events = [e for e in events if e.tile == tile] 586 crops_list = cls.get_many_crops(tile_events, **kwargs) 587 for event, crops in zip(tile_events, crops_list): 588 # Apply any additional gains 589 if additional_gain is not None: 590 crops = [gain * crop for gain, crop in zip(additional_gain, crops)] 591 event.save_crops(crops, output_path, labels, ext) 592 593 @classmethod 594 def get_and_save_many_montages( 595 cls, 596 events: list[Self], 597 output_path: str, 598 ocular_names: bool = False, 599 tag: str = "", 600 **kwargs, 601 ) -> None: 602 """ 603 Save montages of the events to image files. 604 :param events: the events to get, montage, and save. 605 :param output_path: the folder to save the montages to. Will make if needed. 
        :param ocular_names: whether to use the OCULAR naming convention.
        :param tag: a tag to append to the file name. Ignored if ocular_names is True.
        :param kwargs: see get_many_montages() for more parameters.
        """
        unique_tiles = set([event.tile for event in events])

        for tile in unique_tiles:
            # Get one tile's worth of event crops
            tile_events = [e for e in events if e.tile == tile]
            montages = cls.get_many_montages(tile_events, **kwargs)
            for event, montage in zip(tile_events, montages):
                event.save_montage(montage, output_path, ocular_names, tag)


class EventArray:
    """
    A class that holds a large number of events' data, making it easy to analyze and
    manipulate many events at once. A more separated version of the Event class.
    """

    INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y"]

    def __init__(
        self,
        info: pd.DataFrame = None,
        metadata: pd.DataFrame = None,
        features: pd.DataFrame = None,
    ):

        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y"
        self.info = info
        if self.info is not None:
            # Special case: "roi" is often not required, so we'll fill it in if it's missing
            if "roi" not in info.columns:
                self.info = self.info.assign(roi=0)
            if set(self.info.columns) != set(self.INFO_COLUMNS):
                raise ValueError(
                    f"EventArray.info must have columns: "
                    f"{self.INFO_COLUMNS}; had {list(self.info.columns)}"
                )
            # Ensure order and data types
            self.info = pd.DataFrame(
                {
                    "slide_id": self.info["slide_id"].astype(str),
                    "tile": self.info["tile"].astype(np.uint16),
                    "roi": self.info["roi"].astype(np.uint8),
                    "x": self.info["x"].round().astype(np.uint16),
                    "y": self.info["y"].round().astype(np.uint16),
                }
            )

        # All DataFrames must all have the same number of rows
        if metadata is not None and (info is None or len(info) != len(metadata)):
            raise ValueError(
                "If EventArray.metadata is not None, it should match rows with .info"
            )
        if features is not None and (info is None or len(info) != len(features)):
            raise ValueError(
                "If EventArray.features is not None, it should match rows with .info"
            )
        # No columns named "metadata_", "features_", or "None"
        column_names = []
        if metadata is not None:
            column_names += metadata.columns.tolist()
        if features is not None:
            column_names += features.columns.tolist()
        if any([col.lower().startswith("metadata_") for col in column_names]):
            raise ValueError("EventArray column names cannot start with 'metadata_'")
        if any([col.lower().startswith("features_") for col in column_names]):
            raise ValueError("EventArray column names cannot start with 'features_'")
        if any([col.lower() == "none" for col in column_names]):
            raise ValueError("EventArray column names cannot be 'none'")

        # Add metadata and features
        self.metadata = None
        self.features = None
        if metadata is not None:
            self.add_metadata(metadata)
        if features is not None:
            self.add_features(features)

    def __len__(self) -> int:
        # Convenience method to get the number of events
        if self.info is None:
            return 0
        else:
            return len(self.info)

    def __eq__(self, other):
        # Parse all possibilities for info
        if isinstance(self.info, pd.DataFrame):
            if isinstance(other.info, pd.DataFrame):
                if not self.info.equals(other.info):
                    return False
            else:
                return False
        elif self.info is None:
            if other.info is not None:
                return False

        # Parse all possibilities for metadata
        if isinstance(self.metadata, pd.DataFrame):
            if isinstance(other.metadata, pd.DataFrame):
                if not self.metadata.equals(other.metadata):
                    return False
            else:
                return False
        elif self.metadata is None:
            if other.metadata is not None:
                return False

        # Parse all possibilities for features
        if isinstance(self.features, pd.DataFrame):
            if isinstance(other.features, pd.DataFrame):
                if not self.features.equals(other.features):
                    return False
            else:
                return False
        elif self.features is None:
            if other.features is not None:
                return False

        # All present attributes matched
        return True

    def get_sort_order(
        self, by: Hashable | Sequence[Hashable], ascending: bool | Sequence[bool] = True
    ):
        """
        Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.
        :param by: name of the column(s) to sort by.
        :param ascending: whether to sort in ascending order; can be a list to match by
        :return: the order of the indices to sort by.
        """
        columns = self.get(by)
        return columns.sort_values(by=by, ascending=ascending).index

    def sort(
        self,
        by: Hashable | Sequence[Hashable],
        ascending: bool | Sequence[bool] = True,
    ) -> Self:
        """
        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
        :param by: name of the column(s) to sort by.
        :param ascending: whether to sort in ascending order; can be a list to match by
        :return: a new, sorted EventArray.
        """
        order = self.get_sort_order(by, ascending)
        info = self.info.loc[order].reset_index(drop=True)
        if self.metadata is not None:
            metadata = self.metadata.loc[order].reset_index(drop=True)
        else:
            metadata = None
        if self.features is not None:
            features = self.features.loc[order].reset_index(drop=True)
        else:
            features = None
        return EventArray(info, metadata, features)

    def get(self, column_names: Hashable | Sequence[Hashable]) -> pd.DataFrame:
        """
        Get a DataFrame with the specified columns from the EventArray, by value.
        :param column_names: the names of the columns to get.
        :return: a DataFrame with the specified columns.
        """
        if isinstance(column_names, Hashable):
            column_names = [column_names]  # Drop into a list for the loop
        columns = []
        for column_name in column_names:
            if column_name in self.info.columns:
                columns.append(self.info[column_name])
            elif self.metadata is not None and column_name in self.metadata.columns:
                columns.append(self.metadata[column_name])
            elif self.features is not None and column_name in self.features.columns:
                columns.append(self.features[column_name])
            else:
                raise ValueError(f"Column {column_name} not found in EventArray")
        return pd.concat(columns, axis=1)

    def rows(self, rows: Sequence[Hashable]) -> Self:
        """
        Get a subset of the EventArray rows based on a boolean or integer index, by value.
        :param rows: row labels, indices, or boolean mask; anything for .loc[]
        :return: a new EventArray with the subset of events.
792 """ 793 info = self.info.loc[rows].reset_index(drop=True) 794 if self.metadata is not None: 795 metadata = self.metadata.loc[rows].reset_index(drop=True) 796 else: 797 metadata = None 798 if self.features is not None: 799 features = self.features.loc[rows].reset_index(drop=True) 800 else: 801 features = None 802 return EventArray(info, metadata, features) 803 804 def copy(self) -> Self: 805 """ 806 Create a deep copy of the EventArray. 807 :return: a deep copy of the EventArray. 808 """ 809 return EventArray( 810 info=self.info.copy(), 811 metadata=None if self.metadata is None else self.metadata.copy(), 812 features=None if self.features is None else self.features.copy(), 813 ) 814 815 # TODO: add a "filter" convenience function that takes a column name and values to filter by 816 817 def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None: 818 """ 819 Add metadata to the EventArray. Removes the need to check if metadata is None. 820 Overwrites any existing metadata with the same column names as the new metadata. 821 :param new_metadata: the metadata to add. 822 """ 823 if self.info is None or len(self.info) != len(new_metadata): 824 raise ValueError("New metadata must match length of existing info") 825 826 if isinstance(new_metadata, pd.Series): 827 # Convert to a DataFrame 828 new_metadata = pd.DataFrame(new_metadata) 829 830 for col in new_metadata.columns: 831 if col in self.INFO_COLUMNS: 832 warnings.warn( 833 f"Column name {col} is reserved for info; you can only " 834 "access this column through the .metadata attribute" 835 ) 836 elif self.features is not None and col in self.features.columns: 837 warnings.warn( 838 f"Column name {col} also exists in the .features attribute; " 839 f"calling this.get({col}) will return the .metadata column" 840 ) 841 842 if self.metadata is None: 843 self.metadata = new_metadata 844 else: 845 self.metadata.loc[:, new_metadata.columns] = new_metadata 846 847 def add_features(self, new_features: pd.Series | pd.DataFrame) -> None: 848 """ 849 Add features to the EventArray. Removes the need to check if features is None. 850 Overwrites any existing features with the same column names as the new features. 851 :param new_features: the features to add. 852 """ 853 if self.info is None or len(self.info) != len(new_features): 854 raise ValueError("New features must match length of existing info") 855 856 if isinstance(new_features, pd.Series): 857 # Convert to a DataFrame 858 new_features = pd.DataFrame(new_features) 859 860 for col in new_features.columns: 861 if col in self.INFO_COLUMNS: 862 warnings.warn( 863 f"Column name {col} is reserved for info; you can only " 864 "access this column through the .features attribute" 865 ) 866 elif self.metadata is not None and col in self.metadata.columns: 867 warnings.warn( 868 f"Column name {col} already exists in the .metadata attribute;" 869 f"calling this.get({col}) will return the .metadata column" 870 ) 871 872 if self.features is None: 873 self.features = new_features 874 else: 875 self.features.loc[:, new_features.columns] = new_features 876 877 @classmethod 878 def merge(cls, events: Iterable[Self]) -> Self: 879 """ 880 Combine EventArrays in a list into a single EventArray. 881 :param events: the new list of events. 
882 """ 883 all_info = [] 884 all_metadata = [] 885 all_features = [] 886 for event_array in events: 887 # Skip empty EventArrays 888 if event_array.info is not None: 889 all_info.append(event_array.info) 890 if event_array.metadata is not None: 891 all_metadata.append(event_array.metadata) 892 if event_array.features is not None: 893 all_features.append(event_array.features) 894 if len(all_info) == 0: 895 return EventArray() 896 else: 897 all_info = pd.concat(all_info, ignore_index=True) 898 if len(all_metadata) == 0: 899 all_metadata = None 900 else: 901 all_metadata = pd.concat(all_metadata, ignore_index=True) 902 if len(all_features) == 0: 903 all_features = None 904 else: 905 all_features = pd.concat(all_features, ignore_index=True) 906 907 return EventArray(all_info, all_metadata, all_features) 908 909 def to_events( 910 self, 911 scans: Scan | Iterable[Scan], 912 ignore_missing_scans=True, 913 ignore_metadata=False, 914 ignore_features=False, 915 ) -> list[Event]: 916 """ 917 Get the events in the EventArray as a list of events. Returns [] if empty. 918 :param scans: the scans that the events belong to, auto-matched by slide_id. 919 Pass None if you don't care about scan metadata (pass ignore_missing_scans). 920 :param ignore_missing_scans: whether to create blank scans for events without scans. 921 :param ignore_metadata: whether to ignore metadata or not 922 :param ignore_features: whether to ignore features or not 923 :return: 924 """ 925 if len(self) == 0: 926 return [] 927 if isinstance(scans, Scan): 928 scans = [scans] 929 scans = {scan.slide_id: scan for scan in scans} 930 events = [] 931 for i in range(len(self.info)): 932 # Determine the associated scan 933 slide_id = self.info["slide_id"][i] 934 if slide_id not in scans: 935 if ignore_missing_scans: 936 # Create a placeholder scan if the scan is missing 937 scan = Scan.make_placeholder( 938 slide_id, 939 self.info["tile"][i], 940 self.info["roi"][i], 941 ) 942 else: 943 raise ValueError( 944 f"Scan {self.info['slide_id'][i]} not found for event {i}." 945 ) 946 else: 947 scan = scans[slide_id] 948 949 # Prepare the metadata and features 950 if ignore_metadata or self.metadata is None: 951 metadata = None 952 else: 953 # This Series creation method is less efficient, 954 # but required for preserving dtypes 955 metadata = pd.Series( 956 {col: self.metadata.loc[i, col] for col in self.metadata.columns}, 957 dtype=object, 958 ) 959 if ignore_features or self.features is None: 960 features = None 961 else: 962 features = pd.Series( 963 {col: self.features.loc[i, col] for col in self.features.columns}, 964 dtype=object, 965 ) 966 # Create the event and append it to the list 967 events.append( 968 Event( 969 Tile(scan, self.info["tile"][i], self.info["roi"][i]), 970 self.info["x"][i], 971 self.info["y"][i], 972 metadata=metadata, 973 features=features, 974 ) 975 ) 976 return events 977 978 @classmethod 979 def from_events(cls, events: Iterable[Event]) -> Self: 980 """ 981 Set the events in the EventArray to a new list of events. 982 :param events: the new list of events. 
983 """ 984 info = pd.DataFrame( 985 { 986 "slide_id": [event.tile.scan.slide_id for event in events], 987 "tile": [event.tile.n for event in events], 988 "roi": [event.tile.n_roi for event in events], 989 "x": [event.x for event in events], 990 "y": [event.y for event in events], 991 } 992 ) 993 metadata_list = [event.metadata for event in events] 994 # Iterate through and ensure that all metadata is the same shape 995 for metadata in metadata_list: 996 if type(metadata) != type(metadata_list[0]): 997 raise ValueError("All metadata must be the same type.") 998 if metadata is not None and metadata.shape != metadata_list[0].shape: 999 raise ValueError("All metadata must be the same shape.") 1000 if metadata_list[0] is None: 1001 metadata = None 1002 else: 1003 metadata = pd.DataFrame(metadata_list) 1004 features_list = [event.features for event in events] 1005 # Iterate through and ensure that all features are the same shape 1006 for features in features_list: 1007 if type(features) != type(features_list[0]): 1008 raise ValueError("All features must be the same type.") 1009 if features is not None and features.shape != features_list[0].shape: 1010 raise ValueError("All features must be the same shape.") 1011 if features_list[0] is None: 1012 features = None 1013 else: 1014 features = pd.DataFrame(features_list) 1015 return EventArray(info=info, metadata=metadata, features=features) 1016 1017 def to_dataframe(self) -> pd.DataFrame: 1018 """ 1019 Convert all the data in the EventArray to a single DataFrame. 1020 :return: a DataFrame with all the data in the EventArray. 1021 """ 1022 # Make a copy of the info DataFrame and prepend "info_" to the column names 1023 output = self.info.copy() 1024 # Combine with the metadata and prepend "metadata_" to the column names 1025 if self.metadata is not None: 1026 metadata = self.metadata.copy() 1027 metadata.columns = [f"metadata_{col}" for col in metadata.columns] 1028 output = pd.concat([output, metadata], axis=1) 1029 # Combine with the features and prepend "features_" to the column names 1030 if self.features is not None: 1031 features = self.features.copy() 1032 features.columns = [f"features_{col}" for col in features.columns] 1033 output = pd.concat([output, features], axis=1) 1034 return output 1035 1036 @classmethod 1037 def from_dataframe( 1038 cls, df, metadata_prefix: str = "metadata_", features_prefix: str = "features_" 1039 ) -> Self: 1040 """ 1041 From a single, special DataFrame, create an EventArray. 1042 :param df: the DataFrame to convert to an EventArray. 1043 :param metadata_prefix: the prefix for metadata columns. 1044 :param features_prefix: the prefix for features columns. 1045 :return: a DataFrame with all the data in the EventArray. 
1046 """ 1047 # Split the columns into info, metadata, and features and strip prefix 1048 info = df[[col for col in df.columns if col in cls.INFO_COLUMNS]].copy() 1049 if info.size == 0: 1050 info = None 1051 metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy() 1052 metadata.columns = [ 1053 col.replace(metadata_prefix, "") for col in metadata.columns 1054 ] 1055 if metadata.size == 0: 1056 metadata = None 1057 features = df[[col for col in df.columns if col.startswith("features_")]].copy() 1058 features.columns = [ 1059 col.replace(features_prefix, "") for col in features.columns 1060 ] 1061 if features.size == 0: 1062 features = None 1063 return cls(info=info, metadata=metadata, features=features) 1064 1065 @classmethod 1066 def from_mask( 1067 cls, 1068 mask: np.ndarray, 1069 tile: Tile, 1070 include_cell_id: bool = True, 1071 images: list[np.ndarray] = None, 1072 image_labels: list[str] = None, 1073 properties: list[str] = None, 1074 ) -> Self: 1075 """ 1076 Extract events from a mask DataFrame, including metadata and features. 1077 :param mask: the mask to extract events from. 1078 :param tile: the Tile object associated with this mask. 1079 :param include_cell_id: whether to include the cell_id, or numerical 1080 mask label, as metadata in the EventArray. 1081 :param images: the intensity images to extract features from. 1082 :param image_labels: the labels for the intensity images. 1083 :param properties: list of properties to extract in addition to the defaults: 1084 :return: EventArray corresponding to the mask labels. 1085 """ 1086 if csi_images is None: 1087 raise ModuleNotFoundError( 1088 "imageio libraries not installed! " 1089 "run `pip install csi_images[imageio]` to resolve." 1090 ) 1091 # Gather mask_info 1092 if images is not None and image_labels is not None: 1093 if len(images) != len(image_labels): 1094 raise ValueError("Intensity images and labels must match lengths.") 1095 1096 mask_info = csi_images.extract_mask_info(mask, images, image_labels, properties) 1097 1098 if len(mask_info) == 0: 1099 return EventArray() 1100 1101 # Combine provided info and mask info 1102 info = pd.DataFrame( 1103 { 1104 "slide_id": tile.scan.slide_id, 1105 "tile": tile.n, 1106 "roi": tile.n_roi, 1107 "x": mask_info["x"], 1108 "y": mask_info["y"], 1109 }, 1110 ) 1111 # Extract a metadata column if desired 1112 if include_cell_id: 1113 metadata = pd.DataFrame({"cell_id": mask_info["id"]}) 1114 else: 1115 metadata = None 1116 # If any additional properties were extracted, add them as features 1117 mask_info = mask_info.drop(columns=["id", "x", "y"], errors="ignore") 1118 if len(mask_info.columns) > 0: 1119 features = mask_info 1120 features.columns = [col.lower() for col in features.columns] 1121 else: 1122 features = None 1123 return EventArray(info, metadata, features) 1124 1125 def save_csv(self, output_path: str) -> bool: 1126 """ 1127 Save the events to an CSV file, including metadata and features. 1128 :param output_path: 1129 :return: 1130 """ 1131 if not output_path.endswith(".csv"): 1132 output_path += ".csv" 1133 self.to_dataframe().to_csv(output_path, index=False) 1134 return os.path.exists(output_path) 1135 1136 @classmethod 1137 def load_csv( 1138 cls, 1139 input_path: str, 1140 metadata_prefix: str = "metadata_", 1141 features_prefix: str = "features_", 1142 ) -> Self: 1143 """ 1144 Load the events from an CSV file, including metadata and features. 
        :param input_path:
        :param metadata_prefix:
        :param features_prefix:
        :return:
        """
        # Load the CSV file
        df = pd.read_csv(input_path)
        return cls.from_dataframe(df, metadata_prefix, features_prefix)

    def save_json(self, output_path: str, orient: str = "records") -> bool:
        """
        Save the events to a JSON file, including metadata and features.
        :param output_path:
        :param orient: the orientation of the JSON file, see pandas.DataFrame.to_json()
        :return:
        """
        if not output_path.endswith(".json"):
            output_path += ".json"
        self.to_dataframe().to_json(output_path, orient=orient, indent=2)
        return os.path.exists(output_path)

    @classmethod
    def load_json(
        cls,
        input_path: str,
        metadata_prefix: str = "metadata_",
        features_prefix: str = "features_",
    ) -> Self:
        """
        Load the events from a JSON file, including metadata and features.
        :param input_path:
        :param metadata_prefix:
        :param features_prefix:
        :return:
        """
        # Load the JSON file
        df = pd.read_json(input_path, orient="records")
        return cls.from_dataframe(df, metadata_prefix, features_prefix)

    def save_hdf5(
        self, output_path: str, complevel: int = 1, complib="blosc:zstd"
    ) -> bool:
        """
        Save the events to an HDF5 file, including metadata and features.
        Uses the pandas-provided HDF5 functions for ease, and external compatibility,
        though these files are slightly harder to view in HDFView or similar.
        Compression defaults remain very quick while cutting file size by 50%+.
        :param output_path:
        :param complevel: see pandas.HDFStore for more details.
        :param complib: see pandas.HDFStore for more details.
        :return:
        """
        if not output_path.endswith(".hdf5") and not output_path.endswith(".h5"):
            output_path += ".hdf5"
        # Open the output_path as an HDF5 file
        with pd.HDFStore(
            output_path, mode="w", complevel=complevel, complib=complib
        ) as store:
            # Store the dataframes in the HDF5 file
            if self.info is not None:
                store.put("info", self.info, index=False)
            if self.metadata is not None:
                store.put("metadata", self.metadata, index=False)
            if self.features is not None:
                store.put("features", self.features, index=False)
        return os.path.exists(output_path)

    @classmethod
    def load_hdf5(cls, input_path: str) -> Self:
        """
        Load the events from an HDF5 file, including metadata and features.
        :param input_path:
        :return:
        """
        # Open the input_path as an HDF5 file
        with pd.HDFStore(input_path, "r") as store:
            # Load the dataframes from the HDF5 file
            info = store.get("info") if "info" in store else None
            metadata = store.get("metadata") if "metadata" in store else None
            features = store.get("features") if "features" in store else None
        return cls(info=info, metadata=metadata, features=features)

    def save_ocular(self, output_path: str, event_type: str = "cells"):
        """
        Save the events to an OCULAR file. Relies on the dataframe originating
        from an OCULAR file (same columns; duplicate metadata/info).
        :param output_path:
        :param event_type:
        :return:
        """
        if pyreadr is None:
            raise ModuleNotFoundError(
                "pyreadr not installed! Install pyreadr directly "
                "or run `pip install csi-images[rds]` to resolve."
            )
        if event_type == "cells":
            file_stub = "rc-final"
        elif event_type == "others":
            file_stub = "others-final"
        else:
            raise ValueError("Invalid event type. Must be cells or others.")

        # Ensure good metadata
        metadata = pd.DataFrame(
            {
                "slide_id": self.info["slide_id"],
                "frame_id": self.info["tile"] + 1,  # Convert to 1-indexed for R
                "cell_id": (
                    self.metadata["cell_id"]
                    if "cell_id" in self.metadata.columns
                    else range(len(self.info))
                ),
                "cellx": self.info["x"],
                "celly": self.info["y"],
            }
        )
        if self.metadata is not None:
            metadata[self.metadata.columns] = self.metadata.copy()

        # Check for the "ocular_interesting" column
        if event_type == "cells":
            if "ocular_interesting" in metadata.columns:
                interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool)
            elif "hcpc" in metadata.columns:
                # Interesting cells don't get an hcpc designation, leaving them as -1
                interesting_rows = (
                    metadata["hcpc"].to_numpy() == -1
                )  # interesting cells
            else:
                interesting_rows = []
            if sum(interesting_rows) > 0:
                # Split the metadata into interesting and regular
                interesting_events = self.rows(interesting_rows)
                interesting_df = pd.concat(
                    [interesting_events.features, interesting_events.metadata], axis=1
                )
                data_events = self.rows(~interesting_rows)
                data_df = pd.concat(
                    [data_events.features, data_events.metadata], axis=1
                )
                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")

                # Drop particular columns for "interesting"
                interesting_df = interesting_df.drop(
                    [
                        "clust",
                        "hcpc",
                        "frame_id",
                        "cell_id",
                        "unique_id",
                        "ocular_interesting",
                    ],
                    axis=1,
                    errors="ignore",
                )
                # Save both .csv and .rds
                interesting_stub = os.path.join(output_path, "ocular_interesting")
                interesting_df.to_csv(f"{interesting_stub}.csv")
                # Suppress pandas FutureWarning
                with warnings.catch_warnings():
                    warnings.simplefilter(action="ignore", category=FutureWarning)
                    pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df)
            else:
                data_df = pd.concat([self.features, metadata], axis=1)
        else:
            # Get all data and reset_index (will copy it)
            data_df = pd.concat([self.features, metadata], axis=1)

        # Split based on cluster number to conform to *-final[1-4].rds
        n_clusters = max(data_df["clust"]) + 1
        split_idx = [round(i * n_clusters / 4) for i in range(5)]
        for i in range(4):
            subset = (split_idx[i] <= data_df["clust"]) & (
                data_df["clust"] < split_idx[i + 1]
            )
            data_df.loc[subset, "hcpc"] = i + 1
            subset = data_df[subset].reset_index(drop=True)
            # Suppress pandas FutureWarning
            with warnings.catch_warnings():
                warnings.simplefilter(action="ignore", category=FutureWarning)
                pyreadr.write_rds(
                    os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
                )

        # Create new example cell strings
        data_df["example_cell_id"] = (
            data_df["slide_id"]
            + " "
            + data_df["frame_id"].astype(str)
            + " "
            + data_df["cell_id"].astype(str)
            + " "
            + data_df["cellx"].astype(int).astype(str)
            + " "
            + data_df["celly"].astype(int).astype(str)
        )
        # Find averagable data columns
        if "cellcluster_id" in data_df.columns:
            end_idx = data_df.columns.get_loc("cellcluster_id")
        else:
            end_idx = data_df.columns.get_loc("slide_id")
        avg_cols = data_df.columns[:end_idx].tolist()
        # Group by cluster and average
        data_df = data_df.groupby("clust").agg(
            **{col: (col, "mean") for col in avg_cols},
            count=("clust", "size"),  # count rows in each cluster
            example_cells=("example_cell_id", lambda x: ",".join(x)),
            hcpc=("hcpc", lambda x: x.iloc[0]),
        )
        data_df = data_df.reset_index()  # Do NOT drop, index is "clust"
        # Create new columns
        metadata = pd.DataFrame(
            {
                "count": data_df["count"],
                "example_cells": data_df["example_cells"],
                "clust": data_df["clust"].astype(int),
                "hcpc": data_df["hcpc"].astype(int),
                "id": data_df["clust"].astype(int).astype(str),
                "cccluster": "0",  # Dummy value
                "ccdistance": 0.0,  # Dummy value
                "rownum": list(range(len(data_df))),
                "framegroup": 0,  # Dummy value
            }
        )
        # Need to pad the features to 761 columns, as per OCULAR report needs
        additional_columns = range(len(avg_cols), 761)
        if len(additional_columns) > 0:
            padding = pd.DataFrame(
                np.zeros((len(data_df), len(additional_columns))),
                columns=[f"pad{i}" for i in additional_columns],
            )
            data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1)
        else:
            data_df = pd.concat([data_df[avg_cols], metadata], axis=1)

        # Save the cluster data
        data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"))
        # Suppress pandas FutureWarning
        with warnings.catch_warnings():
            warnings.simplefilter(action="ignore", category=FutureWarning)
            pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)

    @classmethod
    def load_ocular(
        cls,
        input_path: str,
        event_type="cells",
        cell_data_files=(
            "rc-final1.rds",
            "rc-final2.rds",
            "rc-final3.rds",
            "rc-final4.rds",
            "ocular_interesting.rds",
        ),
        others_data_files=(
            "others-final1.rds",
            "others-final2.rds",
            "others-final3.rds",
            "others-final4.rds",
        ),
        atlas_data_files=(
            "ocular_interesting.rds",
            "ocular_not_interesting.rds",
        ),
        drop_common_events=True,
    ) -> Self:
        """
        Load events from OCULAR output files (.rds), including metadata and features.
        :param input_path:
        :param event_type:
        :param cell_data_files:
        :param others_data_files:
        :param atlas_data_files:
        :param drop_common_events:
        :return:
        """
        if pyreadr is None:
            raise ModuleNotFoundError(
                "pyreadr not installed! Install pyreadr directly "
                "or run `pip install csi-images[rds]` to resolve."
            )
        # Check if the input path is a directory or a file
        if os.path.isfile(input_path):
            data_files = [os.path.basename(input_path)]
            input_path = os.path.dirname(input_path)
        elif event_type == "cells":
            data_files = cell_data_files
        elif event_type == "others":
            data_files = others_data_files
        else:
            raise ValueError("Invalid event type.")

        # Load the data from the OCULAR files
        file_data = {}
        for file in data_files:
            file_path = os.path.join(input_path, file)
            if not os.path.isfile(file_path):
                warnings.warn(f"{file} not found in {input_path}")
                continue
            file_data[file] = pyreadr.read_r(file_path)
            # Get the DataFrame associated with None (pyreadr dict quirk)
            file_data[file] = file_data[file][None]
            if len(file_data[file]) == 0:
                # File gets dropped from the dict
                file_data.pop(file)
                warnings.warn(f"{file} has no cells")
                continue

            # Drop common cells if requested and in this file
            if (
                file in atlas_data_files
                and drop_common_events
                and "catalogue_classification" in file_data[file]
            ):
                common_cell_indices = (
                    file_data[file]["catalogue_classification"] == "common_cell"
                )
                file_data[file] = file_data[file][common_cell_indices == False]

            if len(file_data[file]) == 0:
                # File gets dropped from the dict
                file_data.pop(file)
                warnings.warn(f"{file} has no cells after dropping common cells")
                continue

            # Extract frame_id and cell_id
            # DAPI- events already have frame_id cell_id outside rowname
            if event_type == "cells" and "frame_id" not in file_data[file].columns:
                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
                # get frame_id cell_id from rownames column and split into two columns
                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
                if len(split_res.columns) != 2:
                    warnings.warn(
                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
                    )
                # then assign it back to the dataframe
                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
            # Ensure frame_id and cell_id are integers
            file_data[file]["frame_id"] = file_data[file]["frame_id"].astype("int")
            file_data[file]["cell_id"] = file_data[file]["cell_id"].astype("int")
            # reset indexes since they can cause NaN values in concat
            file_data[file] = file_data[file].reset_index(drop=True)

        # Merge the data from all files
        if len(file_data) == 0:
            return EventArray()
        elif len(file_data) == 1:
            data = [file_data[file] for file in file_data.keys()][0]
        else:
            data = pd.concat(file_data.values())

        # Others is missing the "slide_id". Insert it right before "frame_id" column
        if event_type == "others" and "slide_id" not in data.columns:
            if os.path.basename(input_path) == "ocular":
                slide_id = os.path.basename(os.path.dirname(input_path))
            else:
                slide_id = "UNKNOWN"
            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)

        # Sort according to ascending cell_id to keep the original, which is in manual_df
        data = data.sort_values(by=["cell_id"], ascending=True)
        # Filter out duplicates by x & y
        data = data.assign(
            unique_id=data["slide_id"]
            + "_"
            + data["frame_id"].astype(str)
            + "_"
            + data["cellx"].astype(int).astype(str)
            + "_"
            + data["celly"].astype(int).astype(str)
        )
        data = data.drop_duplicates(subset=["unique_id"], keep="first")
        # Normal unique_id is with cell_id
        data = data.assign(
            unique_id=data["slide_id"]
            + "_"
            + data["frame_id"].astype(str)
            + "_"
            + data["cell_id"].astype(str)
        )
        data = data.reset_index(drop=True)
        # All columns up to "slide_id" are features; drop the "slide_id"
        features = data.loc[:, :"slide_id"].iloc[:, :-1]
        data = data.loc[:, "slide_id":]
        # Grab the info columns
        info = data[["slide_id", "frame_id", "cellx", "celly"]]
        info.columns = ["slide_id", "tile", "x", "y"]
        info = info.assign(roi=0)  # OCULAR only works on 1 ROI, as far as known
        info = info[["slide_id", "tile", "roi", "x", "y"]]
        # Metadata has duplicate columns for later convenience
        metadata = data
        # Certain columns tend to be problematic with mixed data formats...
        for col in ["TRITC", "CY5", "FITC"]:
            if col in metadata:
                labels = {
                    "False": False,
                    "True": True,
                    "FALSE": False,
                    "TRUE": True,
                    False: False,
                    True: True,
                }
                metadata[col] = metadata[col].map(labels).astype(bool)
        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
            if col in metadata:
                metadata[col] = metadata[col].fillna(-1).astype(int)
        info["tile"] = info["tile"] - 1  # Convert to 0-based indexing
        return EventArray(info, metadata, features)
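For bulk workflows, EventArray (defined above) holds the same data column-wise. Below is a minimal sketch of building one from an info table and round-tripping it through CSV; the slide ID, coordinates, and file name are illustrative, and only the constructor, save_csv, and load_csv shown in the source are used.

import pandas as pd
from csi_images.csi_events import EventArray

# Illustrative info table; "roi" is filled in with 0 automatically if omitted
info = pd.DataFrame(
    {
        "slide_id": ["EXAMPLE_SLIDE"] * 3,
        "tile": [100, 100, 101],
        "x": [250, 300, 40],
        "y": [300, 120, 80],
    }
)
events = EventArray(info, metadata=pd.DataFrame({"cell_id": [1, 2, 3]}))

# Saved columns are the info columns plus "metadata_cell_id"
events.save_csv("example_events.csv")
restored = EventArray.load_csv("example_events.csv")
assert restored == events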
A class that represents a single event in a scan, making it easy to evaluate singular events. Required metadata is exposed as attributes, and optional metadata and features are stored as DataFrames.
Homogeneous transformation matrices for converting between scanner and slide coordinates. The matrices are 3x3, with the final column representing the translation in micrometers (um). For more information, see affine transformations.
Transformations are nominal, and accuracy is not guaranteed; this is due to imperfections in slides and alignment in the scanners. Units are in micrometers.
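A minimal construction sketch; the scan object, tile index, and pixel coordinates are placeholders (loading a Scan is covered in the csi_utils.csi_scans documentation), and the Tile(scan, tile_index, roi) argument order is assumed from this module's own calls:

from csi_images.csi_tiles import Tile
from csi_images.csi_events import Event

scan = ...                          # a Scan object (see csi_images.csi_scans); loading elided
tile = Tile(scan, 0, 0)             # tile index 0 in ROI 0 (assumed argument order)
event = Event(tile, x=250, y=300)   # pixel position within the tile's frame image
print(event)                        # "<tile>-<x>-<y>", also used for crop/montage file names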
101 def get_scan_position(self) -> tuple[float, float]: 102 """ 103 Get the position of the event in the scanner's coordinate frame. 104 :return: the scan position of the event in micrometers (um). 105 """ 106 # Get overall pixel position 107 real_tile_height, real_tile_width = self.tile.scan.get_image_size() 108 pixel_x = self.x + (real_tile_width * self.tile.x) 109 pixel_y = self.y + (real_tile_height * self.tile.y) 110 # Convert to micrometers 111 x_um = pixel_x * self.tile.scan.pixel_size_um 112 y_um = pixel_y * self.tile.scan.pixel_size_um 113 # Add the scan's origin in the scanner frame 114 x_um += self.tile.scan.roi[self.tile.n_roi].origin_x_um 115 y_um += self.tile.scan.roi[self.tile.n_roi].origin_y_um 116 return x_um, y_um
Get the position of the event in the scanner's coordinate frame.
Returns
the scan position of the event in micrometers (um).
118 def get_slide_position(self) -> tuple[float, float]: 119 """ 120 Get the slide position of the event in micrometers (um). 121 :return: the slide position of the event. 122 """ 123 # Turn scan_position into a 3x1 vector 124 scan_position = self.get_scan_position() 125 scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]]) 126 127 # Multiply by the appropriate homogeneous matrix 128 if self.tile.scan.scanner_id.startswith(self.tile.scan.Type.AXIOSCAN7.value): 129 transform = self.SCAN_TO_SLIDE_TRANSFORM[self.tile.scan.Type.AXIOSCAN7] 130 elif self.tile.scan.scanner_id.startswith(self.tile.scan.Type.BZSCANNER.value): 131 transform = self.SCAN_TO_SLIDE_TRANSFORM[self.tile.scan.Type.BZSCANNER] 132 else: 133 raise ValueError(f"Scanner type {self.tile.scan.scanner_id} not supported.") 134 slide_position = np.matmul(transform, scan_position) 135 return float(slide_position[0][0]), float(slide_position[1][0])
Get the slide position of the event in micrometers (um).
Returns
the slide position of the event.
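Continuing the sketch above, the two coordinate getters can be compared directly; the actual values depend on the scan's pixel size, ROI origin, and scanner type:

scan_x, scan_y = event.get_scan_position()      # scanner frame, micrometers
slide_x, slide_y = event.get_slide_position()   # slide frame, micrometers
print(f"scan: ({scan_x:.1f}, {scan_y:.1f}) um; slide: ({slide_x:.1f}, {slide_y:.1f}) um")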
137 def crop( 138 self, images: Iterable[np.ndarray], crop_size: int = 100, in_pixels: bool = True 139 ) -> list[np.ndarray]: 140 """ 141 Crop the event from the provided frame images. Use if you have already gotten 142 frame images; useful for cropping multiple events from the same frame image. 143 :param images: the frame images. 144 :param crop_size: the square size of the image crop to get for this event. 145 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 146 :return: image_size x image_size crops of the event in the provided frames. If 147 the event is too close to the edge, the crop will be smaller and not centered. 148 """ 149 # Convert a crop size in micrometers to pixels 150 if not in_pixels: 151 crop_size = round(crop_size / self.tile.scan.pixel_size_um) 152 image_height, image_width = 0, 0 153 for image in images: 154 if image_height == 0 and image_width == 0: 155 image_height, image_width = image.shape 156 else: 157 if image_height != image.shape[0] or image_width != image.shape[1]: 158 raise ValueError("All images must be the same size") 159 if image_height == 0 or image_width == 0: 160 raise ValueError("No images provided") 161 162 # Find the crop bounds 163 bounds = [ 164 self.x - (crop_size // 2) + 1, 165 self.y - (crop_size // 2) + 1, 166 self.x + math.ceil(crop_size / 2) + 1, 167 self.y + math.ceil(crop_size / 2) + 1, 168 ] 169 # Determine how much the bounds violate the image size 170 displacements = [ 171 max(0, -bounds[0]), 172 max(0, -bounds[1]), 173 max(0, bounds[2] - image_width), 174 max(0, bounds[3] - image_height), 175 ] 176 # Cap off the bounds 177 bounds = [ 178 max(0, bounds[0]), 179 max(0, bounds[1]), 180 min(image_width, bounds[2]), 181 min(image_height, bounds[3]), 182 ] 183 184 # Crop the images 185 crops = [] 186 for image in images: 187 # Create a blank image of the right size 188 crop = np.zeros((crop_size, crop_size), dtype=image.dtype) 189 190 # Insert the cropped image into the blank image, leaving a black buffer 191 # around the edges if the crop would go beyond the original image bounds 192 crop[ 193 displacements[1] : crop_size - displacements[3], 194 displacements[0] : crop_size - displacements[2], 195 ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]] 196 crops.append(crop) 197 return crops
Crop the event from the provided frame images. Use this if you have already retrieved the frame images; useful for cropping multiple events from the same frame images.
Parameters
- images: the frame images.
- crop_size: the square size of the image crop to get for this event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns
crop_size x crop_size crops of the event in the provided frames. If the event is too close to the edge, the crop is padded with black (zeros) on that side and the event will not be centered.
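A sketch of cropping several events that share a tile while reading the frame images only once; `tile` is from the earlier construction sketch, `events_in_tile` is assumed, and the Frame calls mirror how this module's own helpers use them:

from csi_images.csi_frames import Frame

frames = Frame.get_frames(tile, None)                # frames for all channels of this tile
images = [f.get_image(None, True) for f in frames]   # read once: default path, apply gain
for event in events_in_tile:                         # assumed list of events on this tile
    crops = event.crop(images, crop_size=50)         # 50 x 50 pixel crop per channel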
199 def get_crops( 200 self, 201 crop_size: int = 100, 202 in_pixels: bool = True, 203 input_path: str = None, 204 channels: Iterable[int | str] = None, 205 apply_gain: bool | Iterable[bool] = True, 206 ) -> list[np.ndarray]: 207 """ 208 Gets the frame images for this event and then crops the event from the images. 209 Convenient for retrieving a single event's crops, but less efficient when 210 retrieving multiple events from the same tile as it will reread the images. 211 :param crop_size: the square size of the image crop to get for this event. 212 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 213 :param input_path: the path to the input images. Defaults to None (uses the scan's path). 214 :param channels: the channels to extract images for. Defaults to all channels. 215 :param apply_gain: whether to apply scanner-calculated gain to the images, if 216 not already applied. If a list, matches the channels. 217 :return: a list of cropped images from the scan in the order of the channels. 218 """ 219 # This function validates channels 220 frames = Frame.get_frames(self.tile, channels) 221 # Convert individual inputs to lists of appropriate length 222 if isinstance(apply_gain, bool): 223 apply_gain = [apply_gain] * len(frames) 224 images = [f.get_image(input_path, a) for f, a in zip(frames, apply_gain)] 225 return self.crop(images, crop_size, in_pixels)
Gets the frame images for this event and then crops the event from the images. Convenient for retrieving a single event's crops, but less efficient when retrieving multiple events from the same tile as it will reread the images.
Parameters
- crop_size: the square size of the image crop to get for this event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
- input_path: the path to the input images. Defaults to None (uses the scan's path).
- channels: the channels to extract images for. Defaults to all channels.
- apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. If a list, matches the channels.
Returns
a list of cropped images from the scan in the order of the channels.
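For a single event, get_crops() handles the frame reads itself; the channel names here ("DAPI", "CY5") are examples and must exist in the scan:

crops = event.get_crops(crop_size=100, channels=["DAPI", "CY5"])  # `event` as above
dapi_crop, cy5_crop = crops      # one array per requested channel, in request order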
227 def save_crops( 228 self, 229 crops: Sequence[np.ndarray], 230 output_path: str, 231 labels: Sequence[str], 232 ext: str = "auto", 233 ): 234 """ 235 Save the crops to image files. 236 :param crops: the crops to save. Will save as RGB if 3 channel [h, w, 3] or 237 grayscale if 1 channel [h, w] or [h, w, 1]. 238 :param labels: the labels to append to the file name, usually the channel names 239 associated with each crop. 240 :param output_path: the folder to save the crops to. Will make if needed. 241 :param ext: the file extension to save the crops as. Defaults to "auto", which 242 will save as .tif for grayscale images and .jpg for RGB images. 243 :return: None 244 """ 245 if len(crops) != len(labels): 246 raise ValueError("Crops and labels must be the same length") 247 248 if csi_images is None or imageio is None: 249 raise ModuleNotFoundError( 250 "imageio libraries not installed! " 251 "run `pip install csi_images[imageio]` to resolve." 252 ) 253 254 os.makedirs(output_path, exist_ok=True) 255 256 for crop, label in zip(crops, labels): 257 if ext == "auto": 258 if len(crop.shape) == 2 or crop.shape[2] == 1: 259 file_extension = ".tif" 260 elif crop.shape[2] == 3: 261 file_extension = ".jpg" 262 else: 263 warnings.warn( 264 f"Image shape {crop.shape} not recognized; saving as .tif" 265 ) 266 file_extension = ".tif" 267 else: 268 file_extension = ext 269 file = os.path.join(output_path, f"{self}-{label}{file_extension}") 270 # TODO: add more file types here 271 if file_extension == ".tif": 272 imageio.imwrite(file, crop, compression="deflate") 273 elif file_extension in [".jpg", ".jpeg"]: 274 crop = csi_images.scale_bit_depth(crop, np.uint8) 275 imageio.imwrite(file, crop, quality=80) 276 else: 277 imageio.imwrite(file, crop)
Save the crops to image files.
Parameters
- crops: the crops to save. Will save as RGB if 3 channel [h, w, 3] or grayscale if 1 channel [h, w] or [h, w, 1].
- labels: the labels to append to the file name, usually the channel names associated with each crop.
- output_path: the folder to save the crops to. Will make if needed.
- ext: the file extension to save the crops as. Defaults to "auto", which will save as .tif for grayscale images and .jpg for RGB images.
Returns
None
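A save sketch using the same example channels; grayscale crops default to .tif, and file names are derived from the event's repr:

crops = event.get_crops(channels=["DAPI", "CY5"])
event.save_crops(crops, "/tmp/crops", labels=["DAPI", "CY5"])
# Writes /tmp/crops/<tile>-<x>-<y>-DAPI.tif and ...-CY5.tif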
279 def load_crops( 280 self, input_path: str, labels: list[str] = None 281 ) -> dict[str, np.ndarray]: 282 """ 283 Loads previously saved crop files from a folder. 284 :param input_path: folder containing crop files. 285 :param labels: optional label filter, will only return crops with these labels. 286 :return: a tuple of lists containing the crops and their labels. 287 """ 288 crops = {} 289 for file in glob.glob(os.path.join(input_path, f"{self}-*")): 290 label = os.path.splitext(os.path.basename(file))[0].split("-")[-1] 291 # Skip if we have labels to target 292 if labels is not None and label not in labels: 293 continue 294 crops[label] = imageio.imread(file) 295 return crops
Loads previously saved crop files from a folder.
Parameters
- input_path: folder containing crop files.
- labels: optional label filter, will only return crops with these labels.
Returns
a dictionary mapping labels to crop images.
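Loading back the crops saved above; the optional label filter limits which files are read:

all_crops = event.load_crops("/tmp/crops")            # {"DAPI": array, "CY5": array, ...}
dapi_only = event.load_crops("/tmp/crops", ["DAPI"])  # only the DAPI crop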
297 def get_montage_channels( 298 self, 299 channels: Sequence[int | str] | None = None, 300 composites: dict[int | str, tuple[float, float, float]] | None = None, 301 ) -> tuple[list[int], list[int], dict[int, tuple[float, float, float]]]: 302 """ 303 Get the channel names for the montage from the event's tile. 304 :param channels: channel indices or names for grayscale channels 305 :param composites: dictionary of channel indices or names and RGB values 306 :return: (1) channel indices to retrieve, 307 (2) relative grayscale channel indices, and 308 (3) composite channel indices and RGB values. 309 """ 310 if channels is None: 311 channels = list(range(len(self.tile.scan.channels))) 312 if (len(channels) == 0) and (composites is None or len(composites) == 0): 313 raise ValueError("Must provide at least one channel type to montage") 314 315 channels_to_get = [] 316 317 # Build the list of channels to retrieve 318 if channels is not None: 319 if isinstance(channels[0], str): 320 channels = self.tile.scan.get_channel_indices(channels) 321 channels_to_get += channels 322 order = list(range(len(channels))) # Always the first n channels 323 else: 324 order = None 325 326 if composites is not None: 327 relative_composites = {} # Relative indices for retrieved channels 328 # Convert to scan indices 329 rgb_channels = list(composites.keys()) 330 if isinstance(rgb_channels[0], str): 331 rgb_channels = self.tile.scan.get_channel_indices(rgb_channels) 332 # Find the index or add to the end 333 for channel, rgb in zip(rgb_channels, composites.values()): 334 if channel not in channels_to_get: 335 channels_to_get.append(channel) 336 relative_composites[channel] = rgb 337 else: 338 relative_composites[channels_to_get.index(channel)] = rgb 339 else: 340 relative_composites = None 341 342 return channels_to_get, order, relative_composites
Resolve the requested montage channels into scan channel indices using the event's tile.
Parameters
- channels: channel indices or names for grayscale channels
- composites: dictionary of channel indices or names and RGB values
Returns
(1) channel indices to retrieve, (2) relative grayscale channel indices, and (3) composite channel indices and RGB values.
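A resolution sketch with example channel names; the returned triple is what get_montage() and get_many_montages() feed to csi_images.make_montage():

to_get, gray_order, rgb_map = event.get_montage_channels(
    channels=["DAPI"],
    composites={"TRITC": (1.0, 0.0, 0.0), "CY5": (0.0, 1.0, 0.0)},
)
# to_get: scan channel indices to read; gray_order: which of those are grayscale
# panels; rgb_map: composite channels mapped to their RGB weights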
344 def get_montage( 345 self, 346 channels: Sequence[int | str] = None, 347 composites: dict[int | str, tuple[float, float, float]] = None, 348 mask: np.ndarray[np.uint8] = None, 349 labels: Sequence[str] = None, 350 crop_size: int = 100, 351 in_pixels: bool = True, 352 input_path: str = None, 353 apply_gain: bool = True, 354 **kwargs, 355 ) -> np.ndarray: 356 """ 357 Convenience function for getting frame images and creating a montage. Mirrors 358 csi_images.make_montage(). Convenient for a single event's montage, but less 359 efficient when for multiple events from the same tile. 360 :param channels: the channels to use for black-and-white montages. 361 :param composites: dictionary of indices and RGB tuples for a composite. 362 :param mask: a mask to apply to the montage. Must be the same size as the crop. 363 :param labels: the labels to subtitle montage images, usually the channel names 364 :param crop_size: the square size of the image crop to get for this event. 365 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 366 :param input_path: the path to the input images. Defaults to None (uses the scan's path). 367 :param apply_gain: whether to apply scanner-calculated gain to the images, if 368 not already applied. If a list, matches the channels. 369 :param kwargs: montage options. See csi_images.make_montage() for more details. 370 :return: numpy array representing the montage. 371 """ 372 channels, order, composites = self.get_montage_channels(channels, composites) 373 images = self.get_crops(crop_size, in_pixels, input_path, channels, apply_gain) 374 return csi_images.make_montage( 375 images, order, composites, mask, labels, **kwargs 376 )
Convenience function for getting frame images and creating a montage. Mirrors csi_images.make_montage(). Convenient for a single event's montage, but less efficient when creating montages for multiple events from the same tile.
Parameters
- channels: the channels to use for black-and-white montages.
- composites: dictionary of indices and RGB tuples for a composite.
- mask: a mask to apply to the montage. Must be the same size as the crop.
- labels: the labels to subtitle montage images, usually the channel names
- crop_size: the square size of the image crop to get for this event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
- input_path: the path to the input images. Defaults to None (uses the scan's path).
- apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. If a list, matches the channels.
- kwargs: montage options. See csi_images.make_montage() for more details.
Returns
numpy array representing the montage.
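A montage sketch with example channels: a grayscale DAPI panel plus a red/green TRITC/CY5 composite; this requires the optional dependencies (pip install csi_images[imageio]):

montage = event.get_montage(
    channels=["DAPI"],                                               # grayscale panel(s)
    composites={"TRITC": (1.0, 0.0, 0.0), "CY5": (0.0, 1.0, 0.0)},   # RGB overlay
    crop_size=75,
)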
378 def save_montage( 379 self, 380 montage: np.ndarray, 381 output_path: str, 382 ocular_names: bool = False, 383 tag: str = "", 384 file_extension: str = ".jpeg", 385 **kwargs, 386 ): 387 """ 388 Save the montage as a JPEG image with a set name. 389 :param montage: the montage to save. 390 :param output_path: the folder to save the montage in. Will make if needed. 391 :param ocular_names: whether to use the OCULAR naming convention. 392 :param tag: a tag to append to the file name. Ignored if ocular_names is True. 393 :param file_extension: the file extension to save the montage as. Defaults to .jpeg. 394 :param kwargs: additional arguments to pass to imageio.imwrite(). 395 :return: None 396 """ 397 if csi_images is None or imageio is None: 398 raise ModuleNotFoundError( 399 "imageio libraries not installed! " 400 "run `pip install csi_images[imageio]` to resolve." 401 ) 402 403 montage = csi_images.scale_bit_depth(montage, np.uint8) 404 405 if not file_extension.startswith("."): 406 file_extension = f".{file_extension}" 407 408 if ocular_names: 409 if "cell_id" not in self.metadata.index: 410 raise ValueError( 411 "Event metadata must include 'cell_id' for OCULAR naming." 412 ) 413 file = f"{self.tile.n}-{self.metadata['cell_id']}-{self.x}-{self.y}{file_extension}" 414 else: 415 file = f"{self}{tag}{file_extension}" 416 417 os.makedirs(output_path, exist_ok=True) 418 imageio.imwrite(os.path.join(output_path, file), montage, **kwargs)
Save the montage as an image file (JPEG by default) with a standardized name.
Parameters
- montage: the montage to save.
- output_path: the folder to save the montage in. Will make if needed.
- ocular_names: whether to use the OCULAR naming convention.
- tag: a tag to append to the file name. Ignored if ocular_names is True.
- file_extension: the file extension to save the montage as. Defaults to .jpeg.
- kwargs: additional arguments to pass to imageio.imwrite().
Returns
None
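Saving the montage above; with ocular_names=False the file name is built from the event's repr plus the tag:

event.save_montage(montage, "/tmp/montages", tag="-gallery")
# Writes /tmp/montages/<tile>-<x>-<y>-gallery.jpeg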
420 def load_montage(self, input_path: str, tag: str = "") -> np.ndarray: 421 """ 422 Loads the montage from a file saved by Event.save_montage. 423 :param input_path: the path to the folder where the montage was saved. 424 :param tag: a string to add to the file name, before the extension. 425 :return: 426 """ 427 file = f"{self}{tag}.jpeg" 428 return imageio.imread(os.path.join(input_path, file))
Loads the montage from a file saved by Event.save_montage.
Parameters
- input_path: the path to the folder where the montage was saved.
- tag: a string to add to the file name, before the extension.
Returns
the montage image as a numpy array.
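Reloading uses the same naming scheme; note that load_montage() assumes the default .jpeg extension:

montage = event.load_montage("/tmp/montages", tag="-gallery")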
430 @classmethod 431 def get_many_crops( 432 cls, 433 events: Sequence[Self], 434 crop_size: int | Sequence[int] = 100, 435 in_pixels: bool = True, 436 input_path: str | Sequence[str] = None, 437 channels: Sequence[int | str] = None, 438 apply_gain: bool | Sequence[bool] = True, 439 ) -> list[list[np.ndarray]]: 440 """ 441 Get the crops for a list of events, ensuring that there is no wasteful reading 442 of the same tile multiple times. This function is more efficient than calling 443 get_crops() for each event. 444 :param events: the events to get crops for. 445 :param crop_size: the square size of the image crop to get for this event. 446 Defaults to four times the size of the event. 447 :param in_pixels: whether the crop size is in pixels or micrometers. 448 Defaults to pixels, and is ignored if crop_size is None. 449 :param input_path: the path to the input images. Will only work for lists of events 450 from the same scan. Defaults to None (uses the scan's path). 451 :param channels: the channels to extract images for. Defaults to all channels. 452 :param apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. Defaults to True. 453 Can be supplied as a list to apply gain to individual channels. 454 :return: a list of lists of cropped images for each event. 455 """ 456 if len(events) == 0: 457 return [] 458 # Adapt singular inputs to lists of appropriate length 459 if isinstance(crop_size, int): 460 crop_size = [crop_size] * len(events) 461 if input_path is None or isinstance(input_path, str): 462 input_path = [input_path] * len(events) 463 464 # Get the order of the events when sorted by slide/tile 465 order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__())) 466 467 # Allocate the list to size 468 crops = [[]] * len(events) 469 last_tile = None 470 images = None # Holds large numpy arrays, so expensive to compare 471 # Iterate through in slide/tile sorted order 472 for i in order: 473 if last_tile != events[i].tile: 474 # Gather the frame images, preserving them for the next event 475 frames = Frame.get_frames(events[i].tile, channels) 476 if isinstance(apply_gain, bool): 477 apply = [apply_gain] * len(frames) 478 else: 479 apply = apply_gain 480 images = [f.get_image(input_path[i], a) for f, a in zip(frames, apply)] 481 last_tile = events[i].tile 482 # Use the frame images to crop the event images 483 crops[i] = events[i].crop(images, crop_size[i], in_pixels) 484 return crops
Get the crops for a list of events, ensuring that there is no wasteful reading of the same tile multiple times. This function is more efficient than calling get_crops() for each event.
Parameters
- events: the events to get crops for.
- crop_size: the square size of the image crop to get for each event; a single int applies to all events. Defaults to 100 pixels.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
- input_path: the path to the input images. Will only work for lists of events from the same scan. Defaults to None (uses the scan's path).
- channels: the channels to extract images for. Defaults to all channels.
- apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. Defaults to True. Can be supplied as a list to apply gain to individual channels.
Returns
a list of lists of cropped images for each event.
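A batch sketch; `events` (assumed) may span multiple tiles of one scan, and frames are read tile by tile rather than per event:

all_crops = Event.get_many_crops(events, crop_size=100, channels=["DAPI"])
for event, crops in zip(events, all_crops):
    print(event, crops[0].shape)   # one array per requested channel per event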
486 @classmethod 487 def get_many_montages( 488 cls, 489 events: Sequence[Self], 490 channels: Sequence[int | str] = None, 491 composites: dict[int | str, tuple[float, float, float]] = None, 492 masks: Sequence[np.ndarray[np.uint8]] = None, 493 labels: Sequence[str] = None, 494 crop_size: int = 100, 495 in_pixels: bool = True, 496 input_path: str = None, 497 apply_gain: bool | Iterable[bool] = True, 498 **kwargs, 499 ) -> list[np.ndarray]: 500 """ 501 Convenience function for get_montage(), but for a list of events. More efficient 502 than get_montage() when working with multiple events from the same tile. 503 :param events: a list of Event objects. 504 :param channels: the channels to extract images for. Defaults to all channels. 505 :param composites: dictionary of indices and RGB tuples for a composite. 506 :param masks: a list of masks to apply to the montages. Must be the same size as the crops. 507 :param labels: the labels to subtitle montage images, usually the channel names 508 :param crop_size: the square size of the image crop to get for this event. 509 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 510 :param input_path: the path to the input images. Defaults to None (uses the scan's path). 511 :param apply_gain: whether to apply scanner-calculated gain to the images, if 512 not already applied. If a list, matches the channels. 513 :param kwargs: montage options. See csi_images.make_montage() for more details. 514 :return: a list of numpy arrays representing the montages. 515 """ 516 if len(events) == 0: 517 return [] 518 # Adapt singular inputs to lists of appropriate length 519 if isinstance(crop_size, int): 520 crop_size = [crop_size] * len(events) 521 if input_path is None or isinstance(input_path, str): 522 input_path = [input_path] * len(events) 523 if masks is None or isinstance(masks, np.ndarray): 524 masks = [masks] * len(events) 525 526 # Get the order of the events when sorted by slide/tile 527 event_order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__())) 528 529 # Allocate the list to size 530 montages = [np.empty(0)] * len(events) 531 # Placeholder variables to avoid rereading the same tile 532 images = None # Holds large numpy arrays, so expensive to compare 533 order = None 534 rel_composites = None 535 last_tile = None 536 # Iterate through in slide/tile sorted order 537 for i in event_order: 538 if last_tile != events[i].tile: 539 channels_to_get, order, rel_composites = events[i].get_montage_channels( 540 channels, composites 541 ) 542 # Gather the frame images, preserving them for the next event 543 frames = Frame.get_frames(events[i].tile, channels_to_get) 544 if isinstance(apply_gain, bool): 545 apply = [apply_gain] * len(frames) 546 else: 547 apply = apply_gain 548 images = [f.get_image(input_path[i], a) for f, a in zip(frames, apply)] 549 last_tile = events[i].tile 550 # Use the frame images to crop the event images and make montages 551 crops = events[i].crop(images, crop_size[i], in_pixels) 552 montages[i] = csi_images.make_montage( 553 crops, order, rel_composites, masks[i], labels, **kwargs 554 ) 555 556 return montages
Convenience function for get_montage(), but for a list of events. More efficient than get_montage() when working with multiple events from the same tile.
Parameters
- events: a list of Event objects.
- channels: the channels to extract images for. Defaults to all channels.
- composites: dictionary of indices and RGB tuples for a composite.
- masks: a list of masks to apply to the montages. Must be the same size as the crops.
- labels: the labels to subtitle montage images, usually the channel names
- crop_size: the square size of the image crop to get for this event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
- input_path: the path to the input images. Defaults to None (uses the scan's path).
- apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. If a list, matches the channels.
- kwargs: montage options. See csi_images.make_montage() for more details.
Returns
a list of numpy arrays representing the montages.
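The batched montage call mirrors get_montage() but shares frame reads across events on the same tile; channel names are again examples:

montages = Event.get_many_montages(
    events,
    channels=["DAPI"],
    composites={"TRITC": (1.0, 0.0, 0.0), "CY5": (0.0, 1.0, 0.0)},
    crop_size=100,
)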
558 @classmethod 559 def get_and_save_many_crops( 560 cls, 561 events: list[Self], 562 output_path: str, 563 labels: Sequence[str], 564 ext: str = "auto", 565 additional_gain: Sequence[float] = None, 566 **kwargs, 567 ) -> None: 568 """ 569 Get and save the crops for a list of events, ensuring that there is no wasteful 570 reading and limiting the image data in memory to 1 tile at a time. This function 571 is more efficient that chaining get_crops() and save_crops() for each event or 572 get_many_crops() and then save_crops(). 573 :param events: list of events to get, crop, and save. 574 :param output_path: the folder to save the crops in. Will make if needed. 575 :param labels: the labels to save the crops with. See save_crops(). 576 :param ext: the file extension to save the crops as. See save_crops(). 577 :param additional_gain: additional gain to apply to the crops. If not None, must 578 match the length of the number of crop channels. 579 :param kwargs: see get_many_crops() for more parameters. 580 :return: 581 """ 582 unique_tiles = set([event.tile for event in events]) 583 584 for tile in unique_tiles: 585 # Get one tile's worth of event crops 586 tile_events = [e for e in events if e.tile == tile] 587 crops_list = cls.get_many_crops(tile_events, **kwargs) 588 for event, crops in zip(tile_events, crops_list): 589 # Apply any additional gains 590 if additional_gain is not None: 591 crops = [gain * crop for gain, crop in zip(additional_gain, crops)] 592 event.save_crops(crops, output_path, labels, ext)
Get and save the crops for a list of events, ensuring that there is no wasteful reading and limiting the image data in memory to 1 tile at a time. This function is more efficient than chaining get_crops() and save_crops() for each event, or than get_many_crops() followed by save_crops().
Parameters
- events: list of events to get, crop, and save.
- output_path: the folder to save the crops in. Will make if needed.
- labels: the labels to save the crops with. See save_crops().
- ext: the file extension to save the crops as. See save_crops().
- additional_gain: additional gain to apply to the crops. If not None, must match the length of the number of crop channels.
- kwargs: see get_many_crops() for more parameters.
Returns
None
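A get-and-save sketch; keyword arguments beyond ext and additional_gain are forwarded to get_many_crops(), so the channel selection goes there:

Event.get_and_save_many_crops(
    events,
    "/tmp/crops",
    labels=["DAPI", "TRITC", "CY5"],
    channels=["DAPI", "TRITC", "CY5"],   # forwarded to get_many_crops()
)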
594 @classmethod 595 def get_and_save_many_montages( 596 cls, 597 events: list[Self], 598 output_path: str, 599 ocular_names: bool = False, 600 tag: str = "", 601 **kwargs, 602 ) -> None: 603 """ 604 Save montages of the events to image files. 605 :param events: the events to get, montage, and save. 606 :param output_path: the folder to save the montages to. Will make if needed. 607 :param ocular_names: whether to use the OCULAR naming convention. 608 :param tag: a tag to append to the file name. Ignored if ocular_names is True. 609 :param kwargs: see get_many_montages() for more parameters. 610 """ 611 unique_tiles = set([event.tile for event in events]) 612 613 for tile in unique_tiles: 614 # Get one tile's worth of event crops 615 tile_events = [e for e in events if e.tile == tile] 616 montages = cls.get_many_montages(tile_events, **kwargs) 617 for event, montage in zip(tile_events, montages): 618 event.save_montage(montage, output_path, ocular_names, tag)
Save montages of the events to image files.
Parameters
- events: the events to get, montage, and save.
- output_path: the folder to save the montages to. Will make if needed.
- ocular_names: whether to use the OCULAR naming convention.
- tag: a tag to append to the file name. Ignored if ocular_names is True.
- kwargs: see get_many_montages() for more parameters.
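And the montage counterpart; extra keyword arguments are forwarded to get_many_montages(), with example channel selections:

Event.get_and_save_many_montages(
    events,
    "/tmp/montages",
    tag="-gallery",
    channels=["DAPI"],
    composites={"CY5": (1.0, 0.0, 0.0)},
)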
621class EventArray: 622 """ 623 A class that holds a large number of events' data, making it easy to analyze and 624 manipulate many events at once. A more separated version of the Event class. 625 """ 626 627 INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y"] 628 629 def __init__( 630 self, 631 info: pd.DataFrame = None, 632 metadata: pd.DataFrame = None, 633 features: pd.DataFrame = None, 634 ): 635 636 # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y" 637 self.info = info 638 if self.info is not None: 639 # Special case: "roi" is often not required, so we'll fill in if its missing 640 if "roi" not in info.columns: 641 self.info = self.info.assign(roi=0) 642 if set(self.info.columns) != set(self.INFO_COLUMNS): 643 raise ValueError( 644 f"EventArray.info must have columns:" 645 f"{self.INFO_COLUMNS}; had {list(self.info.columns)}" 646 ) 647 # Ensure order and data types 648 self.info = pd.DataFrame( 649 { 650 "slide_id": self.info["slide_id"].astype(str), 651 "tile": self.info["tile"].astype(np.uint16), 652 "roi": self.info["roi"].astype(np.uint8), 653 "x": self.info["x"].round().astype(np.uint16), 654 "y": self.info["y"].round().astype(np.uint16), 655 } 656 ) 657 658 # All DataFrames must all have the same number of rows 659 if metadata is not None and (info is None or len(info) != len(metadata)): 660 raise ValueError( 661 "If EventArray.metadata is not None, it should match rows with .info" 662 ) 663 if features is not None and (info is None or len(info) != len(features)): 664 raise ValueError( 665 "If EventArray.features is not None, it should match rows with .info" 666 ) 667 # No columns named "metadata_", "features_", or "None" 668 column_names = [] 669 if metadata is not None: 670 column_names += metadata.columns.tolist() 671 if features is not None: 672 column_names += features.columns.tolist() 673 if any([col.lower().startswith("metadata_") for col in column_names]): 674 raise ValueError("EventArray column names cannot start with 'metadata_'") 675 if any([col.lower().startswith("features_") for col in column_names]): 676 raise ValueError("EventArray column names cannot start with 'features_'") 677 if any([col.lower() == "none" for col in column_names]): 678 raise ValueError("EventArray column names cannot be 'none'") 679 680 # Add metadata and features 681 self.metadata = None 682 self.features = None 683 if metadata is not None: 684 self.add_metadata(metadata) 685 if features is not None: 686 self.add_features(features) 687 688 def __len__(self) -> int: 689 # Convenience method to get the number of events 690 if self.info is None: 691 return 0 692 else: 693 return len(self.info) 694 695 def __eq__(self, other): 696 # Parse all possibilities for info 697 if isinstance(self.info, pd.DataFrame): 698 if isinstance(other.info, pd.DataFrame): 699 if not self.info.equals(other.info): 700 return False 701 else: 702 return False 703 elif self.info is None: 704 if other.info is not None: 705 return False 706 707 # Parse all possibilities for metadata 708 if isinstance(self.metadata, pd.DataFrame): 709 if isinstance(other.metadata, pd.DataFrame): 710 is_equal = self.metadata.equals(other.metadata) 711 if not is_equal: 712 return False 713 else: 714 return False 715 elif self.metadata is None: 716 if other.metadata is not None: 717 return False 718 719 # Parse all possibilities for features 720 if isinstance(self.features, pd.DataFrame): 721 if isinstance(other.features, pd.DataFrame): 722 is_equal = self.features.equals(other.features) 723 if not is_equal: 724 
return False 725 else: 726 return False 727 elif self.features is None: 728 if other.features is not None: 729 return False 730 731 return is_equal 732 733 def get_sort_order( 734 self, by: Hashable | Sequence[Hashable], ascending: bool | Sequence[bool] = True 735 ): 736 """ 737 Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames. 738 :param by: name of the column(s) to sort by. 739 :param ascending: whether to sort in ascending order; can be a list to match by 740 :return: the order of the indices to sort by. 741 """ 742 columns = self.get(by) 743 return columns.sort_values(by=by, ascending=ascending).index 744 745 def sort( 746 self, 747 by: Hashable | Sequence[Hashable], 748 ascending: bool | Sequence[bool] = True, 749 ) -> Self: 750 """ 751 Sort the EventArray by column(s) in the info, metadata, or features DataFrames. 752 :param by: name of the column(s) to sort by. 753 :param ascending: whether to sort in ascending order; can be a list to match by 754 :return: a new, sorted EventArray. 755 """ 756 order = self.get_sort_order(by, ascending) 757 info = self.info.loc[order].reset_index(drop=True) 758 if self.metadata is not None: 759 metadata = self.metadata.loc[order].reset_index(drop=True) 760 else: 761 metadata = None 762 if self.features is not None: 763 features = self.features.loc[order].reset_index(drop=True) 764 else: 765 features = None 766 return EventArray(info, metadata, features) 767 768 def get(self, column_names: Hashable | Sequence[Hashable]) -> pd.DataFrame: 769 """ 770 Get a DataFrame with the specified columns from the EventArray, by value. 771 :param column_names: the names of the columns to get. 772 :return: a DataFrame with the specified columns. 773 """ 774 if isinstance(column_names, Hashable): 775 column_names = [column_names] # Drop into a list for the loop 776 columns = [] 777 for column_name in column_names: 778 if column_name in self.info.columns: 779 columns.append(self.info[column_name]) 780 elif self.metadata is not None and column_name in self.metadata.columns: 781 columns.append(self.metadata[column_name]) 782 elif self.features is not None and column_name in self.features.columns: 783 columns.append(self.features[column_name]) 784 else: 785 raise ValueError(f"Column {column_name} not found in EventArray") 786 return pd.concat(columns, axis=1) 787 788 def rows(self, rows: Sequence[Hashable]) -> Self: 789 """ 790 Get a subset of the EventArray rows based on a boolean or integer index, by value. 791 :param rows: row labels, indices, or boolean mask; anything for .loc[] 792 :return: a new EventArray with the subset of events. 793 """ 794 info = self.info.loc[rows].reset_index(drop=True) 795 if self.metadata is not None: 796 metadata = self.metadata.loc[rows].reset_index(drop=True) 797 else: 798 metadata = None 799 if self.features is not None: 800 features = self.features.loc[rows].reset_index(drop=True) 801 else: 802 features = None 803 return EventArray(info, metadata, features) 804 805 def copy(self) -> Self: 806 """ 807 Create a deep copy of the EventArray. 808 :return: a deep copy of the EventArray. 
809 """ 810 return EventArray( 811 info=self.info.copy(), 812 metadata=None if self.metadata is None else self.metadata.copy(), 813 features=None if self.features is None else self.features.copy(), 814 ) 815 816 # TODO: add a "filter" convenience function that takes a column name and values to filter by 817 818 def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None: 819 """ 820 Add metadata to the EventArray. Removes the need to check if metadata is None. 821 Overwrites any existing metadata with the same column names as the new metadata. 822 :param new_metadata: the metadata to add. 823 """ 824 if self.info is None or len(self.info) != len(new_metadata): 825 raise ValueError("New metadata must match length of existing info") 826 827 if isinstance(new_metadata, pd.Series): 828 # Convert to a DataFrame 829 new_metadata = pd.DataFrame(new_metadata) 830 831 for col in new_metadata.columns: 832 if col in self.INFO_COLUMNS: 833 warnings.warn( 834 f"Column name {col} is reserved for info; you can only " 835 "access this column through the .metadata attribute" 836 ) 837 elif self.features is not None and col in self.features.columns: 838 warnings.warn( 839 f"Column name {col} also exists in the .features attribute; " 840 f"calling this.get({col}) will return the .metadata column" 841 ) 842 843 if self.metadata is None: 844 self.metadata = new_metadata 845 else: 846 self.metadata.loc[:, new_metadata.columns] = new_metadata 847 848 def add_features(self, new_features: pd.Series | pd.DataFrame) -> None: 849 """ 850 Add features to the EventArray. Removes the need to check if features is None. 851 Overwrites any existing features with the same column names as the new features. 852 :param new_features: the features to add. 853 """ 854 if self.info is None or len(self.info) != len(new_features): 855 raise ValueError("New features must match length of existing info") 856 857 if isinstance(new_features, pd.Series): 858 # Convert to a DataFrame 859 new_features = pd.DataFrame(new_features) 860 861 for col in new_features.columns: 862 if col in self.INFO_COLUMNS: 863 warnings.warn( 864 f"Column name {col} is reserved for info; you can only " 865 "access this column through the .features attribute" 866 ) 867 elif self.metadata is not None and col in self.metadata.columns: 868 warnings.warn( 869 f"Column name {col} already exists in the .metadata attribute;" 870 f"calling this.get({col}) will return the .metadata column" 871 ) 872 873 if self.features is None: 874 self.features = new_features 875 else: 876 self.features.loc[:, new_features.columns] = new_features 877 878 @classmethod 879 def merge(cls, events: Iterable[Self]) -> Self: 880 """ 881 Combine EventArrays in a list into a single EventArray. 882 :param events: the new list of events. 
883 """ 884 all_info = [] 885 all_metadata = [] 886 all_features = [] 887 for event_array in events: 888 # Skip empty EventArrays 889 if event_array.info is not None: 890 all_info.append(event_array.info) 891 if event_array.metadata is not None: 892 all_metadata.append(event_array.metadata) 893 if event_array.features is not None: 894 all_features.append(event_array.features) 895 if len(all_info) == 0: 896 return EventArray() 897 else: 898 all_info = pd.concat(all_info, ignore_index=True) 899 if len(all_metadata) == 0: 900 all_metadata = None 901 else: 902 all_metadata = pd.concat(all_metadata, ignore_index=True) 903 if len(all_features) == 0: 904 all_features = None 905 else: 906 all_features = pd.concat(all_features, ignore_index=True) 907 908 return EventArray(all_info, all_metadata, all_features) 909 910 def to_events( 911 self, 912 scans: Scan | Iterable[Scan], 913 ignore_missing_scans=True, 914 ignore_metadata=False, 915 ignore_features=False, 916 ) -> list[Event]: 917 """ 918 Get the events in the EventArray as a list of events. Returns [] if empty. 919 :param scans: the scans that the events belong to, auto-matched by slide_id. 920 Pass None if you don't care about scan metadata (pass ignore_missing_scans). 921 :param ignore_missing_scans: whether to create blank scans for events without scans. 922 :param ignore_metadata: whether to ignore metadata or not 923 :param ignore_features: whether to ignore features or not 924 :return: 925 """ 926 if len(self) == 0: 927 return [] 928 if isinstance(scans, Scan): 929 scans = [scans] 930 scans = {scan.slide_id: scan for scan in scans} 931 events = [] 932 for i in range(len(self.info)): 933 # Determine the associated scan 934 slide_id = self.info["slide_id"][i] 935 if slide_id not in scans: 936 if ignore_missing_scans: 937 # Create a placeholder scan if the scan is missing 938 scan = Scan.make_placeholder( 939 slide_id, 940 self.info["tile"][i], 941 self.info["roi"][i], 942 ) 943 else: 944 raise ValueError( 945 f"Scan {self.info['slide_id'][i]} not found for event {i}." 946 ) 947 else: 948 scan = scans[slide_id] 949 950 # Prepare the metadata and features 951 if ignore_metadata or self.metadata is None: 952 metadata = None 953 else: 954 # This Series creation method is less efficient, 955 # but required for preserving dtypes 956 metadata = pd.Series( 957 {col: self.metadata.loc[i, col] for col in self.metadata.columns}, 958 dtype=object, 959 ) 960 if ignore_features or self.features is None: 961 features = None 962 else: 963 features = pd.Series( 964 {col: self.features.loc[i, col] for col in self.features.columns}, 965 dtype=object, 966 ) 967 # Create the event and append it to the list 968 events.append( 969 Event( 970 Tile(scan, self.info["tile"][i], self.info["roi"][i]), 971 self.info["x"][i], 972 self.info["y"][i], 973 metadata=metadata, 974 features=features, 975 ) 976 ) 977 return events 978 979 @classmethod 980 def from_events(cls, events: Iterable[Event]) -> Self: 981 """ 982 Set the events in the EventArray to a new list of events. 983 :param events: the new list of events. 
984 """ 985 info = pd.DataFrame( 986 { 987 "slide_id": [event.tile.scan.slide_id for event in events], 988 "tile": [event.tile.n for event in events], 989 "roi": [event.tile.n_roi for event in events], 990 "x": [event.x for event in events], 991 "y": [event.y for event in events], 992 } 993 ) 994 metadata_list = [event.metadata for event in events] 995 # Iterate through and ensure that all metadata is the same shape 996 for metadata in metadata_list: 997 if type(metadata) != type(metadata_list[0]): 998 raise ValueError("All metadata must be the same type.") 999 if metadata is not None and metadata.shape != metadata_list[0].shape: 1000 raise ValueError("All metadata must be the same shape.") 1001 if metadata_list[0] is None: 1002 metadata = None 1003 else: 1004 metadata = pd.DataFrame(metadata_list) 1005 features_list = [event.features for event in events] 1006 # Iterate through and ensure that all features are the same shape 1007 for features in features_list: 1008 if type(features) != type(features_list[0]): 1009 raise ValueError("All features must be the same type.") 1010 if features is not None and features.shape != features_list[0].shape: 1011 raise ValueError("All features must be the same shape.") 1012 if features_list[0] is None: 1013 features = None 1014 else: 1015 features = pd.DataFrame(features_list) 1016 return EventArray(info=info, metadata=metadata, features=features) 1017 1018 def to_dataframe(self) -> pd.DataFrame: 1019 """ 1020 Convert all the data in the EventArray to a single DataFrame. 1021 :return: a DataFrame with all the data in the EventArray. 1022 """ 1023 # Make a copy of the info DataFrame and prepend "info_" to the column names 1024 output = self.info.copy() 1025 # Combine with the metadata and prepend "metadata_" to the column names 1026 if self.metadata is not None: 1027 metadata = self.metadata.copy() 1028 metadata.columns = [f"metadata_{col}" for col in metadata.columns] 1029 output = pd.concat([output, metadata], axis=1) 1030 # Combine with the features and prepend "features_" to the column names 1031 if self.features is not None: 1032 features = self.features.copy() 1033 features.columns = [f"features_{col}" for col in features.columns] 1034 output = pd.concat([output, features], axis=1) 1035 return output 1036 1037 @classmethod 1038 def from_dataframe( 1039 cls, df, metadata_prefix: str = "metadata_", features_prefix: str = "features_" 1040 ) -> Self: 1041 """ 1042 From a single, special DataFrame, create an EventArray. 1043 :param df: the DataFrame to convert to an EventArray. 1044 :param metadata_prefix: the prefix for metadata columns. 1045 :param features_prefix: the prefix for features columns. 1046 :return: a DataFrame with all the data in the EventArray. 
1047 """ 1048 # Split the columns into info, metadata, and features and strip prefix 1049 info = df[[col for col in df.columns if col in cls.INFO_COLUMNS]].copy() 1050 if info.size == 0: 1051 info = None 1052 metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy() 1053 metadata.columns = [ 1054 col.replace(metadata_prefix, "") for col in metadata.columns 1055 ] 1056 if metadata.size == 0: 1057 metadata = None 1058 features = df[[col for col in df.columns if col.startswith("features_")]].copy() 1059 features.columns = [ 1060 col.replace(features_prefix, "") for col in features.columns 1061 ] 1062 if features.size == 0: 1063 features = None 1064 return cls(info=info, metadata=metadata, features=features) 1065 1066 @classmethod 1067 def from_mask( 1068 cls, 1069 mask: np.ndarray, 1070 tile: Tile, 1071 include_cell_id: bool = True, 1072 images: list[np.ndarray] = None, 1073 image_labels: list[str] = None, 1074 properties: list[str] = None, 1075 ) -> Self: 1076 """ 1077 Extract events from a mask DataFrame, including metadata and features. 1078 :param mask: the mask to extract events from. 1079 :param tile: the Tile object associated with this mask. 1080 :param include_cell_id: whether to include the cell_id, or numerical 1081 mask label, as metadata in the EventArray. 1082 :param images: the intensity images to extract features from. 1083 :param image_labels: the labels for the intensity images. 1084 :param properties: list of properties to extract in addition to the defaults: 1085 :return: EventArray corresponding to the mask labels. 1086 """ 1087 if csi_images is None: 1088 raise ModuleNotFoundError( 1089 "imageio libraries not installed! " 1090 "run `pip install csi_images[imageio]` to resolve." 1091 ) 1092 # Gather mask_info 1093 if images is not None and image_labels is not None: 1094 if len(images) != len(image_labels): 1095 raise ValueError("Intensity images and labels must match lengths.") 1096 1097 mask_info = csi_images.extract_mask_info(mask, images, image_labels, properties) 1098 1099 if len(mask_info) == 0: 1100 return EventArray() 1101 1102 # Combine provided info and mask info 1103 info = pd.DataFrame( 1104 { 1105 "slide_id": tile.scan.slide_id, 1106 "tile": tile.n, 1107 "roi": tile.n_roi, 1108 "x": mask_info["x"], 1109 "y": mask_info["y"], 1110 }, 1111 ) 1112 # Extract a metadata column if desired 1113 if include_cell_id: 1114 metadata = pd.DataFrame({"cell_id": mask_info["id"]}) 1115 else: 1116 metadata = None 1117 # If any additional properties were extracted, add them as features 1118 mask_info = mask_info.drop(columns=["id", "x", "y"], errors="ignore") 1119 if len(mask_info.columns) > 0: 1120 features = mask_info 1121 features.columns = [col.lower() for col in features.columns] 1122 else: 1123 features = None 1124 return EventArray(info, metadata, features) 1125 1126 def save_csv(self, output_path: str) -> bool: 1127 """ 1128 Save the events to an CSV file, including metadata and features. 1129 :param output_path: 1130 :return: 1131 """ 1132 if not output_path.endswith(".csv"): 1133 output_path += ".csv" 1134 self.to_dataframe().to_csv(output_path, index=False) 1135 return os.path.exists(output_path) 1136 1137 @classmethod 1138 def load_csv( 1139 cls, 1140 input_path: str, 1141 metadata_prefix: str = "metadata_", 1142 features_prefix: str = "features_", 1143 ) -> Self: 1144 """ 1145 Load the events from an CSV file, including metadata and features. 
1146 :param input_path: 1147 :param metadata_prefix: 1148 :param features_prefix: 1149 :return: 1150 """ 1151 # Load the CSV file 1152 df = pd.read_csv(input_path) 1153 return cls.from_dataframe(df, metadata_prefix, features_prefix) 1154 1155 def save_json(self, output_path: str, orient: str = "records") -> bool: 1156 """ 1157 Save the events to a JSON file, including metadata and features. 1158 :param output_path: 1159 :param orient: the orientation of the JSON file, see pandas.DataFrame.to_json() 1160 :return: 1161 """ 1162 if not output_path.endswith(".json"): 1163 output_path += ".json" 1164 self.to_dataframe().to_json(output_path, orient=orient, indent=2) 1165 return os.path.exists(output_path) 1166 1167 @classmethod 1168 def load_json( 1169 cls, 1170 input_path: str, 1171 metadata_prefix: str = "metadata_", 1172 features_prefix: str = "features_", 1173 ) -> Self: 1174 """ 1175 Load the events from a JSON file, including metadata and features. 1176 :param input_path: 1177 :param metadata_prefix: 1178 :param features_prefix: 1179 :return: 1180 """ 1181 # Load the JSON file 1182 df = pd.read_json(input_path, orient="records") 1183 return cls.from_dataframe(df, metadata_prefix, features_prefix) 1184 1185 def save_hdf5( 1186 self, output_path: str, complevel: int = 1, complib="blosc:zstd" 1187 ) -> bool: 1188 """ 1189 Save the events to an HDF5 file, including metadata and features. 1190 Uses the pandas-provided HDF5 functions for ease, and external compatibility, 1191 though these files are slightly harder to view in HDFView or similar. 1192 Compression defaults remain very quick while cutting file size by 50%+. 1193 :param output_path: 1194 :param complevel: see pandas.HDFStore for more details. 1195 :param complib: see pandas.HDFStore for more details. 1196 :return: 1197 """ 1198 if not output_path.endswith(".hdf5") and not output_path.endswith(".h5"): 1199 output_path += ".hdf5" 1200 # Open the output_path as an HDF5 file 1201 with pd.HDFStore( 1202 output_path, mode="w", complevel=complevel, complib=complib 1203 ) as store: 1204 # Store the dataframes in the HDF5 file 1205 if self.info is not None: 1206 store.put("info", self.info, index=False) 1207 if self.metadata is not None: 1208 store.put("metadata", self.metadata, index=False) 1209 if self.features is not None: 1210 store.put("features", self.features, index=False) 1211 return os.path.exists(output_path) 1212 1213 @classmethod 1214 def load_hdf5(cls, input_path: str) -> Self: 1215 """ 1216 Load the events from an HDF5 file, including metadata and features. 1217 :param input_path: 1218 :return: 1219 """ 1220 # Open the input_path as an HDF5 file 1221 with pd.HDFStore(input_path, "r") as store: 1222 # Load the dataframes from the HDF5 file 1223 info = store.get("info") if "info" in store else None 1224 metadata = store.get("metadata") if "metadata" in store else None 1225 features = store.get("features") if "features" in store else None 1226 return cls(info=info, metadata=metadata, features=features) 1227 1228 def save_ocular(self, output_path: str, event_type: str = "cells"): 1229 """ 1230 Save the events to an OCULAR file. Relies on the dataframe originating 1231 from an OCULAR file (same columns; duplicate metadata/info). 1232 :param output_path: 1233 :param event_type: 1234 :return: 1235 """ 1236 if pyreadr is None: 1237 raise ModuleNotFoundError( 1238 "pyreadr not installed! Install pyreadr directly " 1239 "or run `pip install csi-images[rds]` option to resolve." 
1240 ) 1241 if event_type == "cells": 1242 file_stub = "rc-final" 1243 elif event_type == "others": 1244 file_stub = "others-final" 1245 else: 1246 raise ValueError("Invalid event type. Must be cells or others.") 1247 1248 # Ensure good metadata 1249 metadata = pd.DataFrame( 1250 { 1251 "slide_id": self.info["slide_id"], 1252 "frame_id": self.info["tile"] + 1, # Convert to 1-indexed for R 1253 "cell_id": ( 1254 self.metadata["cell_id"] 1255 if "cell_id" in self.metadata.columns 1256 else range(len(self.info)) 1257 ), 1258 "cellx": self.info["x"], 1259 "celly": self.info["y"], 1260 } 1261 ) 1262 if self.metadata is not None: 1263 metadata[self.metadata.columns] = self.metadata.copy() 1264 1265 # Check for the "ocular_interesting" column 1266 if event_type == "cells": 1267 if "ocular_interesting" in metadata.columns: 1268 interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool) 1269 elif "hcpc" in metadata.columns: 1270 # Interesting cells don't get an hcpc designation, leaving them as -1 1271 interesting_rows = ( 1272 metadata["hcpc"].to_numpy() == -1 1273 ) # interesting cells 1274 else: 1275 interesting_rows = [] 1276 if sum(interesting_rows) > 0: 1277 # Split the metadata into interesting and regular 1278 interesting_events = self.rows(interesting_rows) 1279 interesting_df = pd.concat( 1280 [interesting_events.features, interesting_events.metadata], axis=1 1281 ) 1282 data_events = self.rows(~interesting_rows) 1283 data_df = pd.concat( 1284 [data_events.features, data_events.metadata], axis=1 1285 ) 1286 data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore") 1287 1288 # Drop particular columns for "interesting" 1289 interesting_df = interesting_df.drop( 1290 [ 1291 "clust", 1292 "hcpc", 1293 "frame_id", 1294 "cell_id", 1295 "unique_id", 1296 "ocular_interesting", 1297 ], 1298 axis=1, 1299 errors="ignore", 1300 ) 1301 # Save both .csv and .rds 1302 interesting_stub = os.path.join(output_path, "ocular_interesting") 1303 interesting_df.to_csv(f"{interesting_stub}.csv") 1304 # Suppress pandas FutureWarning 1305 with warnings.catch_warnings(): 1306 warnings.simplefilter(action="ignore", category=FutureWarning) 1307 pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df) 1308 else: 1309 data_df = pd.concat([self.features, metadata], axis=1) 1310 else: 1311 # Get all data and reset_index (will copy it) 1312 data_df = pd.concat([self.features, metadata], axis=1) 1313 1314 # Split based on cluster number to conform to *-final[1-4].rds 1315 n_clusters = max(data_df["clust"]) + 1 1316 split_idx = [round(i * n_clusters / 4) for i in range(5)] 1317 for i in range(4): 1318 subset = (split_idx[i] <= data_df["clust"]) & ( 1319 data_df["clust"] < split_idx[i + 1] 1320 ) 1321 data_df.loc[subset, "hcpc"] = i + 1 1322 subset = data_df[subset].reset_index(drop=True) 1323 # Suppress pandas FutureWarning 1324 with warnings.catch_warnings(): 1325 warnings.simplefilter(action="ignore", category=FutureWarning) 1326 pyreadr.write_rds( 1327 os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset 1328 ) 1329 1330 # Create new example cell strings 1331 data_df["example_cell_id"] = ( 1332 data_df["slide_id"] 1333 + " " 1334 + data_df["frame_id"].astype(str) 1335 + " " 1336 + data_df["cell_id"].astype(str) 1337 + " " 1338 + data_df["cellx"].astype(int).astype(str) 1339 + " " 1340 + data_df["celly"].astype(int).astype(str) 1341 ) 1342 # Find averagable data columns 1343 if "cellcluster_id" in data_df.columns: 1344 end_idx = data_df.columns.get_loc("cellcluster_id") 1345 else: 1346 
end_idx = data_df.columns.get_loc("slide_id") 1347 avg_cols = data_df.columns[:end_idx].tolist() 1348 # Group by cluster and average 1349 data_df = data_df.groupby("clust").agg( 1350 **{col: (col, "mean") for col in avg_cols}, 1351 count=("clust", "size"), # count rows in each cluster 1352 example_cells=("example_cell_id", lambda x: ",".join(x)), 1353 hcpc=("hcpc", lambda x: x.iloc[0]), 1354 ) 1355 data_df = data_df.reset_index() # Do NOT drop, index is "clust" 1356 # Create new columns 1357 metadata = pd.DataFrame( 1358 { 1359 "count": data_df["count"], 1360 "example_cells": data_df["example_cells"], 1361 "clust": data_df["clust"].astype(int), 1362 "hcpc": data_df["hcpc"].astype(int), 1363 "id": data_df["clust"].astype(int).astype(str), 1364 "cccluster": "0", # Dummy value 1365 "ccdistance": 0.0, # Dummy value 1366 "rownum": list(range(len(data_df))), 1367 "framegroup": 0, # Dummy value 1368 } 1369 ) 1370 # Need to pad the features to 761 columns, as per OCULAR report needs 1371 additional_columns = range(len(avg_cols), 761) 1372 if len(additional_columns) > 0: 1373 padding = pd.DataFrame( 1374 np.zeros((len(data_df), len(additional_columns))), 1375 columns=[f"pad{i}" for i in additional_columns], 1376 ) 1377 data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1) 1378 else: 1379 data_df = pd.concat([data_df[avg_cols], metadata], axis=1) 1380 1381 # Save the cluster data 1382 data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv")) 1383 # Suppress pandas FutureWarning 1384 with warnings.catch_warnings(): 1385 warnings.simplefilter(action="ignore", category=FutureWarning) 1386 pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df) 1387 1388 @classmethod 1389 def load_ocular( 1390 cls, 1391 input_path: str, 1392 event_type="cells", 1393 cell_data_files=( 1394 "rc-final1.rds", 1395 "rc-final2.rds", 1396 "rc-final3.rds", 1397 "rc-final4.rds", 1398 "ocular_interesting.rds", 1399 ), 1400 others_data_files=( 1401 "others-final1.rds", 1402 "others-final2.rds", 1403 "others-final3.rds", 1404 "others-final4.rds", 1405 ), 1406 atlas_data_files=( 1407 "ocular_interesting.rds", 1408 "ocular_not_interesting.rds", 1409 ), 1410 drop_common_events=True, 1411 ) -> Self: 1412 """ 1413 1414 :param input_path: 1415 :param event_type: 1416 :param cell_data_files: 1417 :param others_data_files: 1418 :param atlas_data_files: 1419 :param drop_common_events: 1420 :return: 1421 """ 1422 if pyreadr is None: 1423 raise ModuleNotFoundError( 1424 "pyreadr not installed! Install pyreadr directly " 1425 "or run `pip install csi-images[rds]` option to resolve." 
1426 ) 1427 # Check if the input path is a directory or a file 1428 if os.path.isfile(input_path): 1429 data_files = [os.path.basename(input_path)] 1430 input_path = os.path.dirname(input_path) 1431 if event_type == "cells": 1432 data_files = cell_data_files 1433 elif event_type == "others": 1434 data_files = others_data_files 1435 else: 1436 raise ValueError("Invalid event type.") 1437 1438 # Load the data from the OCULAR files 1439 file_data = {} 1440 for file in data_files: 1441 file_path = os.path.join(input_path, file) 1442 if not os.path.isfile(file_path): 1443 warnings.warn(f"{file} not found for in {input_path}") 1444 continue 1445 file_data[file] = pyreadr.read_r(file_path) 1446 # Get the DataFrame associated with None (pyreadr dict quirk) 1447 file_data[file] = file_data[file][None] 1448 if len(file_data[file]) == 0: 1449 # File gets dropped from the dict 1450 file_data.pop(file) 1451 warnings.warn(f"{file} has no cells") 1452 continue 1453 1454 # Drop common cells if requested and in this file 1455 if ( 1456 file in atlas_data_files 1457 and drop_common_events 1458 and "catalogue_classification" in file_data[file] 1459 ): 1460 common_cell_indices = ( 1461 file_data[file]["catalogue_classification"] == "common_cell" 1462 ) 1463 file_data[file] = file_data[file][common_cell_indices == False] 1464 1465 if len(file_data[file]) == 0: 1466 # File gets dropped from the dict 1467 file_data.pop(file) 1468 warnings.warn(f"{file} has no cells after dropping common cells") 1469 continue 1470 1471 # Extract frame_id and cell_id 1472 # DAPI- events already have frame_id cell_id outside rowname 1473 if event_type == "cells" and "frame_id" not in file_data[file].columns: 1474 file_data[file]["rowname"] = file_data[file]["rowname"].astype("str") 1475 # get frame_id cell_id from rownames column and split into two columns 1476 split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True) 1477 if len(split_res.columns) != 2: 1478 warnings.warn( 1479 f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}' 1480 ) 1481 # then assign it back to the dataframe 1482 file_data[file][["frame_id", "cell_id"]] = split_res.astype("int") 1483 # Ensure frame_id and cell_id are integers 1484 file_data[file]["frame_id"] = file_data[file]["frame_id"].astype("int") 1485 file_data[file]["cell_id"] = file_data[file]["cell_id"].astype("int") 1486 # reset indexes since they can cause NaN values in concat 1487 file_data[file] = file_data[file].reset_index(drop=True) 1488 1489 # Merge the data from all files 1490 if len(file_data) == 0: 1491 return EventArray() 1492 elif len(file_data) == 1: 1493 data = [file_data[file] for file in file_data.keys()][0] 1494 else: 1495 data = pd.concat(file_data.values()) 1496 1497 # Others is missing the "slide_id". 
Insert it right before "frame_id" column 1498 if event_type == "others" and "slide_id" not in data.columns: 1499 if os.path.basename(input_path) == "ocular": 1500 slide_id = os.path.basename(os.path.dirname(input_path)) 1501 else: 1502 slide_id = "UNKNOWN" 1503 data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id) 1504 1505 # Sort according to ascending cell_id to keep the original, which is in manual_df 1506 data = data.sort_values(by=["cell_id"], ascending=True) 1507 # Filter out duplicates by x & y 1508 data = data.assign( 1509 unique_id=data["slide_id"] 1510 + "_" 1511 + data["frame_id"].astype(str) 1512 + "_" 1513 + data["cellx"].astype(int).astype(str) 1514 + "_" 1515 + data["celly"].astype(int).astype(str) 1516 ) 1517 data = data.drop_duplicates(subset=["unique_id"], keep="first") 1518 # Normal unique_id is with cell_id 1519 data = data.assign( 1520 unique_id=data["slide_id"] 1521 + "_" 1522 + data["frame_id"].astype(str) 1523 + "_" 1524 + data["cell_id"].astype(str) 1525 ) 1526 data = data.reset_index(drop=True) 1527 # All columns up to "slide_id" are features; drop the "slide_id" 1528 features = data.loc[:, :"slide_id"].iloc[:, :-1] 1529 data = data.loc[:, "slide_id":] 1530 # Grab the info columns 1531 info = data[["slide_id", "frame_id", "cellx", "celly"]] 1532 info.columns = ["slide_id", "tile", "x", "y"] 1533 info = info.assign(roi=0) # OCULAR only works on 1 ROI, as far as known 1534 info = info[["slide_id", "tile", "roi", "x", "y"]] 1535 # Metadata has duplicate columns for later convenience 1536 metadata = data 1537 # Certain columns tend to be problematic with mixed data formats... 1538 for col in ["TRITC", "CY5", "FITC"]: 1539 if col in metadata: 1540 labels = { 1541 "False": False, 1542 "True": True, 1543 "FALSE": False, 1544 "TRUE": True, 1545 False: False, 1546 True: True, 1547 } 1548 metadata[col] = metadata[col].map(labels).astype(bool) 1549 for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]: 1550 if col in metadata: 1551 metadata[col] = metadata[col].fillna(-1).astype(int) 1552 info["tile"] = info["tile"] - 1 # Convert to 0-based indexing 1553 return EventArray(info, metadata, features)
A class that holds a large number of events' data, making it easy to analyze and manipulate many events at once. A column-separated counterpart to the Event class: positions (info), metadata, and features are kept in parallel DataFrames rather than on individual objects.
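For orientation, here is a minimal construction sketch; the slide ID, coordinates, and the `cell_id`/`dapi_mean` columns are made-up values for illustration. Later sketches on this page reuse this hypothetical `events` array.

    import pandas as pd
    from csi_images.csi_events import EventArray

    # Required info columns: slide_id, tile, x, y ("roi" is filled with 0 if omitted)
    info = pd.DataFrame({
        "slide_id": ["SLIDE001", "SLIDE001", "SLIDE001"],
        "tile": [0, 0, 5],
        "x": [120, 480, 33],
        "y": [200, 95, 710],
    })
    # Optional per-event metadata and features; must have the same number of rows as info
    metadata = pd.DataFrame({"cell_id": [1, 2, 1]})
    features = pd.DataFrame({"dapi_mean": [0.42, 0.87, 0.13]})

    events = EventArray(info, metadata, features)
    print(len(events))  # 3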
629 def __init__( 630 self, 631 info: pd.DataFrame = None, 632 metadata: pd.DataFrame = None, 633 features: pd.DataFrame = None, 634 ): 635 636 # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y" 637 self.info = info 638 if self.info is not None: 639 # Special case: "roi" is often not required, so we'll fill in if its missing 640 if "roi" not in info.columns: 641 self.info = self.info.assign(roi=0) 642 if set(self.info.columns) != set(self.INFO_COLUMNS): 643 raise ValueError( 644 f"EventArray.info must have columns:" 645 f"{self.INFO_COLUMNS}; had {list(self.info.columns)}" 646 ) 647 # Ensure order and data types 648 self.info = pd.DataFrame( 649 { 650 "slide_id": self.info["slide_id"].astype(str), 651 "tile": self.info["tile"].astype(np.uint16), 652 "roi": self.info["roi"].astype(np.uint8), 653 "x": self.info["x"].round().astype(np.uint16), 654 "y": self.info["y"].round().astype(np.uint16), 655 } 656 ) 657 658 # All DataFrames must all have the same number of rows 659 if metadata is not None and (info is None or len(info) != len(metadata)): 660 raise ValueError( 661 "If EventArray.metadata is not None, it should match rows with .info" 662 ) 663 if features is not None and (info is None or len(info) != len(features)): 664 raise ValueError( 665 "If EventArray.features is not None, it should match rows with .info" 666 ) 667 # No columns named "metadata_", "features_", or "None" 668 column_names = [] 669 if metadata is not None: 670 column_names += metadata.columns.tolist() 671 if features is not None: 672 column_names += features.columns.tolist() 673 if any([col.lower().startswith("metadata_") for col in column_names]): 674 raise ValueError("EventArray column names cannot start with 'metadata_'") 675 if any([col.lower().startswith("features_") for col in column_names]): 676 raise ValueError("EventArray column names cannot start with 'features_'") 677 if any([col.lower() == "none" for col in column_names]): 678 raise ValueError("EventArray column names cannot be 'none'") 679 680 # Add metadata and features 681 self.metadata = None 682 self.features = None 683 if metadata is not None: 684 self.add_metadata(metadata) 685 if features is not None: 686 self.add_features(features)
733 def get_sort_order( 734 self, by: Hashable | Sequence[Hashable], ascending: bool | Sequence[bool] = True 735 ): 736 """ 737 Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames. 738 :param by: name of the column(s) to sort by. 739 :param ascending: whether to sort in ascending order; can be a list to match by 740 :return: the order of the indices to sort by. 741 """ 742 columns = self.get(by) 743 return columns.sort_values(by=by, ascending=ascending).index
Get the sort order for the EventArray by column(s) in the info, metadata, or features DataFrames.
Parameters
- by: name of the column(s) to sort by.
- ascending: whether to sort in ascending order; may be a list of booleans matching the columns in `by`.
Returns
the order of the indices to sort by.
745 def sort( 746 self, 747 by: Hashable | Sequence[Hashable], 748 ascending: bool | Sequence[bool] = True, 749 ) -> Self: 750 """ 751 Sort the EventArray by column(s) in the info, metadata, or features DataFrames. 752 :param by: name of the column(s) to sort by. 753 :param ascending: whether to sort in ascending order; can be a list to match by 754 :return: a new, sorted EventArray. 755 """ 756 order = self.get_sort_order(by, ascending) 757 info = self.info.loc[order].reset_index(drop=True) 758 if self.metadata is not None: 759 metadata = self.metadata.loc[order].reset_index(drop=True) 760 else: 761 metadata = None 762 if self.features is not None: 763 features = self.features.loc[order].reset_index(drop=True) 764 else: 765 features = None 766 return EventArray(info, metadata, features)
Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
Parameters
- by: name of the column(s) to sort by.
- ascending: whether to sort in ascending order; may be a list of booleans matching the columns in `by`.
Returns
a new, sorted EventArray.
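As a usage sketch (reusing the hypothetical `events` array from the construction sketch above), `sort` accepts any mix of info, metadata, and feature columns, while `get_sort_order` returns just the row order.

    # Sort by a feature column, brightest first
    by_brightness = events.sort("dapi_mean", ascending=False)

    # Multi-column sort mixing an info column and a feature column
    ordered = events.sort(["tile", "dapi_mean"], ascending=[True, False])

    # Row order only, without building a new EventArray
    order = events.get_sort_order("dapi_mean")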
768 def get(self, column_names: Hashable | Sequence[Hashable]) -> pd.DataFrame: 769 """ 770 Get a DataFrame with the specified columns from the EventArray, by value. 771 :param column_names: the names of the columns to get. 772 :return: a DataFrame with the specified columns. 773 """ 774 if isinstance(column_names, Hashable): 775 column_names = [column_names] # Drop into a list for the loop 776 columns = [] 777 for column_name in column_names: 778 if column_name in self.info.columns: 779 columns.append(self.info[column_name]) 780 elif self.metadata is not None and column_name in self.metadata.columns: 781 columns.append(self.metadata[column_name]) 782 elif self.features is not None and column_name in self.features.columns: 783 columns.append(self.features[column_name]) 784 else: 785 raise ValueError(f"Column {column_name} not found in EventArray") 786 return pd.concat(columns, axis=1)
Get a DataFrame with the specified columns from the EventArray, by value.
Parameters
- column_names: the names of the columns to get.
Returns
a DataFrame with the specified columns.
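A short sketch of `get`, again using the hypothetical column names from the construction sketch; columns are looked up across info, metadata, and features and returned together in one DataFrame.

    xs = events.get("x")                                     # single info column
    subset = events.get(["slide_id", "tile", "dapi_mean"])   # info + feature columns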
788 def rows(self, rows: Sequence[Hashable]) -> Self: 789 """ 790 Get a subset of the EventArray rows based on a boolean or integer index, by value. 791 :param rows: row labels, indices, or boolean mask; anything for .loc[] 792 :return: a new EventArray with the subset of events. 793 """ 794 info = self.info.loc[rows].reset_index(drop=True) 795 if self.metadata is not None: 796 metadata = self.metadata.loc[rows].reset_index(drop=True) 797 else: 798 metadata = None 799 if self.features is not None: 800 features = self.features.loc[rows].reset_index(drop=True) 801 else: 802 features = None 803 return EventArray(info, metadata, features)
Get a subset of the EventArray rows based on a boolean or integer index, by value.
Parameters
- rows: row labels, indices, or boolean mask; anything for .loc[]
Returns
a new EventArray with the subset of events.
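A sketch of row selection; anything that pandas `.loc[]` accepts should work, including boolean masks built from `get`. Column names continue the earlier hypothetical array.

    # Keep only events on tile 0
    mask = events.get("tile")["tile"] == 0
    tile0_events = events.rows(mask)

    # Or select by row label (the default RangeIndex)
    first_and_third = events.rows([0, 2])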
805 def copy(self) -> Self: 806 """ 807 Create a deep copy of the EventArray. 808 :return: a deep copy of the EventArray. 809 """ 810 return EventArray( 811 info=self.info.copy(), 812 metadata=None if self.metadata is None else self.metadata.copy(), 813 features=None if self.features is None else self.features.copy(), 814 )
Create a deep copy of the EventArray.
Returns
a deep copy of the EventArray.
818 def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None: 819 """ 820 Add metadata to the EventArray. Removes the need to check if metadata is None. 821 Overwrites any existing metadata with the same column names as the new metadata. 822 :param new_metadata: the metadata to add. 823 """ 824 if self.info is None or len(self.info) != len(new_metadata): 825 raise ValueError("New metadata must match length of existing info") 826 827 if isinstance(new_metadata, pd.Series): 828 # Convert to a DataFrame 829 new_metadata = pd.DataFrame(new_metadata) 830 831 for col in new_metadata.columns: 832 if col in self.INFO_COLUMNS: 833 warnings.warn( 834 f"Column name {col} is reserved for info; you can only " 835 "access this column through the .metadata attribute" 836 ) 837 elif self.features is not None and col in self.features.columns: 838 warnings.warn( 839 f"Column name {col} also exists in the .features attribute; " 840 f"calling this.get({col}) will return the .metadata column" 841 ) 842 843 if self.metadata is None: 844 self.metadata = new_metadata 845 else: 846 self.metadata.loc[:, new_metadata.columns] = new_metadata
Add metadata to the EventArray. Removes the need to check if metadata is None. Overwrites any existing metadata with the same column names as the new metadata.
Parameters
- new_metadata: the metadata to add.
848 def add_features(self, new_features: pd.Series | pd.DataFrame) -> None: 849 """ 850 Add features to the EventArray. Removes the need to check if features is None. 851 Overwrites any existing features with the same column names as the new features. 852 :param new_features: the features to add. 853 """ 854 if self.info is None or len(self.info) != len(new_features): 855 raise ValueError("New features must match length of existing info") 856 857 if isinstance(new_features, pd.Series): 858 # Convert to a DataFrame 859 new_features = pd.DataFrame(new_features) 860 861 for col in new_features.columns: 862 if col in self.INFO_COLUMNS: 863 warnings.warn( 864 f"Column name {col} is reserved for info; you can only " 865 "access this column through the .features attribute" 866 ) 867 elif self.metadata is not None and col in self.metadata.columns: 868 warnings.warn( 869 f"Column name {col} already exists in the .metadata attribute;" 870 f"calling this.get({col}) will return the .metadata column" 871 ) 872 873 if self.features is None: 874 self.features = new_features 875 else: 876 self.features.loc[:, new_features.columns] = new_features
Add features to the EventArray. Removes the need to check if features is None. Overwrites any existing features with the same column names as the new features.
Parameters
- new_features: the features to add.
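A sketch of attaching metadata and feature columns after construction; the new DataFrames must have one row per existing event. The `classification` and `area_px` names are made up for illustration, and `info` is the DataFrame from the construction sketch above.

    import pandas as pd
    from csi_images.csi_events import EventArray

    bare = EventArray(info)  # start with only the required info
    bare.add_metadata(pd.DataFrame({"classification": ["cell", "debris", "cell"]}))
    bare.add_features(pd.DataFrame({"area_px": [310, 88, 245]}))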
878 @classmethod 879 def merge(cls, events: Iterable[Self]) -> Self: 880 """ 881 Combine EventArrays in a list into a single EventArray. 882 :param events: the new list of events. 883 """ 884 all_info = [] 885 all_metadata = [] 886 all_features = [] 887 for event_array in events: 888 # Skip empty EventArrays 889 if event_array.info is not None: 890 all_info.append(event_array.info) 891 if event_array.metadata is not None: 892 all_metadata.append(event_array.metadata) 893 if event_array.features is not None: 894 all_features.append(event_array.features) 895 if len(all_info) == 0: 896 return EventArray() 897 else: 898 all_info = pd.concat(all_info, ignore_index=True) 899 if len(all_metadata) == 0: 900 all_metadata = None 901 else: 902 all_metadata = pd.concat(all_metadata, ignore_index=True) 903 if len(all_features) == 0: 904 all_features = None 905 else: 906 all_features = pd.concat(all_features, ignore_index=True) 907 908 return EventArray(all_info, all_metadata, all_features)
Combine EventArrays in a list into a single EventArray.
Parameters
- events: the EventArrays to combine into one.
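A sketch of combining per-tile or per-slide results, continuing the earlier hypothetical arrays; `merge` concatenates the info, metadata, and features row-wise.

    from csi_images.csi_events import EventArray

    combined = EventArray.merge([events, tile0_events])
    print(len(combined) == len(events) + len(tile0_events))  # True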
910 def to_events( 911 self, 912 scans: Scan | Iterable[Scan], 913 ignore_missing_scans=True, 914 ignore_metadata=False, 915 ignore_features=False, 916 ) -> list[Event]: 917 """ 918 Get the events in the EventArray as a list of events. Returns [] if empty. 919 :param scans: the scans that the events belong to, auto-matched by slide_id. 920 Pass None if you don't care about scan metadata (pass ignore_missing_scans). 921 :param ignore_missing_scans: whether to create blank scans for events without scans. 922 :param ignore_metadata: whether to ignore metadata or not 923 :param ignore_features: whether to ignore features or not 924 :return: 925 """ 926 if len(self) == 0: 927 return [] 928 if isinstance(scans, Scan): 929 scans = [scans] 930 scans = {scan.slide_id: scan for scan in scans} 931 events = [] 932 for i in range(len(self.info)): 933 # Determine the associated scan 934 slide_id = self.info["slide_id"][i] 935 if slide_id not in scans: 936 if ignore_missing_scans: 937 # Create a placeholder scan if the scan is missing 938 scan = Scan.make_placeholder( 939 slide_id, 940 self.info["tile"][i], 941 self.info["roi"][i], 942 ) 943 else: 944 raise ValueError( 945 f"Scan {self.info['slide_id'][i]} not found for event {i}." 946 ) 947 else: 948 scan = scans[slide_id] 949 950 # Prepare the metadata and features 951 if ignore_metadata or self.metadata is None: 952 metadata = None 953 else: 954 # This Series creation method is less efficient, 955 # but required for preserving dtypes 956 metadata = pd.Series( 957 {col: self.metadata.loc[i, col] for col in self.metadata.columns}, 958 dtype=object, 959 ) 960 if ignore_features or self.features is None: 961 features = None 962 else: 963 features = pd.Series( 964 {col: self.features.loc[i, col] for col in self.features.columns}, 965 dtype=object, 966 ) 967 # Create the event and append it to the list 968 events.append( 969 Event( 970 Tile(scan, self.info["tile"][i], self.info["roi"][i]), 971 self.info["x"][i], 972 self.info["y"][i], 973 metadata=metadata, 974 features=features, 975 ) 976 ) 977 return events
Get the events in the EventArray as a list of events. Returns [] if empty.
Parameters
- scans: the scans that the events belong to, auto-matched by slide_id. Pass an empty iterable and leave ignore_missing_scans=True if you don't care about scan metadata; placeholder scans will be created.
- ignore_missing_scans: whether to create blank scans for events without scans.
- ignore_metadata: whether to skip attaching metadata to each Event.
- ignore_features: whether to skip attaching features to each Event.
Returns
a list of Event objects, one per row; [] if the EventArray is empty.
979 @classmethod 980 def from_events(cls, events: Iterable[Event]) -> Self: 981 """ 982 Set the events in the EventArray to a new list of events. 983 :param events: the new list of events. 984 """ 985 info = pd.DataFrame( 986 { 987 "slide_id": [event.tile.scan.slide_id for event in events], 988 "tile": [event.tile.n for event in events], 989 "roi": [event.tile.n_roi for event in events], 990 "x": [event.x for event in events], 991 "y": [event.y for event in events], 992 } 993 ) 994 metadata_list = [event.metadata for event in events] 995 # Iterate through and ensure that all metadata is the same shape 996 for metadata in metadata_list: 997 if type(metadata) != type(metadata_list[0]): 998 raise ValueError("All metadata must be the same type.") 999 if metadata is not None and metadata.shape != metadata_list[0].shape: 1000 raise ValueError("All metadata must be the same shape.") 1001 if metadata_list[0] is None: 1002 metadata = None 1003 else: 1004 metadata = pd.DataFrame(metadata_list) 1005 features_list = [event.features for event in events] 1006 # Iterate through and ensure that all features are the same shape 1007 for features in features_list: 1008 if type(features) != type(features_list[0]): 1009 raise ValueError("All features must be the same type.") 1010 if features is not None and features.shape != features_list[0].shape: 1011 raise ValueError("All features must be the same shape.") 1012 if features_list[0] is None: 1013 features = None 1014 else: 1015 features = pd.DataFrame(features_list) 1016 return EventArray(info=info, metadata=metadata, features=features)
Create an EventArray from a list of events.
Parameters
- events: the list of events to convert.
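A round-trip sketch between the array form and individual Event objects, reusing the hypothetical `events` array. With no Scan objects on hand, an empty list is passed so placeholder scans are created for each slide_id.

    # EventArray -> list[Event]; placeholder scans are created (ignore_missing_scans=True)
    event_list = events.to_events(scans=[])

    # list[Event] -> EventArray
    round_tripped = EventArray.from_events(event_list)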
1018 def to_dataframe(self) -> pd.DataFrame: 1019 """ 1020 Convert all the data in the EventArray to a single DataFrame. 1021 :return: a DataFrame with all the data in the EventArray. 1022 """ 1023 # Make a copy of the info DataFrame and prepend "info_" to the column names 1024 output = self.info.copy() 1025 # Combine with the metadata and prepend "metadata_" to the column names 1026 if self.metadata is not None: 1027 metadata = self.metadata.copy() 1028 metadata.columns = [f"metadata_{col}" for col in metadata.columns] 1029 output = pd.concat([output, metadata], axis=1) 1030 # Combine with the features and prepend "features_" to the column names 1031 if self.features is not None: 1032 features = self.features.copy() 1033 features.columns = [f"features_{col}" for col in features.columns] 1034 output = pd.concat([output, features], axis=1) 1035 return output
Convert all the data in the EventArray to a single DataFrame.
Returns
a DataFrame with all the data in the EventArray.
1037 @classmethod 1038 def from_dataframe( 1039 cls, df, metadata_prefix: str = "metadata_", features_prefix: str = "features_" 1040 ) -> Self: 1041 """ 1042 From a single, special DataFrame, create an EventArray. 1043 :param df: the DataFrame to convert to an EventArray. 1044 :param metadata_prefix: the prefix for metadata columns. 1045 :param features_prefix: the prefix for features columns. 1046 :return: a DataFrame with all the data in the EventArray. 1047 """ 1048 # Split the columns into info, metadata, and features and strip prefix 1049 info = df[[col for col in df.columns if col in cls.INFO_COLUMNS]].copy() 1050 if info.size == 0: 1051 info = None 1052 metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy() 1053 metadata.columns = [ 1054 col.replace(metadata_prefix, "") for col in metadata.columns 1055 ] 1056 if metadata.size == 0: 1057 metadata = None 1058 features = df[[col for col in df.columns if col.startswith("features_")]].copy() 1059 features.columns = [ 1060 col.replace(features_prefix, "") for col in features.columns 1061 ] 1062 if features.size == 0: 1063 features = None 1064 return cls(info=info, metadata=metadata, features=features)
Create an EventArray from a single combined DataFrame, such as one produced by to_dataframe().
Parameters
- df: the DataFrame to convert to an EventArray.
- metadata_prefix: the prefix for metadata columns.
- features_prefix: the prefix for features columns.
Returns
an EventArray with the data from the DataFrame.
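A sketch of the flat-DataFrame round trip with the hypothetical `events` array; metadata and feature columns carry the `metadata_`/`features_` prefixes in the flat form.

    flat = events.to_dataframe()
    # Columns: slide_id, tile, roi, x, y, metadata_cell_id, features_dapi_mean

    rebuilt = EventArray.from_dataframe(flat)
    print(rebuilt == events)  # expected True for this round trip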
1066 @classmethod 1067 def from_mask( 1068 cls, 1069 mask: np.ndarray, 1070 tile: Tile, 1071 include_cell_id: bool = True, 1072 images: list[np.ndarray] = None, 1073 image_labels: list[str] = None, 1074 properties: list[str] = None, 1075 ) -> Self: 1076 """ 1077 Extract events from a mask DataFrame, including metadata and features. 1078 :param mask: the mask to extract events from. 1079 :param tile: the Tile object associated with this mask. 1080 :param include_cell_id: whether to include the cell_id, or numerical 1081 mask label, as metadata in the EventArray. 1082 :param images: the intensity images to extract features from. 1083 :param image_labels: the labels for the intensity images. 1084 :param properties: list of properties to extract in addition to the defaults: 1085 :return: EventArray corresponding to the mask labels. 1086 """ 1087 if csi_images is None: 1088 raise ModuleNotFoundError( 1089 "imageio libraries not installed! " 1090 "run `pip install csi_images[imageio]` to resolve." 1091 ) 1092 # Gather mask_info 1093 if images is not None and image_labels is not None: 1094 if len(images) != len(image_labels): 1095 raise ValueError("Intensity images and labels must match lengths.") 1096 1097 mask_info = csi_images.extract_mask_info(mask, images, image_labels, properties) 1098 1099 if len(mask_info) == 0: 1100 return EventArray() 1101 1102 # Combine provided info and mask info 1103 info = pd.DataFrame( 1104 { 1105 "slide_id": tile.scan.slide_id, 1106 "tile": tile.n, 1107 "roi": tile.n_roi, 1108 "x": mask_info["x"], 1109 "y": mask_info["y"], 1110 }, 1111 ) 1112 # Extract a metadata column if desired 1113 if include_cell_id: 1114 metadata = pd.DataFrame({"cell_id": mask_info["id"]}) 1115 else: 1116 metadata = None 1117 # If any additional properties were extracted, add them as features 1118 mask_info = mask_info.drop(columns=["id", "x", "y"], errors="ignore") 1119 if len(mask_info.columns) > 0: 1120 features = mask_info 1121 features.columns = [col.lower() for col in features.columns] 1122 else: 1123 features = None 1124 return EventArray(info, metadata, features)
Extract events from a labeled mask image, including metadata and features.
Parameters
- mask: the mask to extract events from.
- tile: the Tile object associated with this mask.
- include_cell_id: whether to include the cell_id, or numerical mask label, as metadata in the EventArray.
- images: the intensity images to extract features from.
- image_labels: the labels for the intensity images.
- properties: list of properties to extract in addition to the defaults.
Returns
EventArray corresponding to the mask labels.
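A self-contained sketch of `from_mask`, which requires the optional csi_images submodule (`pip install csi_images[imageio]`). The placeholder scan and the Tile arguments here are assumptions made for illustration; substitute a real Scan and Tile from your pipeline.

    import numpy as np
    from csi_images.csi_scans import Scan
    from csi_images.csi_tiles import Tile
    from csi_images.csi_events import EventArray

    # A toy label mask with two objects; real masks come from your segmentation step
    mask = np.zeros((100, 100), dtype=np.uint16)
    mask[10:20, 10:20] = 1
    mask[60:80, 50:70] = 2

    scan = Scan.make_placeholder("SLIDE001", 0, 0)  # assumed placeholder arguments
    tile = Tile(scan, 0, 0)                         # assumed (scan, tile index, roi) arguments

    array = EventArray.from_mask(mask, tile)
    # array.metadata["cell_id"] holds each event's mask label (include_cell_id=True)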
1126 def save_csv(self, output_path: str) -> bool: 1127 """ 1128 Save the events to an CSV file, including metadata and features. 1129 :param output_path: 1130 :return: 1131 """ 1132 if not output_path.endswith(".csv"): 1133 output_path += ".csv" 1134 self.to_dataframe().to_csv(output_path, index=False) 1135 return os.path.exists(output_path)
Save the events to a CSV file, including metadata and features.
Parameters
- output_path: the path to save the CSV file to; ".csv" is appended if missing.
Returns
whether the output file exists after saving.
1137 @classmethod 1138 def load_csv( 1139 cls, 1140 input_path: str, 1141 metadata_prefix: str = "metadata_", 1142 features_prefix: str = "features_", 1143 ) -> Self: 1144 """ 1145 Load the events from an CSV file, including metadata and features. 1146 :param input_path: 1147 :param metadata_prefix: 1148 :param features_prefix: 1149 :return: 1150 """ 1151 # Load the CSV file 1152 df = pd.read_csv(input_path) 1153 return cls.from_dataframe(df, metadata_prefix, features_prefix)
Load the events from a CSV file, including metadata and features.
Parameters
- input_path: the path of the CSV file to load.
- metadata_prefix: the prefix marking metadata columns.
- features_prefix: the prefix marking feature columns.
Returns
an EventArray with the loaded data.
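A sketch of the CSV round trip with the hypothetical `events` array; note that CSV may not preserve every dtype exactly.

    ok = events.save_csv("events.csv")  # True if the file was written
    loaded = EventArray.load_csv("events.csv")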
1155 def save_json(self, output_path: str, orient: str = "records") -> bool: 1156 """ 1157 Save the events to a JSON file, including metadata and features. 1158 :param output_path: 1159 :param orient: the orientation of the JSON file, see pandas.DataFrame.to_json() 1160 :return: 1161 """ 1162 if not output_path.endswith(".json"): 1163 output_path += ".json" 1164 self.to_dataframe().to_json(output_path, orient=orient, indent=2) 1165 return os.path.exists(output_path)
Save the events to a JSON file, including metadata and features.
Parameters
- output_path: the path to save the JSON file to; ".json" is appended if missing.
- orient: the orientation of the JSON file, see pandas.DataFrame.to_json()
Returns
whether the output file exists after saving.
1167 @classmethod 1168 def load_json( 1169 cls, 1170 input_path: str, 1171 metadata_prefix: str = "metadata_", 1172 features_prefix: str = "features_", 1173 ) -> Self: 1174 """ 1175 Load the events from a JSON file, including metadata and features. 1176 :param input_path: 1177 :param metadata_prefix: 1178 :param features_prefix: 1179 :return: 1180 """ 1181 # Load the JSON file 1182 df = pd.read_json(input_path, orient="records") 1183 return cls.from_dataframe(df, metadata_prefix, features_prefix)
Load the events from a JSON file, including metadata and features.
Parameters
- input_path: the path of the JSON file to load.
- metadata_prefix: the prefix marking metadata columns.
- features_prefix: the prefix marking feature columns.
Returns
an EventArray with the loaded data.
1185 def save_hdf5( 1186 self, output_path: str, complevel: int = 1, complib="blosc:zstd" 1187 ) -> bool: 1188 """ 1189 Save the events to an HDF5 file, including metadata and features. 1190 Uses the pandas-provided HDF5 functions for ease, and external compatibility, 1191 though these files are slightly harder to view in HDFView or similar. 1192 Compression defaults remain very quick while cutting file size by 50%+. 1193 :param output_path: 1194 :param complevel: see pandas.HDFStore for more details. 1195 :param complib: see pandas.HDFStore for more details. 1196 :return: 1197 """ 1198 if not output_path.endswith(".hdf5") and not output_path.endswith(".h5"): 1199 output_path += ".hdf5" 1200 # Open the output_path as an HDF5 file 1201 with pd.HDFStore( 1202 output_path, mode="w", complevel=complevel, complib=complib 1203 ) as store: 1204 # Store the dataframes in the HDF5 file 1205 if self.info is not None: 1206 store.put("info", self.info, index=False) 1207 if self.metadata is not None: 1208 store.put("metadata", self.metadata, index=False) 1209 if self.features is not None: 1210 store.put("features", self.features, index=False) 1211 return os.path.exists(output_path)
Save the events to an HDF5 file, including metadata and features. Uses the pandas-provided HDF5 functions for ease, and external compatibility, though these files are slightly harder to view in HDFView or similar. Compression defaults remain very quick while cutting file size by 50%+.
Parameters
- output_path: the path to save the HDF5 file to; ".hdf5" is appended if no HDF5 extension is present.
- complevel: see pandas.HDFStore for more details.
- complib: see pandas.HDFStore for more details.
Returns
whether the output file exists after saving.
1213 @classmethod 1214 def load_hdf5(cls, input_path: str) -> Self: 1215 """ 1216 Load the events from an HDF5 file, including metadata and features. 1217 :param input_path: 1218 :return: 1219 """ 1220 # Open the input_path as an HDF5 file 1221 with pd.HDFStore(input_path, "r") as store: 1222 # Load the dataframes from the HDF5 file 1223 info = store.get("info") if "info" in store else None 1224 metadata = store.get("metadata") if "metadata" in store else None 1225 features = store.get("features") if "features" in store else None 1226 return cls(info=info, metadata=metadata, features=features)
Load the events from an HDF5 file, including metadata and features.
Parameters
- input_path: the path of the HDF5 file to load.
Returns
an EventArray with the loaded data.
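A sketch of the HDF5 round trip; pandas.HDFStore needs the PyTables package (`pip install tables`). Dtypes survive better than with CSV, which makes this a good choice for intermediate results.

    ok = events.save_hdf5("events.hdf5")
    loaded = EventArray.load_hdf5("events.hdf5")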
1228 def save_ocular(self, output_path: str, event_type: str = "cells"): 1229 """ 1230 Save the events to an OCULAR file. Relies on the dataframe originating 1231 from an OCULAR file (same columns; duplicate metadata/info). 1232 :param output_path: 1233 :param event_type: 1234 :return: 1235 """ 1236 if pyreadr is None: 1237 raise ModuleNotFoundError( 1238 "pyreadr not installed! Install pyreadr directly " 1239 "or run `pip install csi-images[rds]` option to resolve." 1240 ) 1241 if event_type == "cells": 1242 file_stub = "rc-final" 1243 elif event_type == "others": 1244 file_stub = "others-final" 1245 else: 1246 raise ValueError("Invalid event type. Must be cells or others.") 1247 1248 # Ensure good metadata 1249 metadata = pd.DataFrame( 1250 { 1251 "slide_id": self.info["slide_id"], 1252 "frame_id": self.info["tile"] + 1, # Convert to 1-indexed for R 1253 "cell_id": ( 1254 self.metadata["cell_id"] 1255 if "cell_id" in self.metadata.columns 1256 else range(len(self.info)) 1257 ), 1258 "cellx": self.info["x"], 1259 "celly": self.info["y"], 1260 } 1261 ) 1262 if self.metadata is not None: 1263 metadata[self.metadata.columns] = self.metadata.copy() 1264 1265 # Check for the "ocular_interesting" column 1266 if event_type == "cells": 1267 if "ocular_interesting" in metadata.columns: 1268 interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool) 1269 elif "hcpc" in metadata.columns: 1270 # Interesting cells don't get an hcpc designation, leaving them as -1 1271 interesting_rows = ( 1272 metadata["hcpc"].to_numpy() == -1 1273 ) # interesting cells 1274 else: 1275 interesting_rows = [] 1276 if sum(interesting_rows) > 0: 1277 # Split the metadata into interesting and regular 1278 interesting_events = self.rows(interesting_rows) 1279 interesting_df = pd.concat( 1280 [interesting_events.features, interesting_events.metadata], axis=1 1281 ) 1282 data_events = self.rows(~interesting_rows) 1283 data_df = pd.concat( 1284 [data_events.features, data_events.metadata], axis=1 1285 ) 1286 data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore") 1287 1288 # Drop particular columns for "interesting" 1289 interesting_df = interesting_df.drop( 1290 [ 1291 "clust", 1292 "hcpc", 1293 "frame_id", 1294 "cell_id", 1295 "unique_id", 1296 "ocular_interesting", 1297 ], 1298 axis=1, 1299 errors="ignore", 1300 ) 1301 # Save both .csv and .rds 1302 interesting_stub = os.path.join(output_path, "ocular_interesting") 1303 interesting_df.to_csv(f"{interesting_stub}.csv") 1304 # Suppress pandas FutureWarning 1305 with warnings.catch_warnings(): 1306 warnings.simplefilter(action="ignore", category=FutureWarning) 1307 pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df) 1308 else: 1309 data_df = pd.concat([self.features, metadata], axis=1) 1310 else: 1311 # Get all data and reset_index (will copy it) 1312 data_df = pd.concat([self.features, metadata], axis=1) 1313 1314 # Split based on cluster number to conform to *-final[1-4].rds 1315 n_clusters = max(data_df["clust"]) + 1 1316 split_idx = [round(i * n_clusters / 4) for i in range(5)] 1317 for i in range(4): 1318 subset = (split_idx[i] <= data_df["clust"]) & ( 1319 data_df["clust"] < split_idx[i + 1] 1320 ) 1321 data_df.loc[subset, "hcpc"] = i + 1 1322 subset = data_df[subset].reset_index(drop=True) 1323 # Suppress pandas FutureWarning 1324 with warnings.catch_warnings(): 1325 warnings.simplefilter(action="ignore", category=FutureWarning) 1326 pyreadr.write_rds( 1327 os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset 1328 ) 1329 1330 # 
Create new example cell strings 1331 data_df["example_cell_id"] = ( 1332 data_df["slide_id"] 1333 + " " 1334 + data_df["frame_id"].astype(str) 1335 + " " 1336 + data_df["cell_id"].astype(str) 1337 + " " 1338 + data_df["cellx"].astype(int).astype(str) 1339 + " " 1340 + data_df["celly"].astype(int).astype(str) 1341 ) 1342 # Find averagable data columns 1343 if "cellcluster_id" in data_df.columns: 1344 end_idx = data_df.columns.get_loc("cellcluster_id") 1345 else: 1346 end_idx = data_df.columns.get_loc("slide_id") 1347 avg_cols = data_df.columns[:end_idx].tolist() 1348 # Group by cluster and average 1349 data_df = data_df.groupby("clust").agg( 1350 **{col: (col, "mean") for col in avg_cols}, 1351 count=("clust", "size"), # count rows in each cluster 1352 example_cells=("example_cell_id", lambda x: ",".join(x)), 1353 hcpc=("hcpc", lambda x: x.iloc[0]), 1354 ) 1355 data_df = data_df.reset_index() # Do NOT drop, index is "clust" 1356 # Create new columns 1357 metadata = pd.DataFrame( 1358 { 1359 "count": data_df["count"], 1360 "example_cells": data_df["example_cells"], 1361 "clust": data_df["clust"].astype(int), 1362 "hcpc": data_df["hcpc"].astype(int), 1363 "id": data_df["clust"].astype(int).astype(str), 1364 "cccluster": "0", # Dummy value 1365 "ccdistance": 0.0, # Dummy value 1366 "rownum": list(range(len(data_df))), 1367 "framegroup": 0, # Dummy value 1368 } 1369 ) 1370 # Need to pad the features to 761 columns, as per OCULAR report needs 1371 additional_columns = range(len(avg_cols), 761) 1372 if len(additional_columns) > 0: 1373 padding = pd.DataFrame( 1374 np.zeros((len(data_df), len(additional_columns))), 1375 columns=[f"pad{i}" for i in additional_columns], 1376 ) 1377 data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1) 1378 else: 1379 data_df = pd.concat([data_df[avg_cols], metadata], axis=1) 1380 1381 # Save the cluster data 1382 data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv")) 1383 # Suppress pandas FutureWarning 1384 with warnings.catch_warnings(): 1385 warnings.simplefilter(action="ignore", category=FutureWarning) 1386 pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)
Save the events to OCULAR output files (.csv and .rds) in a directory. Relies on the DataFrame originating from an OCULAR run (same columns; duplicated metadata/info).
Parameters
- output_path: directory to write the OCULAR .csv and .rds files into.
- event_type: "cells" (writes rc-final*) or "others" (writes others-final*).
Returns
- None; files are written into output_path.
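A minimal usage sketch: the paths are hypothetical, the events are assumed to have been loaded from an OCULAR run (so the metadata carries the expected clust/hcpc columns), and the optional pyreadr dependency (`pip install csi-images[rds]`) must be installed.

    from csi_images.csi_events import EventArray

    # Hypothetical OCULAR directory for one slide
    events = EventArray.load_ocular("/path/to/slide/ocular")

    # ... filter or re-score events here ...

    # Writes rc-final[1-4].rds, rc-final.csv/.rds, and, when flagged,
    # ocular_interesting.csv/.rds into the output directory
    events.save_ocular("/path/to/output", event_type="cells")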
    @classmethod
    def load_ocular(
        cls,
        input_path: str,
        event_type="cells",
        cell_data_files=(
            "rc-final1.rds",
            "rc-final2.rds",
            "rc-final3.rds",
            "rc-final4.rds",
            "ocular_interesting.rds",
        ),
        others_data_files=(
            "others-final1.rds",
            "others-final2.rds",
            "others-final3.rds",
            "others-final4.rds",
        ),
        atlas_data_files=(
            "ocular_interesting.rds",
            "ocular_not_interesting.rds",
        ),
        drop_common_events=True,
    ) -> Self:
        """
        Load events from OCULAR output files into an EventArray.
        :param input_path: an OCULAR output directory, or a single .rds file within one.
        :param event_type: "cells" or "others".
        :param cell_data_files: .rds file names loaded when event_type is "cells".
        :param others_data_files: .rds file names loaded when event_type is "others".
        :param atlas_data_files: file names whose events carry catalogue classifications.
        :param drop_common_events: whether to drop events classified as "common_cell".
        :return: an EventArray containing the loaded events.
        """
        if pyreadr is None:
            raise ModuleNotFoundError(
                "pyreadr not installed! Install pyreadr directly "
                "or run `pip install csi-images[rds]` to resolve."
            )
        # Check if the input path is a single file or a directory
        if os.path.isfile(input_path):
            data_files = [os.path.basename(input_path)]
            input_path = os.path.dirname(input_path)
        elif event_type == "cells":
            data_files = cell_data_files
        elif event_type == "others":
            data_files = others_data_files
        else:
            raise ValueError("Invalid event type. Must be cells or others.")

        # Load the data from the OCULAR files
        file_data = {}
        for file in data_files:
            file_path = os.path.join(input_path, file)
            if not os.path.isfile(file_path):
                warnings.warn(f"{file} not found in {input_path}")
                continue
            file_data[file] = pyreadr.read_r(file_path)
            # Get the DataFrame associated with None (pyreadr dict quirk)
            file_data[file] = file_data[file][None]
            if len(file_data[file]) == 0:
                # File gets dropped from the dict
                file_data.pop(file)
                warnings.warn(f"{file} has no cells")
                continue

            # Drop common cells if requested and present in this file
            if (
                file in atlas_data_files
                and drop_common_events
                and "catalogue_classification" in file_data[file]
            ):
                common_cell_indices = (
                    file_data[file]["catalogue_classification"] == "common_cell"
                )
                file_data[file] = file_data[file][~common_cell_indices]

            if len(file_data[file]) == 0:
                # File gets dropped from the dict
                file_data.pop(file)
                warnings.warn(f"{file} has no cells after dropping common cells")
                continue

            # Extract frame_id and cell_id
            # DAPI- events already have frame_id and cell_id outside the rowname
            if event_type == "cells" and "frame_id" not in file_data[file].columns:
                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
                # Get "frame_id cell_id" from the rowname column and split it in two
                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
                if len(split_res.columns) != 2:
                    warnings.warn(
                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
                    )
                # Then assign it back to the dataframe
                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
            # Ensure frame_id and cell_id are integers
            file_data[file]["frame_id"] = file_data[file]["frame_id"].astype("int")
            file_data[file]["cell_id"] = file_data[file]["cell_id"].astype("int")
            # Reset indexes, since they can cause NaN values in concat
            file_data[file] = file_data[file].reset_index(drop=True)

        # Merge the data from all files
        if len(file_data) == 0:
            return EventArray()
        elif len(file_data) == 1:
            data = list(file_data.values())[0]
        else:
            data = pd.concat(file_data.values())

        # "Others" are missing the "slide_id"; insert it right before the "frame_id" column
        if event_type == "others" and "slide_id" not in data.columns:
            if os.path.basename(input_path) == "ocular":
                slide_id = os.path.basename(os.path.dirname(input_path))
            else:
                slide_id = "UNKNOWN"
            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)

        # Sort by ascending cell_id to keep the original order, which is in manual_df
        data = data.sort_values(by=["cell_id"], ascending=True)
        # Filter out duplicates by x & y
        data = data.assign(
            unique_id=data["slide_id"]
            + "_"
            + data["frame_id"].astype(str)
            + "_"
            + data["cellx"].astype(int).astype(str)
            + "_"
            + data["celly"].astype(int).astype(str)
        )
        data = data.drop_duplicates(subset=["unique_id"], keep="first")
        # The normal unique_id uses cell_id
        data = data.assign(
            unique_id=data["slide_id"]
            + "_"
            + data["frame_id"].astype(str)
            + "_"
            + data["cell_id"].astype(str)
        )
        data = data.reset_index(drop=True)
        # All columns up to "slide_id" are features; drop the "slide_id"
        features = data.loc[:, :"slide_id"].iloc[:, :-1]
        data = data.loc[:, "slide_id":]
        # Grab the info columns
        info = data[["slide_id", "frame_id", "cellx", "celly"]]
        info.columns = ["slide_id", "tile", "x", "y"]
        info = info.assign(roi=0)  # OCULAR only works on 1 ROI, as far as known
        info = info[["slide_id", "tile", "roi", "x", "y"]]
        # Metadata keeps duplicate columns for later convenience
        metadata = data
        # Certain columns tend to be problematic with mixed data formats...
        for col in ["TRITC", "CY5", "FITC"]:
            if col in metadata:
                labels = {
                    "False": False,
                    "True": True,
                    "FALSE": False,
                    "TRUE": True,
                    False: False,
                    True: True,
                }
                metadata[col] = metadata[col].map(labels).astype(bool)
        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
            if col in metadata:
                metadata[col] = metadata[col].fillna(-1).astype(int)
        info["tile"] = info["tile"] - 1  # Convert to 0-based indexing
        return EventArray(info, metadata, features)
Parameters
- input_path: an OCULAR output directory, or a single .rds file within one.
- event_type: "cells" (loads cell_data_files) or "others" (loads others_data_files).
- cell_data_files: .rds file names loaded when event_type is "cells".
- others_data_files: .rds file names loaded when event_type is "others".
- atlas_data_files: file names whose events carry catalogue classifications.
- drop_common_events: whether to drop events classified as "common_cell" in the atlas files.
Returns
- An EventArray containing the loaded events.
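A minimal usage sketch, assuming a typical OCULAR output directory (the path is hypothetical) and the optional pyreadr dependency:

    from csi_images.csi_events import EventArray

    # Load candidate cells and "others" from the same OCULAR run
    cells = EventArray.load_ocular("/path/to/slide/ocular", event_type="cells")
    others = EventArray.load_ocular("/path/to/slide/ocular", event_type="others")

    # info holds slide_id/tile/roi/x/y; metadata and features hold the
    # remaining OCULAR columns
    print(len(cells.info), list(cells.metadata.columns[:5]))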