csi_images.csi_events
Contains the Event class, which represents a single event in a scan. The Event class optionally holds metadata and features. Lists of events with similar metadata or features can be combined into DataFrames for analysis.
The Event class holds the position of the event in the frame, which can be converted to positions in the scanner or slide coordinate systems. See the csi_images.csi_scans documentation page for more information on the coordinate systems.
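For orientation, here is a minimal usage sketch. It uses a placeholder scan (via `Scan.make_placeholder`, as `EventArray.to_events` does internally when a scan is missing), so the reported position is nominal; `get_slide_position` additionally requires a scan from a supported scanner (Axioscan 7 or BZScanner):

```python
from csi_images.csi_scans import Scan
from csi_images.csi_tiles import Tile
from csi_images.csi_events import Event

# Placeholder scan for slide "EXAMPLE-SLIDE", tile 0, ROI 0 (illustrative values)
scan = Scan.make_placeholder("EXAMPLE-SLIDE", 0, 0)
tile = Tile(scan, 0, 0)
event = Event(scan, tile, x=100, y=200)

print(event)                      # EXAMPLE-SLIDE-0-100-200
print(event.get_scan_position())  # (x_um, y_um) in the scanner frame
```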
1""" 2Contains the Event class, which represents a single event in a scan. 3The Event class optionally holds metadata and features. Lists of events with 4similar metadata or features can be combined into DataFrames for analysis. 5 6The Event class holds the position of the event in the frame, which can be converted 7to the position in the scanner or slide coordinate positions. See the 8csi_utils.csi_scans documentation page for more information on the coordinate systems. 9""" 10 11import os 12import math 13import warnings 14from typing import Self 15 16import numpy as np 17import pandas as pd 18 19from .csi_scans import Scan 20from .csi_tiles import Tile 21from .csi_frames import Frame 22from .csi_images import extract_mask_info 23 24# Optional dependencies; will raise errors in particular functions if not installed 25try: 26 import pyreadr 27except ImportError: 28 pyreadr = None 29 30 31class Event: 32 """ 33 A class that represents a single event in a scan, making it easy to evaluate 34 singular events. Required metadata is exposed as attributes, and optional 35 metadata and features are stored as DataFrames. 36 """ 37 38 SCAN_TO_SLIDE_TRANSFORM = { 39 # Axioscan zero is in the top-right corner instead of top-left 40 Scan.Type.AXIOSCAN7: np.array( 41 [ 42 [1, 0, 75000], 43 [0, 1, 0], 44 [0, 0, 1], 45 ] 46 ), 47 # BZScanner coordinates are a special kind of messed up: 48 # - The slide is upside-down. 49 # - The slide is oriented vertically, with the barcode at the bottom. 50 # - Tiles are numbered from the top-right 51 Scan.Type.BZSCANNER: np.array( 52 [ 53 [0, -1, 75000], 54 [-1, 0, 25000], 55 [0, 0, 1], 56 ] 57 ), 58 } 59 """ 60 Homogeneous transformation matrices for converting between scanner and slide 61 coordinates. The matrices are 3x3, with the final column representing the 62 translation in micrometers (um). For more information, see 63 [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations). 64 65 Transformations are nominal, and accuracy is not guaranteed; this is due to 66 imperfections in slides and alignment in the scanners. Units are in micrometers. 67 """ 68 69 def __init__( 70 self, 71 scan: Scan, 72 tile: Tile, 73 x: int, 74 y: int, 75 size: int = 12, # End-to-end size in pixels 76 metadata: pd.Series = None, 77 features: pd.Series = None, 78 ): 79 self.scan = scan 80 self.tile = tile 81 self.x = x 82 self.y = y 83 self.size = size 84 self.metadata = metadata 85 self.features = features 86 87 def __repr__(self) -> str: 88 return f"{self.scan.slide_id}-{self.tile.n}-{self.x}-{self.y}" 89 90 def __eq__(self, other) -> bool: 91 return self.__repr__() == other.__repr__() 92 93 def __lt__(self, other): 94 return self.__repr__() < other.__repr__() 95 96 def get_scan_position(self) -> tuple[float, float]: 97 """ 98 Get the position of the event in the scanner's coordinate frame. 99 :return: the scan position of the event in micrometers (um). 100 """ 101 # Get overall pixel position 102 pixel_x = self.x + (self.scan.tile_width_px * self.tile.x) 103 pixel_y = self.y + (self.scan.tile_height_px * self.tile.y) 104 # Convert to micrometers 105 x_um = pixel_x * self.scan.pixel_size_um 106 y_um = pixel_y * self.scan.pixel_size_um 107 # Add the scan's origin in the scanner frame 108 x_um += self.scan.roi[self.tile.n_roi].origin_x_um 109 y_um += self.scan.roi[self.tile.n_roi].origin_y_um 110 return x_um, y_um 111 112 def get_slide_position(self) -> tuple[float, float]: 113 """ 114 Get the slide position of the event in micrometers (um). 

    def crop_images(
        self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True
    ) -> list[np.ndarray]:
        """
        Get the event crops from pre-loaded frame images. This does not read
        anything from file, so it is very quick when cropping multiple events
        from the same tile. Use this if you're interested in many events.
        :param images: the frame images.
        :param crop_size: the square size of the image crop to get for this event.
        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
        :return: crop_size x crop_size crops of the event in the provided frames. If
        the event is too close to the edge, the crop is zero-padded (black) on the
        sides that fall outside the frame, keeping the event centered.
        """
        # Convert a crop size in micrometers to pixels
        if not in_pixels:
            crop_size = round(crop_size / self.scan.pixel_size_um)
        # Find the crop bounds
        bounds = [
            self.x - crop_size // 2,
            self.y - crop_size // 2,
            self.x + math.ceil(crop_size / 2),
            self.y + math.ceil(crop_size / 2),
        ]
        # Determine how much the bounds violate the image size
        displacements = [
            max(0, -bounds[0]),
            max(0, -bounds[1]),
            max(0, bounds[2] - images[0].shape[1]),
            max(0, bounds[3] - images[0].shape[0]),
        ]
        # Cap off the bounds
        bounds = [
            max(0, bounds[0]),
            max(0, bounds[1]),
            min(images[0].shape[1], bounds[2]),
            min(images[0].shape[0], bounds[3]),
        ]

        # Crop the images
        cropped_images = []
        for image in images:
            # Create a blank image of the right size
            cropped_image = np.zeros((crop_size, crop_size), dtype=image.dtype)

            # Insert the cropped image into the blank image, leaving a black buffer
            # around the edges if the crop would go beyond the original image bounds
            cropped_image[
                displacements[1] : crop_size - displacements[3],
                displacements[0] : crop_size - displacements[2],
            ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]]
            cropped_images.append(cropped_image)
        return cropped_images

    def extract_images(
        self, crop_size: int = 100, in_pixels: bool = True
    ) -> list[np.ndarray]:
        """
        Extract the images from the scan and tile, reading from the file. Called
        "extract" because it must read and extract the images from file, which is slow.
        Use this if you're interested in only a few events, as it is inefficient when
        reading multiple events from the same tile.
        :param crop_size: the square size of the image crop to get for this event.
        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
        :return: a list of cropped images from the scan in the order of the channels.
        """
        frames = Frame.get_frames(self.tile)
        images = [frame.get_image() for frame in frames]
        return self.crop_images(images, crop_size, in_pixels)
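
    # --- Illustrative example (not part of the module) ---
    # For a single event, read the tile's frames and crop in one call (slow):
    #     crops = event.extract_images(crop_size=50)
    # For many events in one tile, read the frames once and crop repeatedly:
    #     images = [frame.get_image() for frame in Frame.get_frames(tile)]
    #     crops = [e.crop_images(images, crop_size=50) for e in tile_events]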
196 """ 197 frames = Frame.get_frames(self.tile) 198 images = [frame.get_image() for frame in frames] 199 return self.crop_images(images, crop_size, in_pixels) 200 201 @classmethod 202 def extract_images_for_list( 203 cls, 204 events: list[Self], 205 crop_size: int | list[int] = None, 206 in_pixels: bool = True, 207 ) -> list[list[np.ndarray]]: 208 """ 209 Get the images for a list of events, ensuring that there is no wasteful reading 210 of the same tile multiple times. This function is more efficient than calling 211 extract_event_images for each event. 212 TODO: test this function 213 :param events: the events to extract images for. 214 :param crop_size: the square size of the image crop to get for this event. 215 Defaults to four times the size of the event. 216 :param in_pixels: whether the crop size is in pixels or micrometers. 217 Defaults to pixels, and is ignored if crop_size is None. 218 :return: a list of lists of cropped images for each event. 219 """ 220 if len(events) == 0: 221 return [] 222 223 # Populate a crop size if none provided 224 if crop_size is None: 225 crop_size = [4 * event.size for event in events] 226 in_pixels = True 227 # Propagate a constant crop size 228 elif isinstance(crop_size, int): 229 crop_size = [crop_size] * len(events) 230 231 # Sort the events by tile; use a shallow copy to avoid modifying the original 232 order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__())) 233 234 # Allocate the list to size 235 images = [None] * len(events) 236 last_tile = None 237 frame_images = None # Holds large numpy arrays, so expensive to compare 238 # Iterate through in sorted order 239 for i in order: 240 if last_tile != events[i].tile: 241 # Gather the frame images, preserving them for the next event 242 frames = Frame.get_frames(events[i].tile) 243 frame_images = [frame.get_image() for frame in frames] 244 245 last_tile = events[i].tile 246 # Use the frame images to crop the event images 247 # Preserve the original order using order[i] 248 images[i] = events[i].crop_images(frame_images, crop_size[i], in_pixels) 249 return images 250 251 252class EventArray: 253 """ 254 A class that holds a large number of events' data, making it easy to analyze and 255 manipulate many events at once. A more separated version of the Event class. 
256 """ 257 258 INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y", "size"] 259 260 def __init__( 261 self, 262 info: pd.DataFrame = None, 263 metadata: pd.DataFrame = None, 264 features: pd.DataFrame = None, 265 ): 266 # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y", "size" 267 if info is not None: 268 if list(info.columns) != self.INFO_COLUMNS: 269 raise ValueError( 270 "EventArray.info must have columns 'slide_id', 'tile', 'roi', 'x', 'y', 'size'" 271 ) 272 # Copy first to avoid modifying the original 273 info = info.copy() 274 # Ensure that the columns are the right types 275 info["slide_id"] = info["slide_id"].astype(str) 276 info["tile"] = info["tile"].astype(np.uint16) 277 info["roi"] = info["roi"].astype(np.uint8) 278 info["x"] = info["x"].round().astype(np.uint16) 279 info["y"] = info["y"].round().astype(np.uint16) 280 info["size"] = info["size"].round().astype(np.uint16) 281 # All DataFrames must all have the same number of rows 282 if metadata is not None and (info is None or len(info) != len(metadata)): 283 raise ValueError( 284 "If EventArray.metadata is not None, it should match rows with .info" 285 ) 286 if features is not None and (info is None or len(info) != len(features)): 287 raise ValueError( 288 "If EventArray.features is not None, it should match rows with .info" 289 ) 290 self.info = info 291 self.metadata = metadata 292 self.features = features 293 294 def __len__(self) -> int: 295 # Convenience method to get the number of events 296 if self.info is None: 297 return 0 298 else: 299 return len(self.info) 300 301 def __eq__(self, other): 302 is_equal = True 303 # Parse all possibilities for info 304 if isinstance(self.info, pd.DataFrame): 305 if isinstance(other.info, pd.DataFrame): 306 is_equal = self.info.equals(other.info) 307 if not is_equal: 308 return False 309 else: 310 return False 311 elif self.info is None: 312 if other.info is not None: 313 return False 314 315 # Parse all possibilities for metadata 316 if isinstance(self.metadata, pd.DataFrame): 317 if isinstance(other.metadata, pd.DataFrame): 318 is_equal = self.metadata.equals(other.metadata) 319 if not is_equal: 320 return False 321 else: 322 return False 323 elif self.metadata is None: 324 if other.metadata is not None: 325 return False 326 327 # Parse all possibilities for features 328 if isinstance(self.features, pd.DataFrame): 329 if isinstance(other.features, pd.DataFrame): 330 is_equal = self.features.equals(other.features) 331 if not is_equal: 332 return False 333 else: 334 return False 335 elif self.features is None: 336 if other.features is not None: 337 return False 338 339 return is_equal 340 341 def get_sort_order(self, by: str | list[str], ascending: bool | list[bool] = True): 342 """ 343 Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames. 344 :param by: name of the column(s) to sort by. 345 :param ascending: whether to sort in ascending order; can be a list to match by 346 :return: the order of the indices to sort by. 347 """ 348 columns = self.get(by) 349 return columns.sort_values(by=by, ascending=ascending).index 350 351 def sort(self, by: str | list[str], ascending: bool | list[bool] = True) -> Self: 352 """ 353 Sort the EventArray by column(s) in the info, metadata, or features DataFrames. 354 :param by: name of the column(s) to sort by. 355 :param ascending: whether to sort in ascending order; can be a list to match by 356 :return: a new, sorted EventArray. 
357 """ 358 order = self.get_sort_order(by, ascending) 359 info = self.info.loc[order].reset_index(drop=True) 360 if self.metadata is not None: 361 metadata = self.metadata.loc[order].reset_index(drop=True) 362 else: 363 metadata = None 364 if self.features is not None: 365 features = self.features.loc[order].reset_index(drop=True) 366 else: 367 features = None 368 return EventArray(info, metadata, features) 369 370 def get(self, column_names: int | str | list[int] | list[str]) -> pd.DataFrame: 371 """ 372 Get a DataFrame with the specified columns from the EventArray, by value. 373 :param column_names: the names of the columns to get. 374 :return: a DataFrame with the specified columns. 375 """ 376 if isinstance(column_names, int) or isinstance(column_names, str): 377 column_names = [column_names] 378 columns = [] 379 for column_name in column_names: 380 if column_name in self.info.columns: 381 columns.append(self.info[column_name]) 382 elif self.metadata is not None and column_name in self.metadata.columns: 383 columns.append(self.metadata[column_name]) 384 elif self.features is not None and column_name in self.features.columns: 385 columns.append(self.features[column_name]) 386 else: 387 raise ValueError(f"Column {column_name} not found in EventArray") 388 return pd.concat(columns, axis=1) 389 390 def rows(self, rows) -> Self: 391 """ 392 Get a subset of the EventArray rows based on a boolean or integer index, by value. 393 :param rows: the indices to get as a 1D boolean/integer list/array/series 394 :return: a new EventArray with the subset of events. 395 """ 396 info = self.info.loc[rows].reset_index(drop=True) 397 if self.metadata is not None: 398 metadata = self.metadata.loc[rows].reset_index(drop=True) 399 else: 400 metadata = None 401 if self.features is not None: 402 features = self.features.loc[rows].reset_index(drop=True) 403 else: 404 features = None 405 return EventArray(info, metadata, features) 406 407 def copy(self) -> Self: 408 """ 409 Create a deep copy of the EventArray. 410 :return: a deep copy of the EventArray. 411 """ 412 return EventArray( 413 info=self.info.copy(), 414 metadata=None if self.metadata is None else self.metadata.copy(), 415 features=None if self.features is None else self.features.copy(), 416 ) 417 418 def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None: 419 """ 420 Add metadata to the EventArray. Removes the need to check if metadata is None. 421 Overwrites any existing metadata with the same column names as the new metadata. 422 :param new_metadata: the metadata to add. 423 """ 424 if len(self) != len(new_metadata): 425 raise ValueError("New metadata must match length of existing info") 426 427 if self.metadata is None: 428 self.metadata = new_metadata 429 else: 430 if isinstance(new_metadata, pd.Series): 431 self.metadata[new_metadata.name] = new_metadata 432 else: 433 # It's a DataFrame 434 self.metadata[new_metadata.columns] = new_metadata 435 436 def add_features(self, new_features: pd.Series | pd.DataFrame) -> None: 437 """ 438 Add features to the EventArray. Removes the need to check if features is None. 439 Overwrites any existing features with the same column names as the new features. 440 :param new_features: the features to add. 
441 """ 442 if len(self) != len(new_features): 443 raise ValueError("New features must match length of existing info") 444 445 if self.features is None: 446 self.features = new_features 447 else: 448 if isinstance(new_features, pd.Series): 449 self.features[new_features.name] = new_features 450 else: 451 # It's a DataFrame 452 self.features[new_features.columns] = new_features 453 454 @classmethod 455 def merge(cls, events: list[Self]) -> Self: 456 """ 457 Combine EventArrays in a list into a single EventArray. 458 :param events: the new list of events. 459 """ 460 all_info = [] 461 all_metadata = [] 462 all_features = [] 463 for event_array in events: 464 # Skip empty EventArrays 465 if event_array.info is not None: 466 all_info.append(event_array.info) 467 if event_array.metadata is not None: 468 all_metadata.append(event_array.metadata) 469 if event_array.features is not None: 470 all_features.append(event_array.features) 471 if len(all_info) == 0: 472 return EventArray() 473 else: 474 all_info = pd.concat(all_info, ignore_index=True) 475 if len(all_metadata) == 0: 476 all_metadata = None 477 else: 478 all_metadata = pd.concat(all_metadata, ignore_index=True) 479 if len(all_features) == 0: 480 all_features = None 481 else: 482 all_features = pd.concat(all_features, ignore_index=True) 483 484 return EventArray(all_info, all_metadata, all_features) 485 486 def to_events( 487 self, 488 scans: Scan | list[Scan], 489 ignore_missing_scans=True, 490 ignore_metadata=False, 491 ignore_features=False, 492 ) -> list[Event]: 493 """ 494 Get the events in the EventArray as a list of events. 495 :param scans: the scans that the events belong to, auto-matched by slide_id. 496 Pass None if you don't care about scan metadata (pass ignore_missing_scans). 497 :param ignore_missing_scans: whether to create blank scans for events without scans. 498 :param ignore_metadata: whether to ignore metadata or not 499 :param ignore_features: whether to ignore features or not 500 :return: 501 """ 502 if isinstance(scans, Scan): 503 scans = [scans] * len(self.info) 504 events = [] 505 for i in range(len(self.info)): 506 # Determine the associated scan 507 scan = None 508 for s in scans: 509 if s.slide_id == self.info["slide_id"][i]: 510 scan = s 511 break 512 if scan is None: 513 if ignore_missing_scans: 514 # Create a placeholder scan if the scan is missing 515 scan = Scan.make_placeholder( 516 self.info["slide_id"][i], 517 self.info["tile"][i], 518 self.info["roi"][i], 519 ) 520 else: 521 raise ValueError( 522 f"Scan {self.info['slide_id'][i]} not found for event {i}." 

    @classmethod
    def from_events(cls, events: list[Event]) -> Self:
        """
        Create a new EventArray from a list of events.
        :param events: the list of events to convert.
        """
        # Return an empty array if we were passed nothing
        if events is None or len(events) == 0:
            return EventArray()
        # Otherwise, grab the info
        info = pd.DataFrame(
            {
                "slide_id": [event.scan.slide_id for event in events],
                "tile": [event.tile.n for event in events],
                "roi": [event.tile.n_roi for event in events],
                "x": [event.x for event in events],
                "y": [event.y for event in events],
                "size": [event.size for event in events],
            }
        )
        metadata_list = [event.metadata for event in events]
        # Iterate through and ensure that all metadata is the same shape
        for metadata in metadata_list:
            if type(metadata) != type(metadata_list[0]):
                raise ValueError("All metadata must be the same type.")
            if metadata is not None and metadata.shape != metadata_list[0].shape:
                raise ValueError("All metadata must be the same shape.")
        if metadata_list[0] is None:
            metadata = None
        else:
            metadata = pd.DataFrame(metadata_list)
        features_list = [event.features for event in events]
        # Iterate through and ensure that all features are the same shape
        for features in features_list:
            if type(features) != type(features_list[0]):
                raise ValueError("All features must be the same type.")
            if features is not None and features.shape != features_list[0].shape:
                raise ValueError("All features must be the same shape.")
        if features_list[0] is None:
            features = None
        else:
            features = pd.DataFrame(features_list)
        return EventArray(info=info, metadata=metadata, features=features)

    def to_dataframe(self) -> pd.DataFrame:
        """
        Convert all the data in the EventArray to a single DataFrame.
        :return: a DataFrame with all the data in the EventArray.
        """
        # Make a copy of the info DataFrame and prepend "info_" to the column names
        output = self.info.copy()
        output.columns = [f"info_{col}" for col in output.columns]
        # Combine with the metadata and prepend "metadata_" to the column names
        if self.metadata is not None:
            metadata = self.metadata.copy()
            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
            output = pd.concat([output, metadata], axis=1)
        # Combine with the features and prepend "features_" to the column names
        if self.features is not None:
            features = self.features.copy()
            features.columns = [f"features_{col}" for col in features.columns]
            output = pd.concat([output, features], axis=1)
        return output

    @classmethod
    def from_dataframe(cls, df) -> Self:
        """
        Create an EventArray from a single, specially formatted DataFrame,
        as produced by to_dataframe().
        :return: an EventArray with the data from the DataFrame.
        """
        # Split the columns into info, metadata, and features and strip prefix
        info = df[[col for col in df.columns if col.startswith("info_")]].copy()
        info.columns = [col.replace("info_", "") for col in info.columns]
        if info.size == 0:
            info = None
        metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy()
        metadata.columns = [col.replace("metadata_", "") for col in metadata.columns]
        if metadata.size == 0:
            metadata = None
        features = df[[col for col in df.columns if col.startswith("features_")]].copy()
        features.columns = [col.replace("features_", "") for col in features.columns]
        if features.size == 0:
            features = None
        return cls(info=info, metadata=metadata, features=features)
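
    # --- Illustrative example (not part of the module) ---
    # Round-trip through a single DataFrame with "info_"/"metadata_"/
    # "features_" column prefixes:
    #     df = event_array.to_dataframe()
    #     same_array = EventArray.from_dataframe(df)
    #     assert same_array == event_array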
603 """ 604 # Make a copy of the info DataFrame and prepend "info_" to the column names 605 output = self.info.copy() 606 output.columns = [f"info_{col}" for col in output.columns] 607 # Combine with the metadata and prepend "metadata_" to the column names 608 if self.metadata is not None: 609 metadata = self.metadata.copy() 610 metadata.columns = [f"metadata_{col}" for col in metadata.columns] 611 output = pd.concat([output, metadata], axis=1) 612 # Combine with the features and prepend "features_" to the column names 613 if self.features is not None: 614 features = self.features.copy() 615 features.columns = [f"features_{col}" for col in features.columns] 616 output = pd.concat([output, features], axis=1) 617 return output 618 619 @classmethod 620 def from_dataframe(cls, df) -> Self: 621 """ 622 From a single, special DataFrame, create an EventArray. 623 :return: a DataFrame with all the data in the EventArray. 624 """ 625 # Split the columns into info, metadata, and features and strip prefix 626 info = df[[col for col in df.columns if col.startswith("info_")]].copy() 627 info.columns = [col.replace("info_", "") for col in info.columns] 628 if info.size == 0: 629 info = None 630 metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy() 631 metadata.columns = [col.replace("metadata_", "") for col in metadata.columns] 632 if metadata.size == 0: 633 metadata = None 634 features = df[[col for col in df.columns if col.startswith("features_")]].copy() 635 features.columns = [col.replace("features_", "") for col in features.columns] 636 if features.size == 0: 637 features = None 638 return cls(info=info, metadata=metadata, features=features) 639 640 @classmethod 641 def from_mask( 642 cls, 643 mask: np.ndarray, 644 slide_id: str, 645 tile_n: int, 646 n_roi: int = 0, 647 include_cell_id: bool = True, 648 images: list[np.ndarray] = None, 649 image_labels: list[str] = None, 650 properties: list[str] = None, 651 ) -> Self: 652 """ 653 Extract events from a mask DataFrame, including metadata and features. 654 :param mask: the mask to extract events from. 655 :param slide_id: the slide ID the mask is from. 656 :param tile_n: the tile number the mask is from. 657 :param n_roi: the ROI number the mask is from. 658 :param include_cell_id: whether to include the cell_id, or numerical 659 mask label, as metadata in the EventArray. 660 :param images: the intensity images to extract features from. 661 :param image_labels: the labels for the intensity images. 662 :param properties: list of properties to extract in addition to the defaults: 663 :return: EventArray corresponding to the mask labels. 
664 """ 665 # Gather mask_info 666 if images is not None and image_labels is not None: 667 if len(images) != len(image_labels): 668 raise ValueError("Intensity images and labels must match lengths.") 669 670 mask_info = extract_mask_info(mask, images, image_labels, properties) 671 672 if len(mask_info) == 0: 673 return EventArray() 674 675 # Combine provided info and mask info 676 info = pd.DataFrame( 677 { 678 "slide_id": slide_id, 679 "tile": tile_n, 680 "roi": n_roi, 681 "x": mask_info["x"], 682 "y": mask_info["y"], 683 "size": mask_info["size"], 684 }, 685 ) 686 # Extract a metadata column if desired 687 if include_cell_id: 688 metadata = pd.DataFrame({"cell_id": mask_info["id"]}) 689 else: 690 metadata = None 691 # If any additional properties were extracted, add them as features 692 mask_info = mask_info.drop(columns=["id", "x", "y", "size"], errors="ignore") 693 if len(mask_info.columns) > 0: 694 features = mask_info 695 else: 696 features = None 697 return EventArray(info, metadata, features) 698 699 def save_csv(self, output_path: str) -> bool: 700 """ 701 Save the events to an CSV file, including metadata and features. 702 :param output_path: 703 :return: 704 """ 705 self.to_dataframe().to_csv(output_path, index=False) 706 return os.path.exists(output_path) 707 708 @classmethod 709 def load_csv(cls, input_path: str) -> Self: 710 """ 711 Load the events from an CSV file, including metadata and features. 712 :param input_path: 713 :return: 714 """ 715 # Load the CSV file 716 df = pd.read_csv(input_path) 717 return cls.from_dataframe(df) 718 719 def save_hdf5(self, output_path: str) -> bool: 720 """ 721 Save the events to an HDF5 file, including metadata and features. 722 Uses the pandas-provided HDF5 functions for ease, and external compatibility, 723 though these files are slightly harder to view in HDFView or similar. 724 :param output_path: 725 :return: 726 """ 727 # Open the output_path as an HDF5 file 728 with pd.HDFStore(output_path) as store: 729 # Store the dataframes in the HDF5 file 730 if self.info is not None: 731 store.put("info", self.info, index=False) 732 if self.metadata is not None: 733 store.put("metadata", self.metadata, index=False) 734 if self.features is not None: 735 store.put("features", self.features, index=False) 736 return os.path.exists(output_path) 737 738 @classmethod 739 def load_hdf5(cls, input_path: str) -> Self: 740 """ 741 Load the events from an HDF5 file, including metadata and features. 742 :param input_path: 743 :return: 744 """ 745 # Open the input_path as an HDF5 file 746 with pd.HDFStore(input_path) as store: 747 # Load the dataframes from the HDF5 file 748 info = store.get("info") if "info" in store else None 749 metadata = store.get("metadata") if "metadata" in store else None 750 features = store.get("features") if "features" in store else None 751 return cls(info=info, metadata=metadata, features=features) 752 753 def save_ocular(self, output_path: str, event_type: str = "cells"): 754 """ 755 Save the events to an OCULAR file. Relies on the dataframe originating 756 from an OCULAR file (same columns; duplicate metadata/info). 757 :param output_path: 758 :param event_type: 759 :return: 760 """ 761 if pyreadr is None: 762 raise ModuleNotFoundError( 763 "pyreadr not installed. Install pyreadr directly " 764 "or install csi-images with [rds] option to resolve." 765 ) 766 if event_type == "cells": 767 file_stub = "rc-final" 768 elif event_type == "others": 769 file_stub = "others-final" 770 else: 771 raise ValueError("Invalid event type. 

    def save_ocular(self, output_path: str, event_type: str = "cells"):
        """
        Save the events to OCULAR files. Relies on the dataframe originating
        from an OCULAR file (same columns; duplicate metadata/info).
        :param output_path: the directory to save the OCULAR files to.
        :param event_type: "cells" or "others".
        """
        if pyreadr is None:
            raise ModuleNotFoundError(
                "pyreadr not installed. Install pyreadr directly "
                "or install csi-images with [rds] option to resolve."
            )
        if event_type == "cells":
            file_stub = "rc-final"
        elif event_type == "others":
            file_stub = "others-final"
        else:
            raise ValueError("Invalid event type. Must be cells or others.")

        # Ensure good metadata
        metadata = pd.DataFrame(
            {
                "slide_id": self.info["slide_id"],
                "frame_id": self.info["tile"],
                "cellx": self.info["x"],
                "celly": self.info["y"],
                "cell_id": (
                    self.metadata["cell_id"]
                    if self.metadata is not None and "cell_id" in self.metadata.columns
                    else range(len(self.info))
                ),
            }
        )
        if self.metadata is not None:
            metadata[self.metadata.columns] = self.metadata.copy()

        # Check for the "ocular_interesting" column
        if event_type == "cells":
            if "ocular_interesting" in metadata.columns:
                interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool)
            elif "hcpc" in metadata.columns:
                # Interesting cells don't get an hcpc designation, leaving them as -1
                interesting_rows = metadata["hcpc"].to_numpy() == -1  # interesting cells
            else:
                interesting_rows = []
            if sum(interesting_rows) > 0:
                # Split the metadata into interesting and regular
                interesting_events = self.rows(interesting_rows)
                interesting_df = pd.concat(
                    [interesting_events.features, interesting_events.metadata], axis=1
                )
                data_events = self.rows(~interesting_rows)
                data_df = pd.concat(
                    [data_events.features, data_events.metadata], axis=1
                )
                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")

                # Drop particular columns for "interesting"
                interesting_df = interesting_df.drop(
                    [
                        "clust",
                        "hcpc",
                        "frame_id",
                        "cell_id",
                        "unique_id",
                        "ocular_interesting",
                    ],
                    axis=1,
                    errors="ignore",
                )
                # Save both .csv and .rds; interesting events get their own stub
                interesting_stub = os.path.join(output_path, "ocular_interesting")
                interesting_df.to_csv(f"{interesting_stub}.csv")
                # Suppress pandas FutureWarning
                with warnings.catch_warnings():
                    warnings.simplefilter(action="ignore", category=FutureWarning)
                    pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df)
            else:
                data_df = pd.concat([self.features, metadata], axis=1)
        else:
            # Get all data and reset_index (will copy it)
            data_df = pd.concat([self.features, metadata], axis=1)

        # Split based on cluster number to conform to *-final[1-4].rds
        n_clusters = max(data_df["clust"]) + 1
        split_idx = [round(i * n_clusters / 4) for i in range(5)]
        for i in range(4):
            subset = (split_idx[i] <= data_df["clust"]) & (
                data_df["clust"] < split_idx[i + 1]
            )
            data_df.loc[subset, "hcpc"] = i + 1
            subset = data_df[subset].reset_index(drop=True)
            # Suppress pandas FutureWarning
            with warnings.catch_warnings():
                warnings.simplefilter(action="ignore", category=FutureWarning)
                pyreadr.write_rds(
                    os.path.join(output_path, f"{file_stub}{i + 1}.rds"), subset
                )

        # Create new example cell strings
        data_df["example_cell_id"] = (
            data_df["slide_id"]
            + " "
            + data_df["frame_id"].astype(str)
            + " "
            + data_df["cell_id"].astype(str)
            + " "
            + data_df["cellx"].astype(int).astype(str)
            + " "
            + data_df["celly"].astype(int).astype(str)
        )
        # Find averagable data columns
        if "cellcluster_id" in data_df.columns:
            end_idx = data_df.columns.get_loc("cellcluster_id")
        else:
            end_idx = data_df.columns.get_loc("slide_id")
        avg_cols = data_df.columns[:end_idx].tolist()
        # Group by cluster and average
        data_df = data_df.groupby("clust").agg(
            **{col: (col, "mean") for col in avg_cols},
            count=("clust", "size"),  # count rows in each cluster
            example_cells=("example_cell_id", lambda x: ",".join(x)),
            hcpc=("hcpc", lambda x: x.iloc[0]),
        )
        data_df = data_df.reset_index()  # Do NOT drop, index is "clust"
        # Create new columns
        metadata = pd.DataFrame(
            {
                "count": data_df["count"],
                "example_cells": data_df["example_cells"],
                "clust": data_df["clust"].astype(int),
                "hcpc": data_df["hcpc"].astype(int),
                "id": data_df["clust"].astype(int).astype(str),
                "cccluster": "0",  # Dummy value
                "ccdistance": 0.0,  # Dummy value
                "rownum": list(range(len(data_df))),
                "framegroup": 0,  # Dummy value
            }
        )
        # Need to pad the features to 761 columns, as per OCULAR report needs
        additional_columns = range(len(avg_cols), 761)
        if len(additional_columns) > 0:
            padding = pd.DataFrame(
                np.zeros((len(data_df), len(additional_columns))),
                columns=[f"pad{i}" for i in additional_columns],
            )
            data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1)
        else:
            data_df = pd.concat([data_df[avg_cols], metadata], axis=1)

        # Save the cluster data
        data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"))
        # Suppress pandas FutureWarning
        with warnings.catch_warnings():
            warnings.simplefilter(action="ignore", category=FutureWarning)
            pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)
",".join(x)), 878 hcpc=("hcpc", lambda x: x.iloc[0]), 879 ) 880 data_df = data_df.reset_index() # Do NOT drop, index is "clust" 881 # Create new columns 882 metadata = pd.DataFrame( 883 { 884 "count": data_df["count"], 885 "example_cells": data_df["example_cells"], 886 "clust": data_df["clust"].astype(int), 887 "hcpc": data_df["hcpc"].astype(int), 888 "id": data_df["clust"].astype(int).astype(str), 889 "cccluster": "0", # Dummy value 890 "ccdistance": 0.0, # Dummy value 891 "rownum": list(range(len(data_df))), 892 "framegroup": 0, # Dummy value 893 } 894 ) 895 # Need to pad the features to 761 columns, as per OCULAR report needs 896 additional_columns = range(len(avg_cols), 761) 897 if len(additional_columns) > 0: 898 padding = pd.DataFrame( 899 np.zeros((len(data_df), len(additional_columns))), 900 columns=[f"pad{i}" for i in additional_columns], 901 ) 902 data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1) 903 else: 904 data_df = pd.concat([data_df[avg_cols], metadata], axis=1) 905 906 # Save the cluster data 907 data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv")) 908 # Suppress pandas FutureWarning 909 with warnings.catch_warnings(): 910 warnings.simplefilter(action="ignore", category=FutureWarning) 911 pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df) 912 913 @classmethod 914 def load_ocular( 915 cls, 916 input_path: str, 917 event_type="cells", 918 cell_data_files=( 919 "rc-final1.rds", 920 "rc-final2.rds", 921 "rc-final3.rds", 922 "rc-final4.rds", 923 "ocular_interesting.rds", 924 ), 925 others_data_files=( 926 "others-final1.rds", 927 "others-final2.rds", 928 "others-final3.rds", 929 "others-final4.rds", 930 ), 931 atlas_data_files=( 932 "ocular_interesting.rds", 933 "ocular_not_interesting.rds", 934 ), 935 drop_common_events=True, 936 log=None, 937 ) -> Self: 938 """ 939 940 :param input_path: 941 :param event_type: 942 :param cell_data_files: 943 :param others_data_files: 944 :param atlas_data_files: 945 :param drop_common_events: 946 :param log: 947 :return: 948 """ 949 if pyreadr is None: 950 raise ModuleNotFoundError( 951 "pyreadr not installed. Install pyreadr directly " 952 "or install csi-images with [rds] option to resolve." 
        # Check if the input path is a single file or a directory
        if os.path.isfile(input_path):
            data_files = [os.path.basename(input_path)]
            input_path = os.path.dirname(input_path)
        elif event_type == "cells":
            data_files = cell_data_files
        elif event_type == "others":
            data_files = others_data_files
        else:
            raise ValueError("Invalid event type.")

        # Load the data from the OCULAR files
        file_data = {}
        for file in data_files:
            file_path = os.path.join(input_path, file)
            if not os.path.isfile(file_path):
                if log is not None:
                    log.warning(f"{file} not found in {input_path}")
                continue
            file_data[file] = pyreadr.read_r(file_path)
            # Get the DataFrame associated with None (pyreadr dict quirk)
            file_data[file] = file_data[file][None]
            if len(file_data[file]) == 0:
                # File gets dropped from the dict
                file_data.pop(file)
                if log is not None:
                    log.warning(f"{file} has no cells")
                continue

            if log is not None:
                log.debug(f"{file} has {len(file_data[file])} cells")

            # Drop common cells if requested and in this file
            if file in atlas_data_files and drop_common_events:
                common_cell_indices = (
                    file_data[file]["catalogue_classification"] == "common_cell"
                )
                if log is not None:
                    log.debug(
                        f"Dropping {int(common_cell_indices.sum())} "
                        f"common cells from {file}"
                    )
                file_data[file] = file_data[file][~common_cell_indices]

            if len(file_data[file]) == 0:
                # File gets dropped from the dict
                file_data.pop(file)
                if log is not None:
                    log.warning(f"{file} has no cells after dropping common cells")
                continue

            # Extract frame_id and cell_id
            # DAPI- events already have frame_id cell_id outside rowname
            if event_type == "cells" and "frame_id" not in file_data[file].columns:
                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
                # get frame_id cell_id from rownames column and split into two columns
                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
                if len(split_res.columns) != 2 and log is not None:
                    log.warning(
                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
                    )
                # then assign it back to the dataframe
                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
            # reset indexes since they can cause NaN values in concat
            file_data[file] = file_data[file].reset_index(drop=True)

        # Merge the data from all files
        if len(file_data) == 0:
            return EventArray()
        elif len(file_data) == 1:
            data = [file_data[file] for file in file_data.keys()][0]
        else:
            data = pd.concat(file_data.values())

        if log is not None:
            log.debug(f"Gathered a total of {len(data)} events")
        # Others is missing the "slide_id"; insert it right before the "frame_id" column
        if event_type == "others" and "slide_id" not in data.columns:
            if os.path.basename(input_path) == "ocular":
                slide_id = os.path.basename(os.path.dirname(input_path))
            else:
                slide_id = "UNKNOWN"
            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)

        # Sort according to ascending cell_id so that the original of any duplicates is kept
        data = data.sort_values(by=["cell_id"], ascending=True)
        # Filter out duplicates by x & y
        data = data.assign(
            unique_id=data["slide_id"]
            + "_"
            + data["frame_id"].astype(str)
            + "_"
            + data["cellx"].astype(int).astype(str)
            + "_"
            + data["celly"].astype(int).astype(str)
        )
        data = data.drop_duplicates(subset=["unique_id"], keep="first")
        # Normal unique_id is with cell_id
        data = data.assign(
            unique_id=data["slide_id"]
            + "_"
            + data["frame_id"].astype(str)
            + "_"
            + data["cell_id"].astype(str)
        )
        data = data.reset_index(drop=True)
        # All columns up to "slide_id" are features; drop the "slide_id"
        features = data.loc[:, :"slide_id"].iloc[:, :-1]
        data = data.loc[:, "slide_id":]
        # Grab the info columns
        info = data[["slide_id", "frame_id", "cellx", "celly"]]
        info.columns = ["slide_id", "tile", "x", "y"]
        info = info.assign(
            roi=0,  # OCULAR only works on 1 ROI, as far as is known
            size=25,  # Static, for later montaging
        )
        info = info[["slide_id", "tile", "roi", "x", "y", "size"]]
        # Metadata has duplicate columns for later convenience
        metadata = data
        # Certain columns tend to be problematic with mixed data formats...
        for col in ["TRITC", "CY5", "FITC"]:
            if col in metadata:
                labels = {
                    "False": False,
                    "True": True,
                    "FALSE": False,
                    "TRUE": True,
                }
                metadata[col] = metadata[col].map(labels).astype(bool)
        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
            if col in metadata:
                metadata[col] = metadata[col].fillna(-1).astype(int)
        return EventArray(info, metadata, features)
```
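As a usage sketch (the directory path is hypothetical; OCULAR output folders are conventionally named "ocular"):

```python
from csi_images.csi_events import EventArray

# Hypothetical paths; adjust to your OCULAR output layout
cells = EventArray.load_ocular("/path/to/slide/ocular", event_type="cells")
others = EventArray.load_ocular("/path/to/slide/ocular", event_type="others")
print(f"Loaded {len(cells)} cells and {len(others)} other events")
```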
32class Event: 33 """ 34 A class that represents a single event in a scan, making it easy to evaluate 35 singular events. Required metadata is exposed as attributes, and optional 36 metadata and features are stored as DataFrames. 37 """ 38 39 SCAN_TO_SLIDE_TRANSFORM = { 40 # Axioscan zero is in the top-right corner instead of top-left 41 Scan.Type.AXIOSCAN7: np.array( 42 [ 43 [1, 0, 75000], 44 [0, 1, 0], 45 [0, 0, 1], 46 ] 47 ), 48 # BZScanner coordinates are a special kind of messed up: 49 # - The slide is upside-down. 50 # - The slide is oriented vertically, with the barcode at the bottom. 51 # - Tiles are numbered from the top-right 52 Scan.Type.BZSCANNER: np.array( 53 [ 54 [0, -1, 75000], 55 [-1, 0, 25000], 56 [0, 0, 1], 57 ] 58 ), 59 } 60 """ 61 Homogeneous transformation matrices for converting between scanner and slide 62 coordinates. The matrices are 3x3, with the final column representing the 63 translation in micrometers (um). For more information, see 64 [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations). 65 66 Transformations are nominal, and accuracy is not guaranteed; this is due to 67 imperfections in slides and alignment in the scanners. Units are in micrometers. 68 """ 69 70 def __init__( 71 self, 72 scan: Scan, 73 tile: Tile, 74 x: int, 75 y: int, 76 size: int = 12, # End-to-end size in pixels 77 metadata: pd.Series = None, 78 features: pd.Series = None, 79 ): 80 self.scan = scan 81 self.tile = tile 82 self.x = x 83 self.y = y 84 self.size = size 85 self.metadata = metadata 86 self.features = features 87 88 def __repr__(self) -> str: 89 return f"{self.scan.slide_id}-{self.tile.n}-{self.x}-{self.y}" 90 91 def __eq__(self, other) -> bool: 92 return self.__repr__() == other.__repr__() 93 94 def __lt__(self, other): 95 return self.__repr__() < other.__repr__() 96 97 def get_scan_position(self) -> tuple[float, float]: 98 """ 99 Get the position of the event in the scanner's coordinate frame. 100 :return: the scan position of the event in micrometers (um). 101 """ 102 # Get overall pixel position 103 pixel_x = self.x + (self.scan.tile_width_px * self.tile.x) 104 pixel_y = self.y + (self.scan.tile_height_px * self.tile.y) 105 # Convert to micrometers 106 x_um = pixel_x * self.scan.pixel_size_um 107 y_um = pixel_y * self.scan.pixel_size_um 108 # Add the scan's origin in the scanner frame 109 x_um += self.scan.roi[self.tile.n_roi].origin_x_um 110 y_um += self.scan.roi[self.tile.n_roi].origin_y_um 111 return x_um, y_um 112 113 def get_slide_position(self) -> tuple[float, float]: 114 """ 115 Get the slide position of the event in micrometers (um). 116 :return: the slide position of the event. 
117 """ 118 # Turn scan_position into a 3x1 vector 119 scan_position = self.get_scan_position() 120 scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]]) 121 122 # Multiply by the appropriate homogeneous matrix 123 if self.scan.scanner_id.startswith(self.scan.Type.AXIOSCAN7.value): 124 transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.AXIOSCAN7] 125 elif self.scan.scanner_id.startswith(self.scan.Type.BZSCANNER.value): 126 transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.BZSCANNER] 127 else: 128 raise ValueError(f"Scanner type {self.scan.scanner_id} not supported.") 129 slide_position = np.matmul(transform, scan_position) 130 return float(slide_position[0][0]), float(slide_position[1][0]) 131 132 def crop_images( 133 self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True 134 ) -> list[np.ndarray]: 135 """ 136 Get the event crops from the frame images. Called "get" because it does not 137 need to extract anything; it is very quick for extracting multiple events from 138 the same tile. 139 Use this if you're interested in many events. 140 :param images: the frame images. 141 :param crop_size: the square size of the image crop to get for this event. 142 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 143 :return: image_size x image_size crops of the event in the provided frames. If 144 the event is too close to the edge, the crop will be smaller and not centered. 145 """ 146 # Convert a crop size in micrometers to pixels 147 if not in_pixels: 148 crop_size = round(crop_size / self.scan.pixel_size_um) 149 # Find the crop bounds 150 bounds = [ 151 self.x - crop_size // 2, 152 self.y - crop_size // 2, 153 self.x + math.ceil(crop_size / 2), 154 self.y + math.ceil(crop_size / 2), 155 ] 156 # Determine how much the bounds violate the image size 157 displacements = [ 158 max(0, -bounds[0]), 159 max(0, -bounds[1]), 160 max(0, bounds[2] - images[0].shape[1]), 161 max(0, bounds[3] - images[0].shape[0]), 162 ] 163 # Cap off the bounds 164 bounds = [ 165 max(0, bounds[0]), 166 max(0, bounds[1]), 167 min(images[0].shape[1], bounds[2]), 168 min(images[0].shape[0], bounds[3]), 169 ] 170 171 # Crop the images 172 cropped_images = [] 173 for image in images: 174 # Create a blank image of the right size 175 cropped_image = np.zeros((crop_size, crop_size), dtype=image.dtype) 176 177 # Insert the cropped image into the blank image, leaving a black buffer 178 # around the edges if the crop would go beyond the original image bounds 179 cropped_image[ 180 displacements[1] : crop_size - displacements[3], 181 displacements[0] : crop_size - displacements[2], 182 ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]] 183 cropped_images.append(cropped_image) 184 return cropped_images 185 186 def extract_images( 187 self, crop_size: int = 100, in_pixels: bool = True 188 ) -> list[np.ndarray]: 189 """ 190 Extract the images from the scan and tile, reading from the file. Called 191 "extract" because it must read and extract the images from file, which is slow. 192 Use this if you're interested in only a few events, as it is inefficient when 193 reading multiple events from the same tile. 194 :param crop_size: the square size of the image crop to get for this event. 195 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 196 :return: a list of cropped images from the scan in the order of the channels. 
197 """ 198 frames = Frame.get_frames(self.tile) 199 images = [frame.get_image() for frame in frames] 200 return self.crop_images(images, crop_size, in_pixels) 201 202 @classmethod 203 def extract_images_for_list( 204 cls, 205 events: list[Self], 206 crop_size: int | list[int] = None, 207 in_pixels: bool = True, 208 ) -> list[list[np.ndarray]]: 209 """ 210 Get the images for a list of events, ensuring that there is no wasteful reading 211 of the same tile multiple times. This function is more efficient than calling 212 extract_event_images for each event. 213 TODO: test this function 214 :param events: the events to extract images for. 215 :param crop_size: the square size of the image crop to get for this event. 216 Defaults to four times the size of the event. 217 :param in_pixels: whether the crop size is in pixels or micrometers. 218 Defaults to pixels, and is ignored if crop_size is None. 219 :return: a list of lists of cropped images for each event. 220 """ 221 if len(events) == 0: 222 return [] 223 224 # Populate a crop size if none provided 225 if crop_size is None: 226 crop_size = [4 * event.size for event in events] 227 in_pixels = True 228 # Propagate a constant crop size 229 elif isinstance(crop_size, int): 230 crop_size = [crop_size] * len(events) 231 232 # Sort the events by tile; use a shallow copy to avoid modifying the original 233 order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__())) 234 235 # Allocate the list to size 236 images = [None] * len(events) 237 last_tile = None 238 frame_images = None # Holds large numpy arrays, so expensive to compare 239 # Iterate through in sorted order 240 for i in order: 241 if last_tile != events[i].tile: 242 # Gather the frame images, preserving them for the next event 243 frames = Frame.get_frames(events[i].tile) 244 frame_images = [frame.get_image() for frame in frames] 245 246 last_tile = events[i].tile 247 # Use the frame images to crop the event images 248 # Preserve the original order using order[i] 249 images[i] = events[i].crop_images(frame_images, crop_size[i], in_pixels) 250 return images
A class that represents a single event in a scan, making it easy to evaluate singular events. Required metadata is exposed as attributes, and optional metadata and features are stored as DataFrames.
70 def __init__( 71 self, 72 scan: Scan, 73 tile: Tile, 74 x: int, 75 y: int, 76 size: int = 12, # End-to-end size in pixels 77 metadata: pd.Series = None, 78 features: pd.Series = None, 79 ): 80 self.scan = scan 81 self.tile = tile 82 self.x = x 83 self.y = y 84 self.size = size 85 self.metadata = metadata 86 self.features = features
Homogeneous transformation matrices for converting between scanner and slide coordinates. The matrices are 3x3, with the final column representing the translation in micrometers (um). For more information, see affine transformations.
Transformations are nominal, and accuracy is not guaranteed; this is due to imperfections in slides and alignment in the scanners. Units are in micrometers.
97 def get_scan_position(self) -> tuple[float, float]: 98 """ 99 Get the position of the event in the scanner's coordinate frame. 100 :return: the scan position of the event in micrometers (um). 101 """ 102 # Get overall pixel position 103 pixel_x = self.x + (self.scan.tile_width_px * self.tile.x) 104 pixel_y = self.y + (self.scan.tile_height_px * self.tile.y) 105 # Convert to micrometers 106 x_um = pixel_x * self.scan.pixel_size_um 107 y_um = pixel_y * self.scan.pixel_size_um 108 # Add the scan's origin in the scanner frame 109 x_um += self.scan.roi[self.tile.n_roi].origin_x_um 110 y_um += self.scan.roi[self.tile.n_roi].origin_y_um 111 return x_um, y_um
Get the position of the event in the scanner's coordinate frame.
Returns
the scan position of the event in micrometers (um).
113 def get_slide_position(self) -> tuple[float, float]: 114 """ 115 Get the slide position of the event in micrometers (um). 116 :return: the slide position of the event. 117 """ 118 # Turn scan_position into a 3x1 vector 119 scan_position = self.get_scan_position() 120 scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]]) 121 122 # Multiply by the appropriate homogeneous matrix 123 if self.scan.scanner_id.startswith(self.scan.Type.AXIOSCAN7.value): 124 transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.AXIOSCAN7] 125 elif self.scan.scanner_id.startswith(self.scan.Type.BZSCANNER.value): 126 transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.BZSCANNER] 127 else: 128 raise ValueError(f"Scanner type {self.scan.scanner_id} not supported.") 129 slide_position = np.matmul(transform, scan_position) 130 return float(slide_position[0][0]), float(slide_position[1][0])
Get the slide position of the event in micrometers (um).
Returns
the slide position of the event.
132 def crop_images( 133 self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True 134 ) -> list[np.ndarray]: 135 """ 136 Get the event crops from the frame images. Called "get" because it does not 137 need to extract anything; it is very quick for extracting multiple events from 138 the same tile. 139 Use this if you're interested in many events. 140 :param images: the frame images. 141 :param crop_size: the square size of the image crop to get for this event. 142 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 143 :return: image_size x image_size crops of the event in the provided frames. If 144 the event is too close to the edge, the crop will be smaller and not centered. 145 """ 146 # Convert a crop size in micrometers to pixels 147 if not in_pixels: 148 crop_size = round(crop_size / self.scan.pixel_size_um) 149 # Find the crop bounds 150 bounds = [ 151 self.x - crop_size // 2, 152 self.y - crop_size // 2, 153 self.x + math.ceil(crop_size / 2), 154 self.y + math.ceil(crop_size / 2), 155 ] 156 # Determine how much the bounds violate the image size 157 displacements = [ 158 max(0, -bounds[0]), 159 max(0, -bounds[1]), 160 max(0, bounds[2] - images[0].shape[1]), 161 max(0, bounds[3] - images[0].shape[0]), 162 ] 163 # Cap off the bounds 164 bounds = [ 165 max(0, bounds[0]), 166 max(0, bounds[1]), 167 min(images[0].shape[1], bounds[2]), 168 min(images[0].shape[0], bounds[3]), 169 ] 170 171 # Crop the images 172 cropped_images = [] 173 for image in images: 174 # Create a blank image of the right size 175 cropped_image = np.zeros((crop_size, crop_size), dtype=image.dtype) 176 177 # Insert the cropped image into the blank image, leaving a black buffer 178 # around the edges if the crop would go beyond the original image bounds 179 cropped_image[ 180 displacements[1] : crop_size - displacements[3], 181 displacements[0] : crop_size - displacements[2], 182 ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]] 183 cropped_images.append(cropped_image) 184 return cropped_images
Get the event crops from the frame images. Called "get" because it does not need to extract anything; it is very quick for extracting multiple events from the same tile. Use this if you're interested in many events.
Parameters
- images: the frame images.
- crop_size: the square size of the image crop to get for this event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns
image_size x image_size crops of the event in the provided frames. If the event is too close to the edge, the crop will be smaller and not centered.
186 def extract_images( 187 self, crop_size: int = 100, in_pixels: bool = True 188 ) -> list[np.ndarray]: 189 """ 190 Extract the images from the scan and tile, reading from the file. Called 191 "extract" because it must read and extract the images from file, which is slow. 192 Use this if you're interested in only a few events, as it is inefficient when 193 reading multiple events from the same tile. 194 :param crop_size: the square size of the image crop to get for this event. 195 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 196 :return: a list of cropped images from the scan in the order of the channels. 197 """ 198 frames = Frame.get_frames(self.tile) 199 images = [frame.get_image() for frame in frames] 200 return self.crop_images(images, crop_size, in_pixels)
Extract the images from the scan and tile, reading from the file. Called "extract" because it must read and extract the images from file, which is slow. Use this if you're interested in only a few events, as it is inefficient when reading multiple events from the same tile.
Parameters
- crop_size: the square size of the image crop to get for this event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns
a list of cropped images from the scan in the order of the channels.
202 @classmethod 203 def extract_images_for_list( 204 cls, 205 events: list[Self], 206 crop_size: int | list[int] = None, 207 in_pixels: bool = True, 208 ) -> list[list[np.ndarray]]: 209 """ 210 Get the images for a list of events, ensuring that there is no wasteful reading 211 of the same tile multiple times. This function is more efficient than calling 212 extract_event_images for each event. 213 TODO: test this function 214 :param events: the events to extract images for. 215 :param crop_size: the square size of the image crop to get for this event. 216 Defaults to four times the size of the event. 217 :param in_pixels: whether the crop size is in pixels or micrometers. 218 Defaults to pixels, and is ignored if crop_size is None. 219 :return: a list of lists of cropped images for each event. 220 """ 221 if len(events) == 0: 222 return [] 223 224 # Populate a crop size if none provided 225 if crop_size is None: 226 crop_size = [4 * event.size for event in events] 227 in_pixels = True 228 # Propagate a constant crop size 229 elif isinstance(crop_size, int): 230 crop_size = [crop_size] * len(events) 231 232 # Sort the events by tile; use a shallow copy to avoid modifying the original 233 order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__())) 234 235 # Allocate the list to size 236 images = [None] * len(events) 237 last_tile = None 238 frame_images = None # Holds large numpy arrays, so expensive to compare 239 # Iterate through in sorted order 240 for i in order: 241 if last_tile != events[i].tile: 242 # Gather the frame images, preserving them for the next event 243 frames = Frame.get_frames(events[i].tile) 244 frame_images = [frame.get_image() for frame in frames] 245 246 last_tile = events[i].tile 247 # Use the frame images to crop the event images 248 # Preserve the original order using order[i] 249 images[i] = events[i].crop_images(frame_images, crop_size[i], in_pixels) 250 return images
Get the images for a list of events, ensuring that there is no wasteful reading of the same tile multiple times. This function is more efficient than calling extract_event_images for each event. TODO: test this function
Parameters
- events: the events to extract images for.
- crop_size: the square size of the image crop to get for this event. Defaults to four times the size of the event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels, and is ignored if crop_size is None.
Returns
a list of lists of cropped images for each event.
class EventArray:
    """
    A class that holds a large number of events' data, making it easy to analyze and
    manipulate many events at once. A DataFrame-based counterpart to the Event class:
    required info, optional metadata, and optional features are stored as aligned
    DataFrames, one row per event.
    """

    INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y", "size"]

    def __len__(self) -> int:
        # Convenience method to get the number of events
        if self.info is None:
            return 0
        else:
            return len(self.info)

    def __eq__(self, other) -> bool:
        # Compare each attribute; DataFrames compare by contents, None only equals None
        for attr in ("info", "metadata", "features"):
            mine = getattr(self, attr)
            theirs = getattr(other, attr)
            if isinstance(mine, pd.DataFrame):
                if not isinstance(theirs, pd.DataFrame) or not mine.equals(theirs):
                    return False
            elif mine is None and theirs is not None:
                return False
        return True
A class that holds a large number of events' data, making it easy to analyze and manipulate many events at once. A DataFrame-based counterpart to the Event class: required info, optional metadata, and optional features are stored as aligned DataFrames, one row per event.
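For orientation, here is a minimal sketch (hypothetical IDs and coordinates) of building an EventArray from a plain DataFrame with the required info columns:

import pandas as pd

from csi_images.csi_events import EventArray

info = pd.DataFrame(
    {
        "slide_id": ["SLIDE001", "SLIDE001"],  # hypothetical slide ID
        "tile": [4, 9],
        "roi": [0, 0],
        "x": [120, 512],  # pixel position within the tile
        "y": [64, 300],
        "size": [12, 12],  # end-to-end size in pixels
    }
)
array = EventArray(info=info)
print(len(array))  # 2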
    def __init__(
        self,
        info: pd.DataFrame = None,
        metadata: pd.DataFrame = None,
        features: pd.DataFrame = None,
    ):
        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y", "size"
        if info is not None:
            if list(info.columns) != self.INFO_COLUMNS:
                raise ValueError(
                    "EventArray.info must have columns 'slide_id', 'tile', 'roi', 'x', 'y', 'size'"
                )
            # Copy first to avoid modifying the original
            info = info.copy()
            # Ensure that the columns are the right types
            info["slide_id"] = info["slide_id"].astype(str)
            info["tile"] = info["tile"].astype(np.uint16)
            info["roi"] = info["roi"].astype(np.uint8)
            info["x"] = info["x"].round().astype(np.uint16)
            info["y"] = info["y"].round().astype(np.uint16)
            info["size"] = info["size"].round().astype(np.uint16)
        # All DataFrames must have the same number of rows
        if metadata is not None and (info is None or len(info) != len(metadata)):
            raise ValueError(
                "If EventArray.metadata is not None, it should match rows with .info"
            )
        if features is not None and (info is None or len(info) != len(features)):
            raise ValueError(
                "If EventArray.features is not None, it should match rows with .info"
            )
        self.info = info
        self.metadata = metadata
        self.features = features
    def get_sort_order(self, by: str | list[str], ascending: bool | list[bool] = True):
        """
        Get the sort order for the EventArray by column(s) in the info, metadata,
        or features DataFrames.
        :param by: name of the column(s) to sort by.
        :param ascending: whether to sort in ascending order; may be a list matching by.
        :return: the order of the indices to sort by.
        """
        columns = self.get(by)
        return columns.sort_values(by=by, ascending=ascending).index
Get the sort order for the EventArray by column(s) in the info, metadata, or features DataFrames.
Parameters
- by: name of the column(s) to sort by.
- ascending: whether to sort in ascending order; may be a list matching by.
Returns
the order of the indices to sort by.
    def sort(self, by: str | list[str], ascending: bool | list[bool] = True) -> Self:
        """
        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
        :param by: name of the column(s) to sort by.
        :param ascending: whether to sort in ascending order; may be a list matching by.
        :return: a new, sorted EventArray.
        """
        order = self.get_sort_order(by, ascending)
        info = self.info.loc[order].reset_index(drop=True)
        if self.metadata is not None:
            metadata = self.metadata.loc[order].reset_index(drop=True)
        else:
            metadata = None
        if self.features is not None:
            features = self.features.loc[order].reset_index(drop=True)
        else:
            features = None
        return EventArray(info, metadata, features)
Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
Parameters
- by: name of the column(s) to sort by.
- ascending: whether to sort in ascending order; may be a list matching by.
Returns
a new, sorted EventArray.
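A small sketch of sorting; get_sort_order() returns the row order that sort() applies:

import pandas as pd

from csi_images.csi_events import EventArray

info = pd.DataFrame(
    {"slide_id": ["S1", "S1", "S1"], "tile": [2, 1, 1], "roi": [0, 0, 0],
     "x": [10, 30, 20], "y": [5, 5, 5], "size": [12, 12, 12]}
)
array = EventArray(info=info)
# Sort by tile, then by descending x within each tile
array = array.sort(["tile", "x"], ascending=[True, False])
print(array.info["tile"].tolist())  # [1, 1, 2]
print(array.info["x"].tolist())     # [30, 20, 10]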
    def get(self, column_names: int | str | list[int] | list[str]) -> pd.DataFrame:
        """
        Get a DataFrame with the specified columns from the EventArray,
        by value (as a copy, not a view).
        :param column_names: the names of the columns to get.
        :return: a DataFrame with the specified columns.
        """
        if isinstance(column_names, (int, str)):
            column_names = [column_names]
        columns = []
        for column_name in column_names:
            if column_name in self.info.columns:
                columns.append(self.info[column_name])
            elif self.metadata is not None and column_name in self.metadata.columns:
                columns.append(self.metadata[column_name])
            elif self.features is not None and column_name in self.features.columns:
                columns.append(self.features[column_name])
            else:
                raise ValueError(f"Column {column_name} not found in EventArray")
        return pd.concat(columns, axis=1)
Get a DataFrame with the specified columns from the EventArray, by value (as a copy, not a view).
Parameters
- column_names: the names of the columns to get.
Returns
a DataFrame with the specified columns.
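For example (hypothetical feature name), get() searches info, then metadata, then features for each requested column:

import pandas as pd

from csi_images.csi_events import EventArray

info = pd.DataFrame(
    {"slide_id": ["S1"], "tile": [0], "roi": [0], "x": [10], "y": [5], "size": [12]}
)
features = pd.DataFrame({"dapi_mean": [0.7]})  # hypothetical feature column
array = EventArray(info=info, features=features)
df = array.get(["x", "dapi_mean"])  # mixes info and feature columns
print(df.columns.tolist())  # ['x', 'dapi_mean']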
    def rows(self, rows) -> Self:
        """
        Get a subset of the EventArray rows, by value (copy).
        :param rows: a 1D boolean mask or integer indices (list, array, or Series)
                     selecting the rows to keep.
        :return: a new EventArray with the subset of events.
        """
        info = self.info.loc[rows].reset_index(drop=True)
        if self.metadata is not None:
            metadata = self.metadata.loc[rows].reset_index(drop=True)
        else:
            metadata = None
        if self.features is not None:
            features = self.features.loc[rows].reset_index(drop=True)
        else:
            features = None
        return EventArray(info, metadata, features)
Get a subset of the EventArray rows, by value (copy).
Parameters
- rows: a 1D boolean mask or integer indices (list, array, or Series) selecting the rows to keep.
Returns
a new EventArray with the subset of events.
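For instance, filtering with a boolean mask built from any column:

import pandas as pd

from csi_images.csi_events import EventArray

info = pd.DataFrame(
    {"slide_id": ["S1"] * 3, "tile": [0, 1, 2], "roi": [0, 0, 0],
     "x": [10, 20, 30], "y": [5, 5, 5], "size": [12, 12, 12]}
)
array = EventArray(info=info)
subset = array.rows(array.info["x"] > 15)  # boolean mask, one entry per event
print(len(subset))  # 2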
    def copy(self) -> Self:
        """
        Create a deep copy of the EventArray.
        :return: a deep copy of the EventArray.
        """
        return EventArray(
            info=self.info.copy(),
            metadata=None if self.metadata is None else self.metadata.copy(),
            features=None if self.features is None else self.features.copy(),
        )
Create a deep copy of the EventArray.
Returns
a deep copy of the EventArray.
    def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None:
        """
        Add metadata to the EventArray. Removes the need to check if metadata is None.
        Overwrites any existing metadata with the same column names as the new metadata.
        :param new_metadata: the metadata to add.
        """
        if len(self) != len(new_metadata):
            raise ValueError("New metadata must match length of existing info")

        if self.metadata is None:
            # Store a Series as a single-column DataFrame so .metadata.columns
            # always works downstream
            if isinstance(new_metadata, pd.Series):
                new_metadata = new_metadata.to_frame()
            self.metadata = new_metadata
        elif isinstance(new_metadata, pd.Series):
            self.metadata[new_metadata.name] = new_metadata
        else:
            # It's a DataFrame
            self.metadata[new_metadata.columns] = new_metadata
Add metadata to the EventArray. Removes the need to check if metadata is None. Overwrites any existing metadata with the same column names as the new metadata.
Parameters
- new_metadata: the metadata to add.
    def add_features(self, new_features: pd.Series | pd.DataFrame) -> None:
        """
        Add features to the EventArray. Removes the need to check if features is None.
        Overwrites any existing features with the same column names as the new features.
        :param new_features: the features to add.
        """
        if len(self) != len(new_features):
            raise ValueError("New features must match length of existing info")

        if self.features is None:
            # Store a Series as a single-column DataFrame so .features.columns
            # always works downstream
            if isinstance(new_features, pd.Series):
                new_features = new_features.to_frame()
            self.features = new_features
        elif isinstance(new_features, pd.Series):
            self.features[new_features.name] = new_features
        else:
            # It's a DataFrame
            self.features[new_features.columns] = new_features
Add features to the EventArray. Removes the need to check if features is None. Overwrites any existing features with the same column names as the new features.
Parameters
- new_features: the features to add.
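Both helpers follow the same pattern, as this sketch (hypothetical column names) shows:

import pandas as pd

from csi_images.csi_events import EventArray

info = pd.DataFrame(
    {"slide_id": ["S1", "S1"], "tile": [0, 1], "roi": [0, 0],
     "x": [10, 20], "y": [5, 5], "size": [12, 12]}
)
array = EventArray(info=info)
# Works whether or not metadata/features already exist
array.add_metadata(pd.DataFrame({"cell_id": [1, 2]}))
array.add_features(pd.Series([0.7, 0.9], name="dapi_mean"))  # hypothetical feature
print(array.metadata.columns.tolist())  # ['cell_id']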
    @classmethod
    def merge(cls, events: list[Self]) -> Self:
        """
        Combine a list of EventArrays into a single EventArray.
        :param events: the list of EventArrays to combine.
        :return: the merged EventArray.
        """
        all_info = []
        all_metadata = []
        all_features = []
        for event_array in events:
            # Skip empty EventArrays
            if event_array.info is not None:
                all_info.append(event_array.info)
                if event_array.metadata is not None:
                    all_metadata.append(event_array.metadata)
                if event_array.features is not None:
                    all_features.append(event_array.features)
        if len(all_info) == 0:
            return EventArray()
        all_info = pd.concat(all_info, ignore_index=True)
        all_metadata = pd.concat(all_metadata, ignore_index=True) if all_metadata else None
        all_features = pd.concat(all_features, ignore_index=True) if all_features else None
        return EventArray(all_info, all_metadata, all_features)
Combine a list of EventArrays into a single EventArray.
Parameters
- events: the list of EventArrays to combine.
Returns
the merged EventArray.
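A sketch of merging per-tile results into one array (hypothetical helper):

import pandas as pd

from csi_images.csi_events import EventArray

def events_for_tile(tile_n: int) -> EventArray:
    # Hypothetical stand-in for per-tile segmentation output
    info = pd.DataFrame(
        {"slide_id": ["S1"], "tile": [tile_n], "roi": [0],
         "x": [10], "y": [5], "size": [12]}
    )
    return EventArray(info=info)

merged = EventArray.merge([events_for_tile(n) for n in range(4)])
print(len(merged))  # 4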
    def to_events(
        self,
        scans: Scan | list[Scan],
        ignore_missing_scans=True,
        ignore_metadata=False,
        ignore_features=False,
    ) -> list[Event]:
        """
        Get the events in the EventArray as a list of Event objects.
        :param scans: the scans that the events belong to, auto-matched by slide_id.
                      Pass None if you don't care about scan metadata (and leave
                      ignore_missing_scans=True so placeholder scans are created).
        :param ignore_missing_scans: whether to create placeholder scans for events
                                     without a matching scan.
        :param ignore_metadata: whether to skip attaching metadata to the events.
        :param ignore_features: whether to skip attaching features to the events.
        :return: a list of Event objects, one per row of the EventArray.
        """
        # Normalize scans to a list; None means no scans to match against
        if scans is None:
            scans = []
        elif isinstance(scans, Scan):
            scans = [scans]
        events = []
        for i in range(len(self.info)):
            # Determine the associated scan
            scan = None
            for s in scans:
                if s.slide_id == self.info["slide_id"][i]:
                    scan = s
                    break
            if scan is None:
                if ignore_missing_scans:
                    # Create a placeholder scan if the scan is missing
                    scan = Scan.make_placeholder(
                        self.info["slide_id"][i],
                        self.info["tile"][i],
                        self.info["roi"][i],
                    )
                else:
                    raise ValueError(
                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
                    )
            # Prepare the metadata and features
            if ignore_metadata or self.metadata is None:
                metadata = None
            else:
                # This Series creation method is less efficient,
                # but required for preserving dtypes
                metadata = pd.Series(
                    {col: self.metadata.loc[i, col] for col in self.metadata.columns},
                    dtype=object,
                )
            if ignore_features or self.features is None:
                features = None
            else:
                features = pd.Series(
                    {col: self.features.loc[i, col] for col in self.features.columns},
                    dtype=object,
                )
            # Create the event and append it to the list
            events.append(
                Event(
                    scan,
                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
                    self.info["x"][i],
                    self.info["y"][i],
                    size=self.info["size"][i],
                    metadata=metadata,
                    features=features,
                )
            )
        return events
Get the events in the EventArray as a list of Event objects.
Parameters
- scans: the scans that the events belong to, auto-matched by slide_id. Pass None if you don't care about scan metadata (and leave ignore_missing_scans=True so placeholder scans are created).
- ignore_missing_scans: whether to create placeholder scans for events without a matching scan.
- ignore_metadata: whether to skip attaching metadata to the events.
- ignore_features: whether to skip attaching features to the events.
Returns
a list of Event objects, one per row of the EventArray.
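A sketch of converting back to Event objects; with no real Scan objects on hand, the default ignore_missing_scans=True substitutes placeholders:

import pandas as pd

from csi_images.csi_events import EventArray

info = pd.DataFrame(
    {"slide_id": ["S1"], "tile": [0], "roi": [0], "x": [10], "y": [5], "size": [12]}
)
array = EventArray(info=info)
# An empty scan list means every event falls back to a placeholder scan
events = array.to_events(scans=[], ignore_missing_scans=True)
print(len(events))  # 1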
    @classmethod
    def from_events(cls, events: list[Event]) -> Self:
        """
        Create an EventArray from a list of Event objects.
        :param events: the list of events to convert.
        :return: an EventArray with the events' info, metadata, and features.
        """
        # Return an empty array if we were passed nothing
        if events is None or len(events) == 0:
            return EventArray()
        # Otherwise, grab the info
        info = pd.DataFrame(
            {
                "slide_id": [event.scan.slide_id for event in events],
                "tile": [event.tile.n for event in events],
                "roi": [event.tile.n_roi for event in events],
                "x": [event.x for event in events],
                "y": [event.y for event in events],
                "size": [event.size for event in events],
            }
        )
        metadata_list = [event.metadata for event in events]
        # Iterate through and ensure that all metadata is the same shape
        for metadata in metadata_list:
            if type(metadata) is not type(metadata_list[0]):
                raise ValueError("All metadata must be the same type.")
            if metadata is not None and metadata.shape != metadata_list[0].shape:
                raise ValueError("All metadata must be the same shape.")
        if metadata_list[0] is None:
            metadata = None
        else:
            metadata = pd.DataFrame(metadata_list)
        features_list = [event.features for event in events]
        # Iterate through and ensure that all features are the same shape
        for features in features_list:
            if type(features) is not type(features_list[0]):
                raise ValueError("All features must be the same type.")
            if features is not None and features.shape != features_list[0].shape:
                raise ValueError("All features must be the same shape.")
        if features_list[0] is None:
            features = None
        else:
            features = pd.DataFrame(features_list)
        return EventArray(info=info, metadata=metadata, features=features)
Create an EventArray from a list of Event objects.
Parameters
- events: the list of events to convert.
Returns
an EventArray with the events' info, metadata, and features.
    def to_dataframe(self) -> pd.DataFrame:
        """
        Convert all the data in the EventArray to a single DataFrame.
        :return: a DataFrame with all the data in the EventArray.
        """
        # Make a copy of the info DataFrame and prepend "info_" to the column names
        output = self.info.copy()
        output.columns = [f"info_{col}" for col in output.columns]
        # Combine with the metadata and prepend "metadata_" to the column names
        if self.metadata is not None:
            metadata = self.metadata.copy()
            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
            output = pd.concat([output, metadata], axis=1)
        # Combine with the features and prepend "features_" to the column names
        if self.features is not None:
            features = self.features.copy()
            features.columns = [f"features_{col}" for col in features.columns]
            output = pd.concat([output, features], axis=1)
        return output
Convert all the data in the EventArray to a single DataFrame.
Returns
a DataFrame with all the data in the EventArray.
    @classmethod
    def from_dataframe(cls, df) -> Self:
        """
        Create an EventArray from a single DataFrame with "info_", "metadata_",
        and "features_" column prefixes, as produced by to_dataframe().
        :return: an EventArray with the data from the DataFrame.
        """
        # Split the columns into info, metadata, and features and strip the prefixes
        info = df[[col for col in df.columns if col.startswith("info_")]].copy()
        info.columns = [col.replace("info_", "") for col in info.columns]
        if info.size == 0:
            info = None
        metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy()
        metadata.columns = [col.replace("metadata_", "") for col in metadata.columns]
        if metadata.size == 0:
            metadata = None
        features = df[[col for col in df.columns if col.startswith("features_")]].copy()
        features.columns = [col.replace("features_", "") for col in features.columns]
        if features.size == 0:
            features = None
        return cls(info=info, metadata=metadata, features=features)
Create an EventArray from a single DataFrame with "info_", "metadata_", and "features_" column prefixes, as produced by to_dataframe().
Returns
an EventArray with the data from the DataFrame.
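The prefixes make to_dataframe() and from_dataframe() inverses of each other:

import pandas as pd

from csi_images.csi_events import EventArray

info = pd.DataFrame(
    {"slide_id": ["S1"], "tile": [0], "roi": [0], "x": [10], "y": [5], "size": [12]}
)
array = EventArray(info=info, metadata=pd.DataFrame({"cell_id": [7]}))
df = array.to_dataframe()
print(df.columns.tolist())
# ['info_slide_id', 'info_tile', 'info_roi', 'info_x', 'info_y', 'info_size',
#  'metadata_cell_id']
restored = EventArray.from_dataframe(df)
print(restored == array)  # True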
    @classmethod
    def from_mask(
        cls,
        mask: np.ndarray,
        slide_id: str,
        tile_n: int,
        n_roi: int = 0,
        include_cell_id: bool = True,
        images: list[np.ndarray] = None,
        image_labels: list[str] = None,
        properties: list[str] = None,
    ) -> Self:
        """
        Extract events from a labeled event mask, including metadata and features.
        :param mask: the labeled mask to extract events from.
        :param slide_id: the slide ID the mask is from.
        :param tile_n: the tile number the mask is from.
        :param n_roi: the ROI number the mask is from.
        :param include_cell_id: whether to include the cell_id, or numerical
                                mask label, as metadata in the EventArray.
        :param images: the intensity images to extract features from.
        :param image_labels: the labels for the intensity images.
        :param properties: list of properties to extract in addition to the defaults.
        :return: an EventArray corresponding to the mask labels.
        """
        # Gather mask_info
        if images is not None and image_labels is not None:
            if len(images) != len(image_labels):
                raise ValueError("Intensity images and labels must match lengths.")

        mask_info = extract_mask_info(mask, images, image_labels, properties)

        if len(mask_info) == 0:
            return EventArray()

        # Combine provided info and mask info
        info = pd.DataFrame(
            {
                "slide_id": slide_id,
                "tile": tile_n,
                "roi": n_roi,
                "x": mask_info["x"],
                "y": mask_info["y"],
                "size": mask_info["size"],
            },
        )
        # Extract a metadata column if desired
        if include_cell_id:
            metadata = pd.DataFrame({"cell_id": mask_info["id"]})
        else:
            metadata = None
        # If any additional properties were extracted, add them as features
        mask_info = mask_info.drop(columns=["id", "x", "y", "size"], errors="ignore")
        if len(mask_info.columns) > 0:
            features = mask_info
        else:
            features = None
        return EventArray(info, metadata, features)
Extract events from a labeled event mask, including metadata and features.
Parameters
- mask: the labeled mask to extract events from.
- slide_id: the slide ID the mask is from.
- tile_n: the tile number the mask is from.
- n_roi: the ROI number the mask is from.
- include_cell_id: whether to include the cell_id, or numerical mask label, as metadata in the EventArray.
- images: the intensity images to extract features from.
- image_labels: the labels for the intensity images.
- properties: list of properties to extract in addition to the defaults.
Returns
an EventArray corresponding to the mask labels.
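A minimal sketch with a synthetic mask; this assumes extract_mask_info reports a centroid (x, y) and size for each nonzero label, with real masks coming from segmentation:

import numpy as np

from csi_images.csi_events import EventArray

# Two labeled regions in an 8x8 mask; 0 is background
mask = np.zeros((8, 8), dtype=np.uint16)
mask[1:3, 1:3] = 1
mask[5:7, 4:7] = 2
array = EventArray.from_mask(mask, slide_id="S1", tile_n=0)
print(len(array))  # 2
print(array.metadata["cell_id"].tolist())  # [1, 2]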
    def save_csv(self, output_path: str) -> bool:
        """
        Save the events to a CSV file, including metadata and features.
        :param output_path: the file path to save the CSV file to.
        :return: True if the file exists after saving.
        """
        self.to_dataframe().to_csv(output_path, index=False)
        return os.path.exists(output_path)
Save the events to a CSV file, including metadata and features.
Parameters
- output_path: the file path to save the CSV file to.
Returns
True if the file exists after saving.
    @classmethod
    def load_csv(cls, input_path: str) -> Self:
        """
        Load the events from a CSV file, including metadata and features.
        :param input_path: the file path of the CSV file to load.
        :return: an EventArray with the loaded events.
        """
        # Load the CSV file
        df = pd.read_csv(input_path)
        return cls.from_dataframe(df)
Load the events from a CSV file, including metadata and features.
Parameters
- input_path: the file path of the CSV file to load.
Returns
an EventArray with the loaded events.
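Sketch of a CSV round trip (the path is hypothetical); dtypes survive because the constructor re-casts the info columns on load:

import pandas as pd

from csi_images.csi_events import EventArray

info = pd.DataFrame(
    {"slide_id": ["S1"], "tile": [0], "roi": [0], "x": [10], "y": [5], "size": [12]}
)
array = EventArray(info=info)
array.save_csv("events.csv")  # hypothetical path
restored = EventArray.load_csv("events.csv")
print(restored == array)  # True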
    def save_hdf5(self, output_path: str) -> bool:
        """
        Save the events to an HDF5 file, including metadata and features.
        Uses the pandas-provided HDF5 functions for convenience and external
        compatibility, though these files are slightly harder to view in
        HDFView or similar tools.
        :param output_path: the file path to save the HDF5 file to.
        :return: True if the file exists after saving.
        """
        # Open the output_path as an HDF5 file
        with pd.HDFStore(output_path) as store:
            # Store the dataframes in the HDF5 file
            if self.info is not None:
                store.put("info", self.info, index=False)
            if self.metadata is not None:
                store.put("metadata", self.metadata, index=False)
            if self.features is not None:
                store.put("features", self.features, index=False)
        return os.path.exists(output_path)
Save the events to an HDF5 file, including metadata and features. Uses the pandas-provided HDF5 functions for convenience and external compatibility, though these files are slightly harder to view in HDFView or similar tools.
Parameters
- output_path: the file path to save the HDF5 file to.
Returns
True if the file exists after saving.
    @classmethod
    def load_hdf5(cls, input_path: str) -> Self:
        """
        Load the events from an HDF5 file, including metadata and features.
        :param input_path: the file path of the HDF5 file to load.
        :return: an EventArray with the loaded events.
        """
        # Open the input_path as an HDF5 file
        with pd.HDFStore(input_path) as store:
            # Load the dataframes from the HDF5 file
            info = store.get("info") if "info" in store else None
            metadata = store.get("metadata") if "metadata" in store else None
            features = store.get("features") if "features" in store else None
        return cls(info=info, metadata=metadata, features=features)
Load the events from an HDF5 file, including metadata and features.
Parameters
- input_path: the file path of the HDF5 file to load.
Returns
an EventArray with the loaded events.
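The HDF5 path works the same way; note that pandas' HDFStore requires the optional PyTables dependency:

import pandas as pd

from csi_images.csi_events import EventArray

info = pd.DataFrame(
    {"slide_id": ["S1"], "tile": [0], "roi": [0], "x": [10], "y": [5], "size": [12]}
)
array = EventArray(info=info)
array.save_hdf5("events.h5")  # hypothetical path
restored = EventArray.load_hdf5("events.h5")
print(restored == array)  # True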
    def save_ocular(self, output_path: str, event_type: str = "cells"):
        """
        Save the events to OCULAR .csv and .rds files. Relies on the DataFrame
        originating from an OCULAR file (same columns; duplicate metadata/info).
        :param output_path: the directory to write the OCULAR files to.
        :param event_type: the OCULAR event type; "cells" or "others".
        """
        if pyreadr is None:
            raise ModuleNotFoundError(
                "pyreadr not installed. Install pyreadr directly "
                "or install csi-images with [rds] option to resolve."
            )
        if event_type == "cells":
            file_stub = "rc-final"
        elif event_type == "others":
            file_stub = "others-final"
        else:
            raise ValueError("Invalid event type. Must be cells or others.")

        # Ensure good metadata
        metadata = pd.DataFrame(
            {
                "slide_id": self.info["slide_id"],
                "frame_id": self.info["tile"],
                "cellx": self.info["x"],
                "celly": self.info["y"],
                "cell_id": (
                    self.metadata["cell_id"]
                    # Guard against missing metadata before checking its columns
                    if self.metadata is not None and "cell_id" in self.metadata.columns
                    else range(len(self.info))
                ),
            }
        )
        if self.metadata is not None:
            metadata[self.metadata.columns] = self.metadata.copy()

        # Check for the "ocular_interesting" column
        if event_type == "cells":
            if "ocular_interesting" in metadata.columns:
                interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool)
            elif "hcpc" in metadata.columns:
                # Interesting cells don't get an hcpc designation, leaving them as -1
                interesting_rows = metadata["hcpc"].to_numpy() == -1
            else:
                interesting_rows = np.zeros(len(metadata), dtype=bool)
            if interesting_rows.sum() > 0:
                # Split the metadata into interesting and regular
                interesting_events = self.rows(interesting_rows)
                interesting_df = pd.concat(
                    [interesting_events.features, interesting_events.metadata], axis=1
                )
                data_events = self.rows(~interesting_rows)
                data_df = pd.concat(
                    [data_events.features, data_events.metadata], axis=1
                )
                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")

                # Drop particular columns for "interesting"
                interesting_df = interesting_df.drop(
                    [
                        "clust",
                        "hcpc",
                        "frame_id",
                        "cell_id",
                        "unique_id",
                        "ocular_interesting",
                    ],
                    axis=1,
                    errors="ignore",
                )
                # Save both .csv and .rds; use a separate stub so file_stub
                # still points at the *-final outputs below
                interesting_stub = os.path.join(output_path, "ocular_interesting")
                interesting_df.to_csv(f"{interesting_stub}.csv")
                # Suppress pandas FutureWarning
                with warnings.catch_warnings():
                    warnings.simplefilter(action="ignore", category=FutureWarning)
                    pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df)
            else:
                data_df = pd.concat([self.features, metadata], axis=1)
        else:
            # Get all data and reset_index (will copy it)
            data_df = pd.concat([self.features, metadata], axis=1)

        # Split based on cluster number to conform to *-final[1-4].rds
        n_clusters = max(data_df["clust"]) + 1
        split_idx = [round(i * n_clusters / 4) for i in range(5)]
        for i in range(4):
            subset = (split_idx[i] <= data_df["clust"]) & (
                data_df["clust"] < split_idx[i + 1]
            )
            data_df.loc[subset, "hcpc"] = i + 1
            subset = data_df[subset].reset_index(drop=True)
            # Suppress pandas FutureWarning
            with warnings.catch_warnings():
                warnings.simplefilter(action="ignore", category=FutureWarning)
                pyreadr.write_rds(
                    os.path.join(output_path, f"{file_stub}{i + 1}.rds"), subset
                )

        # Create new example cell strings
        data_df["example_cell_id"] = (
            data_df["slide_id"]
            + " "
            + data_df["frame_id"].astype(str)
            + " "
            + data_df["cell_id"].astype(str)
            + " "
            + data_df["cellx"].astype(int).astype(str)
            + " "
            + data_df["celly"].astype(int).astype(str)
        )
        # Find averageable data columns
        if "cellcluster_id" in data_df.columns:
            end_idx = data_df.columns.get_loc("cellcluster_id")
        else:
            end_idx = data_df.columns.get_loc("slide_id")
        avg_cols = data_df.columns[:end_idx].tolist()
        # Group by cluster and average
        data_df = data_df.groupby("clust").agg(
            **{col: (col, "mean") for col in avg_cols},
            count=("clust", "size"),  # count rows in each cluster
            example_cells=("example_cell_id", lambda x: ",".join(x)),
            hcpc=("hcpc", lambda x: x.iloc[0]),
        )
        data_df = data_df.reset_index()  # Do NOT drop; the index is "clust"
        # Create new columns
        metadata = pd.DataFrame(
            {
                "count": data_df["count"],
                "example_cells": data_df["example_cells"],
                "clust": data_df["clust"].astype(int),
                "hcpc": data_df["hcpc"].astype(int),
                "id": data_df["clust"].astype(int).astype(str),
                "cccluster": "0",  # Dummy value
                "ccdistance": 0.0,  # Dummy value
                "rownum": list(range(len(data_df))),
                "framegroup": 0,  # Dummy value
            }
        )
        # Pad the features to 761 columns, as per OCULAR report needs
        additional_columns = range(len(avg_cols), 761)
        if len(additional_columns) > 0:
            padding = pd.DataFrame(
                np.zeros((len(data_df), len(additional_columns))),
                columns=[f"pad{i}" for i in additional_columns],
            )
            data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1)
        else:
            data_df = pd.concat([data_df[avg_cols], metadata], axis=1)

        # Save the cluster data
        data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"))
        # Suppress pandas FutureWarning
        with warnings.catch_warnings():
            warnings.simplefilter(action="ignore", category=FutureWarning)
            pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)
Save the events to OCULAR .csv and .rds files. Relies on the DataFrame originating from an OCULAR file (same columns; duplicate metadata/info).
Parameters
- output_path: the directory to write the OCULAR files to.
- event_type: the OCULAR event type; "cells" or "others".
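Usage sketch, assuming pyreadr is installed and the EventArray originated from OCULAR so the expected columns (clust, hcpc, etc.) are present; the paths are hypothetical:

from csi_images.csi_events import EventArray

array = EventArray.load_ocular("/path/to/scan/ocular")
array.save_ocular("/path/to/output", event_type="cells")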
    @classmethod
    def load_ocular(
        cls,
        input_path: str,
        event_type="cells",
        cell_data_files=(
            "rc-final1.rds",
            "rc-final2.rds",
            "rc-final3.rds",
            "rc-final4.rds",
            "ocular_interesting.rds",
        ),
        others_data_files=(
            "others-final1.rds",
            "others-final2.rds",
            "others-final3.rds",
            "others-final4.rds",
        ),
        atlas_data_files=(
            "ocular_interesting.rds",
            "ocular_not_interesting.rds",
        ),
        drop_common_events=True,
        log=None,
    ) -> Self:
        """
        Load events from OCULAR .rds files, including metadata and features.
        :param input_path: the OCULAR output directory, or a single .rds file.
        :param event_type: the OCULAR event type; "cells" or "others".
        :param cell_data_files: file names to load when event_type is "cells".
        :param others_data_files: file names to load when event_type is "others".
        :param atlas_data_files: file names whose events carry atlas classifications,
                                 making them eligible for common-event dropping.
        :param drop_common_events: whether to drop events classified as common cells.
        :param log: optional logger for debug and warning messages.
        :return: an EventArray with the loaded events.
        """
        if pyreadr is None:
            raise ModuleNotFoundError(
                "pyreadr not installed. Install pyreadr directly "
                "or install csi-images with [rds] option to resolve."
            )
        # Check if the input path is a single file or a directory;
        # a single file takes precedence over the event_type defaults
        if os.path.isfile(input_path):
            data_files = [os.path.basename(input_path)]
            input_path = os.path.dirname(input_path)
        elif event_type == "cells":
            data_files = cell_data_files
        elif event_type == "others":
            data_files = others_data_files
        else:
            raise ValueError("Invalid event type. Must be cells or others.")

        # Load the data from the OCULAR files
        file_data = {}
        for file in data_files:
            file_path = os.path.join(input_path, file)
            if not os.path.isfile(file_path):
                if log is not None:
                    log.warning(f"{file} not found in {input_path}")
                continue
            file_data[file] = pyreadr.read_r(file_path)
            # Get the DataFrame associated with None (pyreadr dict quirk)
            file_data[file] = file_data[file][None]
            if len(file_data[file]) == 0:
                # File gets dropped from the dict
                file_data.pop(file)
                if log is not None:
                    log.warning(f"{file} has no cells")
                continue

            if log is not None:
                log.debug(f"{file} has {len(file_data[file])} cells")

            # Drop common cells if requested and in this file
            if file in atlas_data_files and drop_common_events:
                common_cell_indices = (
                    file_data[file]["catalogue_classification"] == "common_cell"
                )
                if log is not None:
                    log.debug(
                        f"Dropping {int(common_cell_indices.sum())} "
                        f"common cells from {file}"
                    )
                file_data[file] = file_data[file][~common_cell_indices]

            if len(file_data[file]) == 0:
                # File gets dropped from the dict
                file_data.pop(file)
                if log is not None:
                    log.warning(f"{file} has no cells after dropping common cells")
                continue

            # Extract frame_id and cell_id;
            # DAPI- events already have frame_id and cell_id outside the rowname
            if event_type == "cells" and "frame_id" not in file_data[file].columns:
                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
                # Get "frame_id cell_id" from the rowname column and split it in two
                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
                if len(split_res.columns) != 2 and log is not None:
                    log.warning(
                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
                    )
                # Then assign it back to the DataFrame
                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
            # Reset indices, since they can cause NaN values in concat
            file_data[file] = file_data[file].reset_index(drop=True)

        # Merge the data from all files
        if len(file_data) == 0:
            return EventArray()
        elif len(file_data) == 1:
            data = next(iter(file_data.values()))
        else:
            data = pd.concat(file_data.values())

        if log is not None:
            log.debug(f"Gathered a total of {len(data)} events")

        # "Others" files are missing the "slide_id"; insert it before "frame_id"
        if event_type == "others" and "slide_id" not in data.columns:
            if os.path.basename(input_path) == "ocular":
                slide_id = os.path.basename(os.path.dirname(input_path))
            else:
                slide_id = "UNKNOWN"
            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)

        # Sort by ascending cell_id to keep the original, which is in manual_df
        data = data.sort_values(by=["cell_id"], ascending=True)
        # Filter out duplicates by x & y
        data = data.assign(
            unique_id=data["slide_id"]
            + "_"
            + data["frame_id"].astype(str)
            + "_"
            + data["cellx"].astype(int).astype(str)
            + "_"
            + data["celly"].astype(int).astype(str)
        )
        data = data.drop_duplicates(subset=["unique_id"], keep="first")
        # The normal unique_id uses the cell_id
        data = data.assign(
            unique_id=data["slide_id"]
            + "_"
            + data["frame_id"].astype(str)
            + "_"
            + data["cell_id"].astype(str)
        )
        data = data.reset_index(drop=True)
        # All columns up to "slide_id" are features; drop the "slide_id"
        features = data.loc[:, :"slide_id"].iloc[:, :-1]
        data = data.loc[:, "slide_id":]
        # Grab the info columns
        info = data[["slide_id", "frame_id", "cellx", "celly"]]
        info.columns = ["slide_id", "tile", "x", "y"]
        info = info.assign(
            roi=0,  # OCULAR only works on 1 ROI, as far as known
            size=25,  # Static, for later montaging
        )
        info = info[["slide_id", "tile", "roi", "x", "y", "size"]]
        # Metadata keeps duplicate columns for later convenience
        metadata = data
        # Certain columns tend to arrive with mixed data formats...
        for col in ["TRITC", "CY5", "FITC"]:
            if col in metadata:
                labels = {
                    "False": False,
                    "True": True,
                    "FALSE": False,
                    "TRUE": True,
                }
                metadata[col] = metadata[col].map(labels).astype(bool)
        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
            if col in metadata:
                metadata[col] = metadata[col].fillna(-1).astype(int)
        return EventArray(info, metadata, features)
Load events from OCULAR .rds files, including metadata and features.
Parameters
- input_path: the OCULAR output directory, or a single .rds file to load.
- event_type: the OCULAR event type; "cells" or "others".
- cell_data_files: the file names to load when event_type is "cells".
- others_data_files: the file names to load when event_type is "others".
- atlas_data_files: the file names whose events carry atlas classifications, making them eligible for common-event dropping.
- drop_common_events: whether to drop events classified as common cells.
- log: optional logger for debug and warning messages.
Returns
an EventArray with the loaded events.
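A usage sketch with the optional knobs (hypothetical directory); any logger with .debug/.warning methods works:

import logging

from csi_images.csi_events import EventArray

log = logging.getLogger("ocular")
array = EventArray.load_ocular(
    "/path/to/scan/ocular",  # hypothetical OCULAR output directory
    event_type="cells",
    drop_common_events=True,
    log=log,
)
print(len(array))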