csi_images.csi_events
Contains the Event class, which represents a single event in a scan. The Event class optionally holds metadata and features. Lists of events with similar metadata or features can be combined into DataFrames for analysis.
The Event class holds the position of the event in the frame, which can be converted to positions in the scanner or slide coordinate systems. See the csi_images.csi_scans documentation page for more information on the coordinate systems.
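As a quick orientation, here is a minimal usage sketch. The slide ID is hypothetical, and Scan.make_placeholder stands in for real scan metadata loaded from disk (it is used the same way by this module's own to_events()).

from csi_images.csi_scans import Scan
from csi_images.csi_tiles import Tile
from csi_images.csi_events import Event, EventArray

# Placeholder scan and tile; a real workflow loads actual scan metadata
scan = Scan.make_placeholder("EXAMPLE-SLIDE", 0, 0)  # hypothetical slide ID
tile = Tile(scan, 0, 0)
events = [Event(scan, tile, x=100, y=200), Event(scan, tile, x=300, y=50)]

# Combine the events into a DataFrame-backed EventArray for analysis
array = EventArray.from_events(events)
print(array.to_dataframe().head())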
1""" 2Contains the Event class, which represents a single event in a scan. 3The Event class optionally holds metadata and features. Lists of events with 4similar metadata or features can be combined into DataFrames for analysis. 5 6The Event class holds the position of the event in the frame, which can be converted 7to the position in the scanner or slide coordinate positions. See the 8csi_utils.csi_scans documentation page for more information on the coordinate systems. 9""" 10 11import os 12import math 13import warnings 14from typing import Self 15 16import numpy as np 17import pandas as pd 18 19from .csi_scans import Scan 20from .csi_tiles import Tile 21from .csi_frames import Frame 22 23# Optional dependencies; will raise errors in particular functions if not installed 24try: 25 from .csi_images import extract_mask_info 26except ImportError: 27 extract_mask_info = None 28try: 29 import pyreadr 30except ImportError: 31 pyreadr = None 32 33 34class Event: 35 """ 36 A class that represents a single event in a scan, making it easy to evaluate 37 singular events. Required metadata is exposed as attributes, and optional 38 metadata and features are stored as DataFrames. 39 """ 40 41 SCAN_TO_SLIDE_TRANSFORM = { 42 # Axioscan zero is in the top-right corner instead of top-left 43 Scan.Type.AXIOSCAN7: np.array( 44 [ 45 [1, 0, 75000], 46 [0, 1, 0], 47 [0, 0, 1], 48 ] 49 ), 50 # BZScanner coordinates are a special kind of messed up: 51 # - The slide is upside-down. 52 # - The slide is oriented vertically, with the barcode at the bottom. 53 # - Tiles are numbered from the top-right 54 Scan.Type.BZSCANNER: np.array( 55 [ 56 [0, -1, 75000], 57 [-1, 0, 25000], 58 [0, 0, 1], 59 ] 60 ), 61 } 62 """ 63 Homogeneous transformation matrices for converting between scanner and slide 64 coordinates. The matrices are 3x3, with the final column representing the 65 translation in micrometers (um). For more information, see 66 [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations). 67 68 Transformations are nominal, and accuracy is not guaranteed; this is due to 69 imperfections in slides and alignment in the scanners. Units are in micrometers. 70 """ 71 72 def __init__( 73 self, 74 scan: Scan, 75 tile: Tile, 76 x: int, 77 y: int, 78 size: int = 12, # End-to-end size in pixels 79 metadata: pd.Series = None, 80 features: pd.Series = None, 81 ): 82 self.scan = scan 83 self.tile = tile 84 self.x = int(x) 85 self.y = int(y) 86 self.size = int(size) 87 self.metadata = metadata 88 self.features = features 89 90 def __repr__(self) -> str: 91 return f"{self.scan.slide_id}-{self.tile.n}-{self.x}-{self.y}" 92 93 def __eq__(self, other) -> bool: 94 return self.__repr__() == other.__repr__() 95 96 def __lt__(self, other): 97 return self.__repr__() < other.__repr__() 98 99 def get_scan_position(self) -> tuple[float, float]: 100 """ 101 Get the position of the event in the scanner's coordinate frame. 102 :return: the scan position of the event in micrometers (um). 
103 """ 104 # Get overall pixel position 105 pixel_x = self.x + (self.scan.tile_width_px * self.tile.x) 106 pixel_y = self.y + (self.scan.tile_height_px * self.tile.y) 107 # Convert to micrometers 108 x_um = pixel_x * self.scan.pixel_size_um 109 y_um = pixel_y * self.scan.pixel_size_um 110 # Add the scan's origin in the scanner frame 111 x_um += self.scan.roi[self.tile.n_roi].origin_x_um 112 y_um += self.scan.roi[self.tile.n_roi].origin_y_um 113 return x_um, y_um 114 115 def get_slide_position(self) -> tuple[float, float]: 116 """ 117 Get the slide position of the event in micrometers (um). 118 :return: the slide position of the event. 119 """ 120 # Turn scan_position into a 3x1 vector 121 scan_position = self.get_scan_position() 122 scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]]) 123 124 # Multiply by the appropriate homogeneous matrix 125 if self.scan.scanner_id.startswith(self.scan.Type.AXIOSCAN7.value): 126 transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.AXIOSCAN7] 127 elif self.scan.scanner_id.startswith(self.scan.Type.BZSCANNER.value): 128 transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.BZSCANNER] 129 else: 130 raise ValueError(f"Scanner type {self.scan.scanner_id} not supported.") 131 slide_position = np.matmul(transform, scan_position) 132 return float(slide_position[0][0]), float(slide_position[1][0]) 133 134 def crop_images( 135 self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True 136 ) -> list[np.ndarray]: 137 """ 138 Get the event crops from the frame images. Called "get" because it does not 139 need to extract anything; it is very quick for extracting multiple events from 140 the same tile. 141 Use this if you're interested in many events. 142 :param images: the frame images. 143 :param crop_size: the square size of the image crop to get for this event. 144 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 145 :return: image_size x image_size crops of the event in the provided frames. If 146 the event is too close to the edge, the crop will be smaller and not centered. 
147 """ 148 # Convert a crop size in micrometers to pixels 149 if not in_pixels: 150 crop_size = round(crop_size / self.scan.pixel_size_um) 151 # Find the crop bounds 152 bounds = [ 153 self.x - crop_size // 2, 154 self.y - crop_size // 2, 155 self.x + math.ceil(crop_size / 2), 156 self.y + math.ceil(crop_size / 2), 157 ] 158 # Determine how much the bounds violate the image size 159 displacements = [ 160 max(0, -bounds[0]), 161 max(0, -bounds[1]), 162 max(0, bounds[2] - images[0].shape[1]), 163 max(0, bounds[3] - images[0].shape[0]), 164 ] 165 # Cap off the bounds 166 bounds = [ 167 max(0, bounds[0]), 168 max(0, bounds[1]), 169 min(images[0].shape[1], bounds[2]), 170 min(images[0].shape[0], bounds[3]), 171 ] 172 173 # Crop the images 174 cropped_images = [] 175 for image in images: 176 # Create a blank image of the right size 177 cropped_image = np.zeros((crop_size, crop_size), dtype=image.dtype) 178 179 # Insert the cropped image into the blank image, leaving a black buffer 180 # around the edges if the crop would go beyond the original image bounds 181 cropped_image[ 182 displacements[1] : crop_size - displacements[3], 183 displacements[0] : crop_size - displacements[2], 184 ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]] 185 cropped_images.append(cropped_image) 186 return cropped_images 187 188 def extract_images( 189 self, crop_size: int = 100, in_pixels: bool = True 190 ) -> list[np.ndarray]: 191 """ 192 Extract the images from the scan and tile, reading from the file. Called 193 "extract" because it must read and extract the images from file, which is slow. 194 Use this if you're interested in only a few events, as it is inefficient when 195 reading multiple events from the same tile. 196 :param crop_size: the square size of the image crop to get for this event. 197 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 198 :return: a list of cropped images from the scan in the order of the channels. 199 """ 200 frames = Frame.get_frames(self.tile) 201 images = [frame.get_image() for frame in frames] 202 return self.crop_images(images, crop_size, in_pixels) 203 204 @classmethod 205 def extract_images_for_list( 206 cls, 207 events: list[Self], 208 crop_size: int | list[int] = None, 209 in_pixels: bool = True, 210 ) -> list[list[np.ndarray]]: 211 """ 212 Get the images for a list of events, ensuring that there is no wasteful reading 213 of the same tile multiple times. This function is more efficient than calling 214 extract_event_images for each event. 215 :param events: the events to extract images for. 216 :param crop_size: the square size of the image crop to get for this event. 217 Defaults to four times the size of the event. 218 :param in_pixels: whether the crop size is in pixels or micrometers. 219 Defaults to pixels, and is ignored if crop_size is None. 220 :return: a list of lists of cropped images for each event. 
221 """ 222 if len(events) == 0: 223 return [] 224 225 # Populate a crop size if none provided 226 if crop_size is None: 227 crop_size = [4 * event.size for event in events] 228 in_pixels = True 229 # Propagate a constant crop size 230 elif isinstance(crop_size, int): 231 crop_size = [crop_size] * len(events) 232 233 # Sort the events by tile; use a shallow copy to avoid modifying the original 234 order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__())) 235 236 # Allocate the list to size 237 images = [None] * len(events) 238 last_tile = None 239 frame_images = None # Holds large numpy arrays, so expensive to compare 240 # Iterate through in sorted order 241 for i in order: 242 if last_tile != events[i].tile: 243 # Gather the frame images, preserving them for the next event 244 frames = Frame.get_frames(events[i].tile) 245 frame_images = [frame.get_image() for frame in frames] 246 247 last_tile = events[i].tile 248 # Use the frame images to crop the event images 249 # Preserve the original order using order[i] 250 images[i] = events[i].crop_images(frame_images, crop_size[i], in_pixels) 251 return images 252 253 254class EventArray: 255 """ 256 A class that holds a large number of events' data, making it easy to analyze and 257 manipulate many events at once. A more separated version of the Event class. 258 """ 259 260 INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y", "size"] 261 262 def __init__( 263 self, 264 info: pd.DataFrame = None, 265 metadata: pd.DataFrame = None, 266 features: pd.DataFrame = None, 267 ): 268 # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y", "size" 269 if info is not None: 270 if list(info.columns) != self.INFO_COLUMNS: 271 raise ValueError( 272 "EventArray.info must have columns 'slide_id', 'tile', 'roi', 'x', 'y', 'size'" 273 ) 274 # Copy first to avoid modifying the original 275 info = info.copy() 276 # Ensure that the columns are the right types 277 info["slide_id"] = info["slide_id"].astype(str) 278 info["tile"] = info["tile"].astype(np.uint16) 279 info["roi"] = info["roi"].astype(np.uint8) 280 info["x"] = info["x"].round().astype(np.uint16) 281 info["y"] = info["y"].round().astype(np.uint16) 282 info["size"] = info["size"].round().astype(np.uint16) 283 # All DataFrames must all have the same number of rows 284 if metadata is not None and (info is None or len(info) != len(metadata)): 285 raise ValueError( 286 "If EventArray.metadata is not None, it should match rows with .info" 287 ) 288 if features is not None and (info is None or len(info) != len(features)): 289 raise ValueError( 290 "If EventArray.features is not None, it should match rows with .info" 291 ) 292 self.info = info 293 self.metadata = metadata 294 self.features = features 295 296 def __len__(self) -> int: 297 # Convenience method to get the number of events 298 if self.info is None: 299 return 0 300 else: 301 return len(self.info) 302 303 def __eq__(self, other): 304 is_equal = True 305 # Parse all possibilities for info 306 if isinstance(self.info, pd.DataFrame): 307 if isinstance(other.info, pd.DataFrame): 308 is_equal = self.info.equals(other.info) 309 if not is_equal: 310 return False 311 else: 312 return False 313 elif self.info is None: 314 if other.info is not None: 315 return False 316 317 # Parse all possibilities for metadata 318 if isinstance(self.metadata, pd.DataFrame): 319 if isinstance(other.metadata, pd.DataFrame): 320 is_equal = self.metadata.equals(other.metadata) 321 if not is_equal: 322 return False 323 else: 324 return False 325 
elif self.metadata is None: 326 if other.metadata is not None: 327 return False 328 329 # Parse all possibilities for features 330 if isinstance(self.features, pd.DataFrame): 331 if isinstance(other.features, pd.DataFrame): 332 is_equal = self.features.equals(other.features) 333 if not is_equal: 334 return False 335 else: 336 return False 337 elif self.features is None: 338 if other.features is not None: 339 return False 340 341 return is_equal 342 343 def get_sort_order(self, by: str | list[str], ascending: bool | list[bool] = True): 344 """ 345 Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames. 346 :param by: name of the column(s) to sort by. 347 :param ascending: whether to sort in ascending order; can be a list to match by 348 :return: the order of the indices to sort by. 349 """ 350 columns = self.get(by) 351 return columns.sort_values(by=by, ascending=ascending).index 352 353 def sort(self, by: str | list[str], ascending: bool | list[bool] = True) -> Self: 354 """ 355 Sort the EventArray by column(s) in the info, metadata, or features DataFrames. 356 :param by: name of the column(s) to sort by. 357 :param ascending: whether to sort in ascending order; can be a list to match by 358 :return: a new, sorted EventArray. 359 """ 360 order = self.get_sort_order(by, ascending) 361 info = self.info.loc[order].reset_index(drop=True) 362 if self.metadata is not None: 363 metadata = self.metadata.loc[order].reset_index(drop=True) 364 else: 365 metadata = None 366 if self.features is not None: 367 features = self.features.loc[order].reset_index(drop=True) 368 else: 369 features = None 370 return EventArray(info, metadata, features) 371 372 def get(self, column_names: int | str | list[int] | list[str]) -> pd.DataFrame: 373 """ 374 Get a DataFrame with the specified columns from the EventArray, by value. 375 :param column_names: the names of the columns to get. 376 :return: a DataFrame with the specified columns. 377 """ 378 if isinstance(column_names, int) or isinstance(column_names, str): 379 column_names = [column_names] 380 columns = [] 381 for column_name in column_names: 382 if column_name in self.info.columns: 383 columns.append(self.info[column_name]) 384 elif self.metadata is not None and column_name in self.metadata.columns: 385 columns.append(self.metadata[column_name]) 386 elif self.features is not None and column_name in self.features.columns: 387 columns.append(self.features[column_name]) 388 else: 389 raise ValueError(f"Column {column_name} not found in EventArray") 390 return pd.concat(columns, axis=1) 391 392 def rows(self, rows) -> Self: 393 """ 394 Get a subset of the EventArray rows based on a boolean or integer index, by value. 395 :param rows: the indices to get as a 1D boolean/integer list/array/series 396 :return: a new EventArray with the subset of events. 397 """ 398 info = self.info.loc[rows].reset_index(drop=True) 399 if self.metadata is not None: 400 metadata = self.metadata.loc[rows].reset_index(drop=True) 401 else: 402 metadata = None 403 if self.features is not None: 404 features = self.features.loc[rows].reset_index(drop=True) 405 else: 406 features = None 407 return EventArray(info, metadata, features) 408 409 def copy(self) -> Self: 410 """ 411 Create a deep copy of the EventArray. 412 :return: a deep copy of the EventArray. 
413 """ 414 return EventArray( 415 info=self.info.copy(), 416 metadata=None if self.metadata is None else self.metadata.copy(), 417 features=None if self.features is None else self.features.copy(), 418 ) 419 420 def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None: 421 """ 422 Add metadata to the EventArray. Removes the need to check if metadata is None. 423 Overwrites any existing metadata with the same column names as the new metadata. 424 :param new_metadata: the metadata to add. 425 """ 426 if len(self) != len(new_metadata): 427 raise ValueError("New metadata must match length of existing info") 428 429 if self.metadata is None: 430 self.metadata = new_metadata 431 else: 432 if isinstance(new_metadata, pd.Series): 433 self.metadata[new_metadata.name] = new_metadata 434 else: 435 # It's a DataFrame 436 self.metadata[new_metadata.columns] = new_metadata 437 438 def add_features(self, new_features: pd.Series | pd.DataFrame) -> None: 439 """ 440 Add features to the EventArray. Removes the need to check if features is None. 441 Overwrites any existing features with the same column names as the new features. 442 :param new_features: the features to add. 443 """ 444 if len(self) != len(new_features): 445 raise ValueError("New features must match length of existing info") 446 447 if self.features is None: 448 self.features = new_features 449 else: 450 if isinstance(new_features, pd.Series): 451 self.features[new_features.name] = new_features 452 else: 453 # It's a DataFrame 454 self.features[new_features.columns] = new_features 455 456 @classmethod 457 def merge(cls, events: list[Self]) -> Self: 458 """ 459 Combine EventArrays in a list into a single EventArray. 460 :param events: the new list of events. 461 """ 462 all_info = [] 463 all_metadata = [] 464 all_features = [] 465 for event_array in events: 466 # Skip empty EventArrays 467 if event_array.info is not None: 468 all_info.append(event_array.info) 469 if event_array.metadata is not None: 470 all_metadata.append(event_array.metadata) 471 if event_array.features is not None: 472 all_features.append(event_array.features) 473 if len(all_info) == 0: 474 return EventArray() 475 else: 476 all_info = pd.concat(all_info, ignore_index=True) 477 if len(all_metadata) == 0: 478 all_metadata = None 479 else: 480 all_metadata = pd.concat(all_metadata, ignore_index=True) 481 if len(all_features) == 0: 482 all_features = None 483 else: 484 all_features = pd.concat(all_features, ignore_index=True) 485 486 return EventArray(all_info, all_metadata, all_features) 487 488 def to_events( 489 self, 490 scans: Scan | list[Scan], 491 ignore_missing_scans=True, 492 ignore_metadata=False, 493 ignore_features=False, 494 ) -> list[Event]: 495 """ 496 Get the events in the EventArray as a list of events. 497 :param scans: the scans that the events belong to, auto-matched by slide_id. 498 Pass None if you don't care about scan metadata (pass ignore_missing_scans). 499 :param ignore_missing_scans: whether to create blank scans for events without scans. 
500 :param ignore_metadata: whether to ignore metadata or not 501 :param ignore_features: whether to ignore features or not 502 :return: 503 """ 504 if isinstance(scans, Scan): 505 scans = [scans] * len(self.info) 506 events = [] 507 for i in range(len(self.info)): 508 # Determine the associated scan 509 scan = None 510 for s in scans: 511 if s.slide_id == self.info["slide_id"][i]: 512 scan = s 513 break 514 if scan is None: 515 if ignore_missing_scans: 516 # Create a placeholder scan if the scan is missing 517 scan = Scan.make_placeholder( 518 self.info["slide_id"][i], 519 self.info["tile"][i], 520 self.info["roi"][i], 521 ) 522 else: 523 raise ValueError( 524 f"Scan {self.info['slide_id'][i]} not found for event {i}." 525 ) 526 # Prepare the metadata and features 527 if ignore_metadata or self.metadata is None: 528 metadata = None 529 else: 530 # This Series creation method is less efficient, 531 # but required for preserving dtypes 532 metadata = pd.Series( 533 {col: self.metadata.loc[i, col] for col in self.metadata.columns}, 534 dtype=object, 535 ) 536 if ignore_features or self.features is None: 537 features = None 538 else: 539 features = pd.Series( 540 {col: self.features.loc[i, col] for col in self.features.columns}, 541 dtype=object, 542 ) 543 # Create the event and append it to the list 544 events.append( 545 Event( 546 scan, 547 Tile(scan, self.info["tile"][i], self.info["roi"][i]), 548 self.info["x"][i], 549 self.info["y"][i], 550 size=self.info["size"][i], 551 metadata=metadata, 552 features=features, 553 ) 554 ) 555 return events 556 557 @classmethod 558 def from_events(cls, events: list[Event]) -> Self: 559 """ 560 Set the events in the EventArray to a new list of events. 561 :param events: the new list of events. 562 """ 563 # Return an empty array if we were passed nothing 564 if events is None or len(events) == 0: 565 return EventArray() 566 # Otherwise, grab the info 567 info = pd.DataFrame( 568 { 569 "slide_id": [event.scan.slide_id for event in events], 570 "tile": [event.tile.n for event in events], 571 "roi": [event.tile.n_roi for event in events], 572 "x": [event.x for event in events], 573 "y": [event.y for event in events], 574 "size": [event.size for event in events], 575 } 576 ) 577 metadata_list = [event.metadata for event in events] 578 # Iterate through and ensure that all metadata is the same shape 579 for metadata in metadata_list: 580 if type(metadata) != type(metadata_list[0]): 581 raise ValueError("All metadata must be the same type.") 582 if metadata is not None and metadata.shape != metadata_list[0].shape: 583 raise ValueError("All metadata must be the same shape.") 584 if metadata_list[0] is None: 585 metadata = None 586 else: 587 metadata = pd.DataFrame(metadata_list) 588 features_list = [event.features for event in events] 589 # Iterate through and ensure that all features are the same shape 590 for features in features_list: 591 if type(features) != type(features_list[0]): 592 raise ValueError("All features must be the same type.") 593 if features is not None and features.shape != features_list[0].shape: 594 raise ValueError("All features must be the same shape.") 595 if features_list[0] is None: 596 features = None 597 else: 598 features = pd.DataFrame(features_list) 599 return EventArray(info=info, metadata=metadata, features=features) 600 601 def to_dataframe(self) -> pd.DataFrame: 602 """ 603 Convert all the data in the EventArray to a single DataFrame. 604 :return: a DataFrame with all the data in the EventArray. 
605 """ 606 # Make a copy of the info DataFrame and prepend "info_" to the column names 607 output = self.info.copy() 608 output.columns = [f"info_{col}" for col in output.columns] 609 # Combine with the metadata and prepend "metadata_" to the column names 610 if self.metadata is not None: 611 metadata = self.metadata.copy() 612 metadata.columns = [f"metadata_{col}" for col in metadata.columns] 613 output = pd.concat([output, metadata], axis=1) 614 # Combine with the features and prepend "features_" to the column names 615 if self.features is not None: 616 features = self.features.copy() 617 features.columns = [f"features_{col}" for col in features.columns] 618 output = pd.concat([output, features], axis=1) 619 return output 620 621 @classmethod 622 def from_dataframe(cls, df) -> Self: 623 """ 624 From a single, special DataFrame, create an EventArray. 625 :return: a DataFrame with all the data in the EventArray. 626 """ 627 # Split the columns into info, metadata, and features and strip prefix 628 info = df[[col for col in df.columns if col.startswith("info_")]].copy() 629 info.columns = [col.replace("info_", "") for col in info.columns] 630 if info.size == 0: 631 info = None 632 metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy() 633 metadata.columns = [col.replace("metadata_", "") for col in metadata.columns] 634 if metadata.size == 0: 635 metadata = None 636 features = df[[col for col in df.columns if col.startswith("features_")]].copy() 637 features.columns = [col.replace("features_", "") for col in features.columns] 638 if features.size == 0: 639 features = None 640 return cls(info=info, metadata=metadata, features=features) 641 642 @classmethod 643 def from_mask( 644 cls, 645 mask: np.ndarray, 646 slide_id: str, 647 tile_n: int, 648 n_roi: int = 0, 649 include_cell_id: bool = True, 650 images: list[np.ndarray] = None, 651 image_labels: list[str] = None, 652 properties: list[str] = None, 653 ) -> Self: 654 """ 655 Extract events from a mask DataFrame, including metadata and features. 656 :param mask: the mask to extract events from. 657 :param slide_id: the slide ID the mask is from. 658 :param tile_n: the tile number the mask is from. 659 :param n_roi: the ROI number the mask is from. 660 :param include_cell_id: whether to include the cell_id, or numerical 661 mask label, as metadata in the EventArray. 662 :param images: the intensity images to extract features from. 663 :param image_labels: the labels for the intensity images. 664 :param properties: list of properties to extract in addition to the defaults: 665 :return: EventArray corresponding to the mask labels. 666 """ 667 if extract_mask_info is None: 668 raise ModuleNotFoundError( 669 "csi_images.csi_images dependencies not installed. Install csi-images " 670 "with [imageio] option to resolve." 
671 ) 672 # Gather mask_info 673 if images is not None and image_labels is not None: 674 if len(images) != len(image_labels): 675 raise ValueError("Intensity images and labels must match lengths.") 676 677 mask_info = extract_mask_info(mask, images, image_labels, properties) 678 679 if len(mask_info) == 0: 680 return EventArray() 681 682 # Combine provided info and mask info 683 info = pd.DataFrame( 684 { 685 "slide_id": slide_id, 686 "tile": tile_n, 687 "roi": n_roi, 688 "x": mask_info["x"], 689 "y": mask_info["y"], 690 "size": mask_info["size"], 691 }, 692 ) 693 # Extract a metadata column if desired 694 if include_cell_id: 695 metadata = pd.DataFrame({"cell_id": mask_info["id"]}) 696 else: 697 metadata = None 698 # If any additional properties were extracted, add them as features 699 mask_info = mask_info.drop(columns=["id", "x", "y", "size"], errors="ignore") 700 if len(mask_info.columns) > 0: 701 features = mask_info 702 else: 703 features = None 704 return EventArray(info, metadata, features) 705 706 def save_csv(self, output_path: str) -> bool: 707 """ 708 Save the events to an CSV file, including metadata and features. 709 :param output_path: 710 :return: 711 """ 712 self.to_dataframe().to_csv(output_path, index=False) 713 return os.path.exists(output_path) 714 715 @classmethod 716 def load_csv(cls, input_path: str) -> Self: 717 """ 718 Load the events from an CSV file, including metadata and features. 719 :param input_path: 720 :return: 721 """ 722 # Load the CSV file 723 df = pd.read_csv(input_path) 724 return cls.from_dataframe(df) 725 726 def save_hdf5(self, output_path: str) -> bool: 727 """ 728 Save the events to an HDF5 file, including metadata and features. 729 Uses the pandas-provided HDF5 functions for ease, and external compatibility, 730 though these files are slightly harder to view in HDFView or similar. 731 :param output_path: 732 :return: 733 """ 734 # Open the output_path as an HDF5 file 735 with pd.HDFStore(output_path) as store: 736 # Store the dataframes in the HDF5 file 737 if self.info is not None: 738 store.put("info", self.info, index=False) 739 if self.metadata is not None: 740 store.put("metadata", self.metadata, index=False) 741 if self.features is not None: 742 store.put("features", self.features, index=False) 743 return os.path.exists(output_path) 744 745 @classmethod 746 def load_hdf5(cls, input_path: str) -> Self: 747 """ 748 Load the events from an HDF5 file, including metadata and features. 749 :param input_path: 750 :return: 751 """ 752 # Open the input_path as an HDF5 file 753 with pd.HDFStore(input_path) as store: 754 # Load the dataframes from the HDF5 file 755 info = store.get("info") if "info" in store else None 756 metadata = store.get("metadata") if "metadata" in store else None 757 features = store.get("features") if "features" in store else None 758 return cls(info=info, metadata=metadata, features=features) 759 760 def save_ocular(self, output_path: str, event_type: str = "cells"): 761 """ 762 Save the events to an OCULAR file. Relies on the dataframe originating 763 from an OCULAR file (same columns; duplicate metadata/info). 764 :param output_path: 765 :param event_type: 766 :return: 767 """ 768 if pyreadr is None: 769 raise ModuleNotFoundError( 770 "pyreadr not installed. Install pyreadr directly " 771 "or install csi-images with [rds] option to resolve." 772 ) 773 if event_type == "cells": 774 file_stub = "rc-final" 775 elif event_type == "others": 776 file_stub = "others-final" 777 else: 778 raise ValueError("Invalid event type. 
Must be cells or others.") 779 780 # Ensure good metadata 781 metadata = pd.DataFrame( 782 { 783 "slide_id": self.info["slide_id"], 784 "frame_id": self.info["tile"], 785 "cell_id": ( 786 self.metadata["cell_id"] 787 if "cell_id" in self.metadata.columns 788 else range(len(self.info)) 789 ), 790 "cellx": self.info["x"], 791 "celly": self.info["y"], 792 } 793 ) 794 if self.metadata is not None: 795 metadata[self.metadata.columns] = self.metadata.copy() 796 797 # Check for the "ocular_interesting" column 798 if event_type == "cells": 799 if "ocular_interesting" in metadata.columns: 800 interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool) 801 elif "hcpc" in metadata.columns: 802 # Interesting cells don't get an hcpc designation, leaving them as -1 803 interesting_rows = ( 804 metadata["hcpc"].to_numpy() == -1 805 ) # interesting cells 806 else: 807 interesting_rows = [] 808 if sum(interesting_rows) > 0: 809 # Split the metadata into interesting and regular 810 interesting_events = self.rows(interesting_rows) 811 interesting_df = pd.concat( 812 [interesting_events.features, interesting_events.metadata], axis=1 813 ) 814 data_events = self.rows(~interesting_rows) 815 data_df = pd.concat( 816 [data_events.features, data_events.metadata], axis=1 817 ) 818 data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore") 819 820 # Drop particular columns for "interesting" 821 interesting_df = interesting_df.drop( 822 [ 823 "clust", 824 "hcpc", 825 "frame_id", 826 "cell_id", 827 "unique_id", 828 "ocular_interesting", 829 ], 830 axis=1, 831 errors="ignore", 832 ) 833 # Save both .csv and .rds 834 interesting_stub = os.path.join(output_path, "ocular_interesting") 835 interesting_df.to_csv(f"{interesting_stub}.csv") 836 # Suppress pandas FutureWarning 837 with warnings.catch_warnings(): 838 warnings.simplefilter(action="ignore", category=FutureWarning) 839 pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df) 840 else: 841 data_df = pd.concat([self.features, metadata], axis=1) 842 else: 843 # Get all data and reset_index (will copy it) 844 data_df = pd.concat([self.features, metadata], axis=1) 845 846 # Split based on cluster number to conform to *-final[1-4].rds 847 n_clusters = max(data_df["clust"]) + 1 848 split_idx = [round(i * n_clusters / 4) for i in range(5)] 849 for i in range(4): 850 subset = (split_idx[i] <= data_df["clust"]) & ( 851 data_df["clust"] < split_idx[i + 1] 852 ) 853 data_df.loc[subset, "hcpc"] = i + 1 854 subset = data_df[subset].reset_index(drop=True) 855 # Suppress pandas FutureWarning 856 with warnings.catch_warnings(): 857 warnings.simplefilter(action="ignore", category=FutureWarning) 858 pyreadr.write_rds( 859 os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset 860 ) 861 862 # Create new example cell strings 863 data_df["example_cell_id"] = ( 864 data_df["slide_id"] 865 + " " 866 + data_df["frame_id"].astype(str) 867 + " " 868 + data_df["cell_id"].astype(str) 869 + " " 870 + data_df["cellx"].astype(int).astype(str) 871 + " " 872 + data_df["celly"].astype(int).astype(str) 873 ) 874 # Find averagable data columns 875 if "cellcluster_id" in data_df.columns: 876 end_idx = data_df.columns.get_loc("cellcluster_id") 877 else: 878 end_idx = data_df.columns.get_loc("slide_id") 879 avg_cols = data_df.columns[:end_idx].tolist() 880 # Group by cluster and average 881 data_df = data_df.groupby("clust").agg( 882 **{col: (col, "mean") for col in avg_cols}, 883 count=("clust", "size"), # count rows in each cluster 884 example_cells=("example_cell_id", 
lambda x: ",".join(x)), 885 hcpc=("hcpc", lambda x: x.iloc[0]), 886 ) 887 data_df = data_df.reset_index() # Do NOT drop, index is "clust" 888 # Create new columns 889 metadata = pd.DataFrame( 890 { 891 "count": data_df["count"], 892 "example_cells": data_df["example_cells"], 893 "clust": data_df["clust"].astype(int), 894 "hcpc": data_df["hcpc"].astype(int), 895 "id": data_df["clust"].astype(int).astype(str), 896 "cccluster": "0", # Dummy value 897 "ccdistance": 0.0, # Dummy value 898 "rownum": list(range(len(data_df))), 899 "framegroup": 0, # Dummy value 900 } 901 ) 902 # Need to pad the features to 761 columns, as per OCULAR report needs 903 additional_columns = range(len(avg_cols), 761) 904 if len(additional_columns) > 0: 905 padding = pd.DataFrame( 906 np.zeros((len(data_df), len(additional_columns))), 907 columns=[f"pad{i}" for i in additional_columns], 908 ) 909 data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1) 910 else: 911 data_df = pd.concat([data_df[avg_cols], metadata], axis=1) 912 913 # Save the cluster data 914 data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv")) 915 # Suppress pandas FutureWarning 916 with warnings.catch_warnings(): 917 warnings.simplefilter(action="ignore", category=FutureWarning) 918 pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df) 919 920 @classmethod 921 def load_ocular( 922 cls, 923 input_path: str, 924 event_type="cells", 925 cell_data_files=( 926 "rc-final1.rds", 927 "rc-final2.rds", 928 "rc-final3.rds", 929 "rc-final4.rds", 930 "ocular_interesting.rds", 931 ), 932 others_data_files=( 933 "others-final1.rds", 934 "others-final2.rds", 935 "others-final3.rds", 936 "others-final4.rds", 937 ), 938 atlas_data_files=( 939 "ocular_interesting.rds", 940 "ocular_not_interesting.rds", 941 ), 942 drop_common_events=True, 943 log=None, 944 ) -> Self: 945 """ 946 947 :param input_path: 948 :param event_type: 949 :param cell_data_files: 950 :param others_data_files: 951 :param atlas_data_files: 952 :param drop_common_events: 953 :param log: 954 :return: 955 """ 956 if pyreadr is None: 957 raise ModuleNotFoundError( 958 "pyreadr not installed. Install pyreadr directly " 959 "or install csi-images with [rds] option to resolve." 
960 ) 961 # Check if the input path is a directory or a file 962 if os.path.isfile(input_path): 963 data_files = [os.path.basename(input_path)] 964 input_path = os.path.dirname(input_path) 965 if event_type == "cells": 966 data_files = cell_data_files 967 elif event_type == "others": 968 data_files = others_data_files 969 else: 970 raise ValueError("Invalid event type.") 971 972 # Load the data from the OCULAR files 973 file_data = {} 974 for file in data_files: 975 file_path = os.path.join(input_path, file) 976 if not os.path.isfile(file_path): 977 if log is not None: 978 log.warning(f"{file} not found for in {input_path}") 979 continue 980 file_data[file] = pyreadr.read_r(file_path) 981 # Get the DataFrame associated with None (pyreadr dict quirk) 982 file_data[file] = file_data[file][None] 983 if len(file_data[file]) == 0: 984 # File gets dropped from the dict 985 file_data.pop(file) 986 if log is not None: 987 log.warning(f"{file} has no cells") 988 continue 989 990 if log is not None: 991 log.debug(f"{file} has {len(file_data[file])} cells") 992 993 # Drop common cells if requested and in this file 994 if ( 995 file in atlas_data_files 996 and drop_common_events 997 and "catalogue_classification" in file_data[file] 998 ): 999 common_cell_indices = ( 1000 file_data[file]["catalogue_classification"] == "common_cell" 1001 ) 1002 if log is not None: 1003 log.debug( 1004 f"Dropping {int(pd.Series.sum(common_cell_indices))}" 1005 f"common cells from {file}" 1006 ) 1007 file_data[file] = file_data[file][common_cell_indices == False] 1008 1009 if len(file_data[file]) == 0: 1010 # File gets dropped from the dict 1011 file_data.pop(file) 1012 if log is not None: 1013 log.warning(f"{file} has no cells after dropping common cells") 1014 continue 1015 1016 # Extract frame_id and cell_id 1017 # DAPI- events already have frame_id cell_id outside rowname 1018 if event_type == "cells" and "frame_id" not in file_data[file].columns: 1019 file_data[file]["rowname"] = file_data[file]["rowname"].astype("str") 1020 # get frame_id cell_id from rownames column and split into two columns 1021 split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True) 1022 if len(split_res.columns) != 2: 1023 log.warning( 1024 f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}' 1025 ) 1026 # then assign it back to the dataframe 1027 file_data[file][["frame_id", "cell_id"]] = split_res.astype("int") 1028 # reset indexes since they can cause NaN values in concat 1029 file_data[file] = file_data[file].reset_index(drop=True) 1030 1031 # Merge the data from all files 1032 if len(file_data) == 0: 1033 return EventArray() 1034 elif len(file_data) == 1: 1035 data = [file_data[file] for file in file_data.keys()][0] 1036 else: 1037 data = pd.concat(file_data.values()) 1038 1039 if log is not None: 1040 log.debug(f"Gathered a total of {len(data)} events") 1041 1042 # Others is missing the "slide_id". 
Insert it right before "frame_id" column 1043 if event_type == "others" and "slide_id" not in data.columns: 1044 if os.path.basename(input_path) == "ocular": 1045 slide_id = os.path.basename(os.path.dirname(input_path)) 1046 else: 1047 slide_id = "UNKNOWN" 1048 data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id) 1049 1050 # Sort according to ascending cell_id to keep the original, which is in manual_df 1051 data = data.sort_values(by=["cell_id"], ascending=True) 1052 # Filter out duplicates by x & y 1053 data = data.assign( 1054 unique_id=data["slide_id"] 1055 + "_" 1056 + data["frame_id"].astype(str) 1057 + "_" 1058 + data["cellx"].astype(int).astype(str) 1059 + "_" 1060 + data["celly"].astype(int).astype(str) 1061 ) 1062 data = data.drop_duplicates(subset=["unique_id"], keep="first") 1063 # Normal unique_id is with cell_id 1064 data = data.assign( 1065 unique_id=data["slide_id"] 1066 + "_" 1067 + data["frame_id"].astype(str) 1068 + "_" 1069 + data["cell_id"].astype(str) 1070 ) 1071 data = data.reset_index(drop=True) 1072 # All columns up to "slide_id" are features; drop the "slide_id" 1073 features = data.loc[:, :"slide_id"].iloc[:, :-1] 1074 data = data.loc[:, "slide_id":] 1075 # Grab the info columns 1076 info = data[["slide_id", "frame_id", "cellx", "celly"]] 1077 info.columns = ["slide_id", "tile", "x", "y"] 1078 info = info.assign( 1079 roi=0, # OCULAR only works on 1 ROI, as far as known 1080 size=25, # Static, for later montaging 1081 ) 1082 info = info[["slide_id", "tile", "roi", "x", "y", "size"]] 1083 # Metadata has duplicate columns for later convenience 1084 metadata = data 1085 # Certain columns tend to be problematic with mixed data formats... 1086 for col in ["TRITC", "CY5", "FITC"]: 1087 if col in metadata: 1088 labels = { 1089 "False": False, 1090 "True": True, 1091 "FALSE": False, 1092 "TRUE": True, 1093 } 1094 metadata[col] = metadata[col].map(labels).astype(bool) 1095 for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]: 1096 if col in metadata: 1097 metadata[col] = metadata[col].fillna(-1).astype(int) 1098 return EventArray(info, metadata, features)
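The save/load helpers above pair up, so an EventArray can be persisted and restored. Continuing the sketch from the top of the page, under the assumptions that the path is writable and the optional HDF5 (pytables) dependency is installed:

# Persist and restore an EventArray; HDF5 preserves column dtypes better than CSV
array.save_hdf5("/tmp/events.h5")  # hypothetical path
restored = EventArray.load_hdf5("/tmp/events.h5")
assert restored == array  # __eq__ compares the underlying DataFrames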
class Event
A class that represents a single event in a scan, making it easy to evaluate singular events. Required metadata is exposed as attributes, and optional metadata and features are stored as Series.
Event(scan: Scan, tile: Tile, x: int, y: int, size: int = 12, metadata: pd.Series = None, features: pd.Series = None)
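A construction sketch, reusing the scan and tile from the earlier example; the metadata and features values are hypothetical:

import pandas as pd

event = Event(
    scan,
    tile,
    x=512,
    y=400,
    size=16,  # end-to-end size in pixels
    metadata=pd.Series({"cell_id": 7}),
    features=pd.Series({"dapi_mean": 123.4}),
)
print(event)  # "<slide_id>-<tile number>-<x>-<y>"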
Homogeneous transformation matrices for converting between scanner and slide coordinates. The matrices are 3x3, with the final column representing the translation in micrometers (um). For more information, see affine transformations.
Transformations are nominal, and accuracy is not guaranteed; this is due to imperfections in slides and alignment in the scanners. Units are in micrometers.
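For example, the AXIOSCAN7 matrix above translates x by 75000 um while leaving y unchanged; in homogeneous coordinates:

import numpy as np

transform = np.array([
    [1, 0, 75000],
    [0, 1, 0],
    [0, 0, 1],
])
scan_xy = np.array([[10000.0], [5000.0], [1.0]])  # hypothetical scan position, um
slide_xy = transform @ scan_xy
print(float(slide_xy[0][0]), float(slide_xy[1][0]))  # 85000.0 5000.0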
def get_scan_position(self) -> tuple[float, float]
Get the position of the event in the scanner's coordinate frame.
Returns
the scan position of the event in micrometers (um).
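The same computation written out, mirroring the method body (attribute names come from this module's Scan and Tile classes):

# Overall pixel position: in-tile offset plus the tile's grid offset
pixel_x = event.x + scan.tile_width_px * tile.x
pixel_y = event.y + scan.tile_height_px * tile.y
# Convert to micrometers and shift by the ROI origin in the scanner frame
x_um = pixel_x * scan.pixel_size_um + scan.roi[tile.n_roi].origin_x_um
y_um = pixel_y * scan.pixel_size_um + scan.roi[tile.n_roi].origin_y_um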
def get_slide_position(self) -> tuple[float, float]
Get the slide position of the event in micrometers (um).
Returns
the slide position of the event.
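A short sketch, assuming the event's scan has a recognized scanner_id (the method picks the matrix by its prefix); the by-hand version below shows the equivalent math using the class constant:

import numpy as np

slide_x, slide_y = event.get_slide_position()  # um in the slide frame
# Equivalent, selecting the AXIOSCAN7 matrix by hand:
sx, sy = event.get_scan_position()
transform = Event.SCAN_TO_SLIDE_TRANSFORM[Scan.Type.AXIOSCAN7]
slide = transform @ np.array([[sx], [sy], [1]])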
def crop_images(self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True) -> list[np.ndarray]
Get the event crops from already-loaded frame images. This does not read from file, so it is very quick when cropping many events from the same tile. Use this if you're interested in many events.
Parameters
- images: the frame images.
- crop_size: the square size of the image crop to get for this event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns
crop_size x crop_size crops of the event in the provided frames. If the event is too close to the edge, the crop is zero-padded (black border) so that it stays crop_size x crop_size with the event centered.
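A runnable sketch with synthetic frames standing in for real tile images (the image size and channel count here are arbitrary assumptions), reusing the event from the earlier example:

import numpy as np

# Two synthetic single-channel frames in place of real frame images
images = [np.zeros((1024, 1360), dtype=np.uint16) for _ in range(2)]
crops = event.crop_images(images, crop_size=100)
assert all(crop.shape == (100, 100) for crop in crops)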
def extract_images(self, crop_size: int = 100, in_pixels: bool = True) -> list[np.ndarray]
Extract the images from the scan and tile, reading from the file. Called "extract" because it must read and extract the images from file, which is slow. Use this if you're interested in only a few events, as it is inefficient when reading multiple events from the same tile.
Parameters
- crop_size: the square size of the image crop to get for this event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns
a list of cropped images from the scan in the order of the channels.
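A sketch, assuming the event references a scan whose image files are actually available on disk:

# Reads the tile's frames from file, then crops; 50 um is converted to pixels
crops = event.extract_images(crop_size=50, in_pixels=False)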
@classmethod
def extract_images_for_list(cls, events: list[Self], crop_size: int | list[int] = None, in_pixels: bool = True) -> list[list[np.ndarray]]
Get the images for a list of events, ensuring that there is no wasteful reading of the same tile multiple times. This function is more efficient than calling extract_images() for each event.
Parameters
- events: the events to extract images for.
- crop_size: the square size of the image crop to get for each event. Defaults to four times the size of the event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels, and is ignored if crop_size is None.
Returns
a list of lists of cropped images for each event.
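A sketch for the events list from the earlier examples (again assuming the scan's image files exist on disk); each inner list holds one crop per channel:

# Events are processed grouped by tile, so each tile is read only once
crops_per_event = Event.extract_images_for_list(events, crop_size=100)
first_event_channels = crops_per_event[0]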
class EventArray

A class that holds a large number of events' data, making it easy to analyze and manipulate many events at once. It stores the same information as a list of Event objects, split into info, metadata, and features DataFrames.
526 ) 527 # Prepare the metadata and features 528 if ignore_metadata or self.metadata is None: 529 metadata = None 530 else: 531 # This Series creation method is less efficient, 532 # but required for preserving dtypes 533 metadata = pd.Series( 534 {col: self.metadata.loc[i, col] for col in self.metadata.columns}, 535 dtype=object, 536 ) 537 if ignore_features or self.features is None: 538 features = None 539 else: 540 features = pd.Series( 541 {col: self.features.loc[i, col] for col in self.features.columns}, 542 dtype=object, 543 ) 544 # Create the event and append it to the list 545 events.append( 546 Event( 547 scan, 548 Tile(scan, self.info["tile"][i], self.info["roi"][i]), 549 self.info["x"][i], 550 self.info["y"][i], 551 size=self.info["size"][i], 552 metadata=metadata, 553 features=features, 554 ) 555 ) 556 return events 557 558 @classmethod 559 def from_events(cls, events: list[Event]) -> Self: 560 """ 561 Set the events in the EventArray to a new list of events. 562 :param events: the new list of events. 563 """ 564 # Return an empty array if we were passed nothing 565 if events is None or len(events) == 0: 566 return EventArray() 567 # Otherwise, grab the info 568 info = pd.DataFrame( 569 { 570 "slide_id": [event.scan.slide_id for event in events], 571 "tile": [event.tile.n for event in events], 572 "roi": [event.tile.n_roi for event in events], 573 "x": [event.x for event in events], 574 "y": [event.y for event in events], 575 "size": [event.size for event in events], 576 } 577 ) 578 metadata_list = [event.metadata for event in events] 579 # Iterate through and ensure that all metadata is the same shape 580 for metadata in metadata_list: 581 if type(metadata) != type(metadata_list[0]): 582 raise ValueError("All metadata must be the same type.") 583 if metadata is not None and metadata.shape != metadata_list[0].shape: 584 raise ValueError("All metadata must be the same shape.") 585 if metadata_list[0] is None: 586 metadata = None 587 else: 588 metadata = pd.DataFrame(metadata_list) 589 features_list = [event.features for event in events] 590 # Iterate through and ensure that all features are the same shape 591 for features in features_list: 592 if type(features) != type(features_list[0]): 593 raise ValueError("All features must be the same type.") 594 if features is not None and features.shape != features_list[0].shape: 595 raise ValueError("All features must be the same shape.") 596 if features_list[0] is None: 597 features = None 598 else: 599 features = pd.DataFrame(features_list) 600 return EventArray(info=info, metadata=metadata, features=features) 601 602 def to_dataframe(self) -> pd.DataFrame: 603 """ 604 Convert all the data in the EventArray to a single DataFrame. 605 :return: a DataFrame with all the data in the EventArray. 
606 """ 607 # Make a copy of the info DataFrame and prepend "info_" to the column names 608 output = self.info.copy() 609 output.columns = [f"info_{col}" for col in output.columns] 610 # Combine with the metadata and prepend "metadata_" to the column names 611 if self.metadata is not None: 612 metadata = self.metadata.copy() 613 metadata.columns = [f"metadata_{col}" for col in metadata.columns] 614 output = pd.concat([output, metadata], axis=1) 615 # Combine with the features and prepend "features_" to the column names 616 if self.features is not None: 617 features = self.features.copy() 618 features.columns = [f"features_{col}" for col in features.columns] 619 output = pd.concat([output, features], axis=1) 620 return output 621 622 @classmethod 623 def from_dataframe(cls, df) -> Self: 624 """ 625 From a single, special DataFrame, create an EventArray. 626 :return: a DataFrame with all the data in the EventArray. 627 """ 628 # Split the columns into info, metadata, and features and strip prefix 629 info = df[[col for col in df.columns if col.startswith("info_")]].copy() 630 info.columns = [col.replace("info_", "") for col in info.columns] 631 if info.size == 0: 632 info = None 633 metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy() 634 metadata.columns = [col.replace("metadata_", "") for col in metadata.columns] 635 if metadata.size == 0: 636 metadata = None 637 features = df[[col for col in df.columns if col.startswith("features_")]].copy() 638 features.columns = [col.replace("features_", "") for col in features.columns] 639 if features.size == 0: 640 features = None 641 return cls(info=info, metadata=metadata, features=features) 642 643 @classmethod 644 def from_mask( 645 cls, 646 mask: np.ndarray, 647 slide_id: str, 648 tile_n: int, 649 n_roi: int = 0, 650 include_cell_id: bool = True, 651 images: list[np.ndarray] = None, 652 image_labels: list[str] = None, 653 properties: list[str] = None, 654 ) -> Self: 655 """ 656 Extract events from a mask DataFrame, including metadata and features. 657 :param mask: the mask to extract events from. 658 :param slide_id: the slide ID the mask is from. 659 :param tile_n: the tile number the mask is from. 660 :param n_roi: the ROI number the mask is from. 661 :param include_cell_id: whether to include the cell_id, or numerical 662 mask label, as metadata in the EventArray. 663 :param images: the intensity images to extract features from. 664 :param image_labels: the labels for the intensity images. 665 :param properties: list of properties to extract in addition to the defaults: 666 :return: EventArray corresponding to the mask labels. 667 """ 668 if extract_mask_info is None: 669 raise ModuleNotFoundError( 670 "csi_images.csi_images dependencies not installed. Install csi-images " 671 "with [imageio] option to resolve." 
672 ) 673 # Gather mask_info 674 if images is not None and image_labels is not None: 675 if len(images) != len(image_labels): 676 raise ValueError("Intensity images and labels must match lengths.") 677 678 mask_info = extract_mask_info(mask, images, image_labels, properties) 679 680 if len(mask_info) == 0: 681 return EventArray() 682 683 # Combine provided info and mask info 684 info = pd.DataFrame( 685 { 686 "slide_id": slide_id, 687 "tile": tile_n, 688 "roi": n_roi, 689 "x": mask_info["x"], 690 "y": mask_info["y"], 691 "size": mask_info["size"], 692 }, 693 ) 694 # Extract a metadata column if desired 695 if include_cell_id: 696 metadata = pd.DataFrame({"cell_id": mask_info["id"]}) 697 else: 698 metadata = None 699 # If any additional properties were extracted, add them as features 700 mask_info = mask_info.drop(columns=["id", "x", "y", "size"], errors="ignore") 701 if len(mask_info.columns) > 0: 702 features = mask_info 703 else: 704 features = None 705 return EventArray(info, metadata, features) 706 707 def save_csv(self, output_path: str) -> bool: 708 """ 709 Save the events to an CSV file, including metadata and features. 710 :param output_path: 711 :return: 712 """ 713 self.to_dataframe().to_csv(output_path, index=False) 714 return os.path.exists(output_path) 715 716 @classmethod 717 def load_csv(cls, input_path: str) -> Self: 718 """ 719 Load the events from an CSV file, including metadata and features. 720 :param input_path: 721 :return: 722 """ 723 # Load the CSV file 724 df = pd.read_csv(input_path) 725 return cls.from_dataframe(df) 726 727 def save_hdf5(self, output_path: str) -> bool: 728 """ 729 Save the events to an HDF5 file, including metadata and features. 730 Uses the pandas-provided HDF5 functions for ease, and external compatibility, 731 though these files are slightly harder to view in HDFView or similar. 732 :param output_path: 733 :return: 734 """ 735 # Open the output_path as an HDF5 file 736 with pd.HDFStore(output_path) as store: 737 # Store the dataframes in the HDF5 file 738 if self.info is not None: 739 store.put("info", self.info, index=False) 740 if self.metadata is not None: 741 store.put("metadata", self.metadata, index=False) 742 if self.features is not None: 743 store.put("features", self.features, index=False) 744 return os.path.exists(output_path) 745 746 @classmethod 747 def load_hdf5(cls, input_path: str) -> Self: 748 """ 749 Load the events from an HDF5 file, including metadata and features. 750 :param input_path: 751 :return: 752 """ 753 # Open the input_path as an HDF5 file 754 with pd.HDFStore(input_path) as store: 755 # Load the dataframes from the HDF5 file 756 info = store.get("info") if "info" in store else None 757 metadata = store.get("metadata") if "metadata" in store else None 758 features = store.get("features") if "features" in store else None 759 return cls(info=info, metadata=metadata, features=features) 760 761 def save_ocular(self, output_path: str, event_type: str = "cells"): 762 """ 763 Save the events to an OCULAR file. Relies on the dataframe originating 764 from an OCULAR file (same columns; duplicate metadata/info). 765 :param output_path: 766 :param event_type: 767 :return: 768 """ 769 if pyreadr is None: 770 raise ModuleNotFoundError( 771 "pyreadr not installed. Install pyreadr directly " 772 "or install csi-images with [rds] option to resolve." 773 ) 774 if event_type == "cells": 775 file_stub = "rc-final" 776 elif event_type == "others": 777 file_stub = "others-final" 778 else: 779 raise ValueError("Invalid event type. 
Must be cells or others.") 780 781 # Ensure good metadata 782 metadata = pd.DataFrame( 783 { 784 "slide_id": self.info["slide_id"], 785 "frame_id": self.info["tile"], 786 "cell_id": ( 787 self.metadata["cell_id"] 788 if "cell_id" in self.metadata.columns 789 else range(len(self.info)) 790 ), 791 "cellx": self.info["x"], 792 "celly": self.info["y"], 793 } 794 ) 795 if self.metadata is not None: 796 metadata[self.metadata.columns] = self.metadata.copy() 797 798 # Check for the "ocular_interesting" column 799 if event_type == "cells": 800 if "ocular_interesting" in metadata.columns: 801 interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool) 802 elif "hcpc" in metadata.columns: 803 # Interesting cells don't get an hcpc designation, leaving them as -1 804 interesting_rows = ( 805 metadata["hcpc"].to_numpy() == -1 806 ) # interesting cells 807 else: 808 interesting_rows = [] 809 if sum(interesting_rows) > 0: 810 # Split the metadata into interesting and regular 811 interesting_events = self.rows(interesting_rows) 812 interesting_df = pd.concat( 813 [interesting_events.features, interesting_events.metadata], axis=1 814 ) 815 data_events = self.rows(~interesting_rows) 816 data_df = pd.concat( 817 [data_events.features, data_events.metadata], axis=1 818 ) 819 data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore") 820 821 # Drop particular columns for "interesting" 822 interesting_df = interesting_df.drop( 823 [ 824 "clust", 825 "hcpc", 826 "frame_id", 827 "cell_id", 828 "unique_id", 829 "ocular_interesting", 830 ], 831 axis=1, 832 errors="ignore", 833 ) 834 # Save both .csv and .rds 835 interesting_stub = os.path.join(output_path, "ocular_interesting") 836 interesting_df.to_csv(f"{interesting_stub}.csv") 837 # Suppress pandas FutureWarning 838 with warnings.catch_warnings(): 839 warnings.simplefilter(action="ignore", category=FutureWarning) 840 pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df) 841 else: 842 data_df = pd.concat([self.features, metadata], axis=1) 843 else: 844 # Get all data and reset_index (will copy it) 845 data_df = pd.concat([self.features, metadata], axis=1) 846 847 # Split based on cluster number to conform to *-final[1-4].rds 848 n_clusters = max(data_df["clust"]) + 1 849 split_idx = [round(i * n_clusters / 4) for i in range(5)] 850 for i in range(4): 851 subset = (split_idx[i] <= data_df["clust"]) & ( 852 data_df["clust"] < split_idx[i + 1] 853 ) 854 data_df.loc[subset, "hcpc"] = i + 1 855 subset = data_df[subset].reset_index(drop=True) 856 # Suppress pandas FutureWarning 857 with warnings.catch_warnings(): 858 warnings.simplefilter(action="ignore", category=FutureWarning) 859 pyreadr.write_rds( 860 os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset 861 ) 862 863 # Create new example cell strings 864 data_df["example_cell_id"] = ( 865 data_df["slide_id"] 866 + " " 867 + data_df["frame_id"].astype(str) 868 + " " 869 + data_df["cell_id"].astype(str) 870 + " " 871 + data_df["cellx"].astype(int).astype(str) 872 + " " 873 + data_df["celly"].astype(int).astype(str) 874 ) 875 # Find averagable data columns 876 if "cellcluster_id" in data_df.columns: 877 end_idx = data_df.columns.get_loc("cellcluster_id") 878 else: 879 end_idx = data_df.columns.get_loc("slide_id") 880 avg_cols = data_df.columns[:end_idx].tolist() 881 # Group by cluster and average 882 data_df = data_df.groupby("clust").agg( 883 **{col: (col, "mean") for col in avg_cols}, 884 count=("clust", "size"), # count rows in each cluster 885 example_cells=("example_cell_id", 
lambda x: ",".join(x)), 886 hcpc=("hcpc", lambda x: x.iloc[0]), 887 ) 888 data_df = data_df.reset_index() # Do NOT drop, index is "clust" 889 # Create new columns 890 metadata = pd.DataFrame( 891 { 892 "count": data_df["count"], 893 "example_cells": data_df["example_cells"], 894 "clust": data_df["clust"].astype(int), 895 "hcpc": data_df["hcpc"].astype(int), 896 "id": data_df["clust"].astype(int).astype(str), 897 "cccluster": "0", # Dummy value 898 "ccdistance": 0.0, # Dummy value 899 "rownum": list(range(len(data_df))), 900 "framegroup": 0, # Dummy value 901 } 902 ) 903 # Need to pad the features to 761 columns, as per OCULAR report needs 904 additional_columns = range(len(avg_cols), 761) 905 if len(additional_columns) > 0: 906 padding = pd.DataFrame( 907 np.zeros((len(data_df), len(additional_columns))), 908 columns=[f"pad{i}" for i in additional_columns], 909 ) 910 data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1) 911 else: 912 data_df = pd.concat([data_df[avg_cols], metadata], axis=1) 913 914 # Save the cluster data 915 data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv")) 916 # Suppress pandas FutureWarning 917 with warnings.catch_warnings(): 918 warnings.simplefilter(action="ignore", category=FutureWarning) 919 pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df) 920 921 @classmethod 922 def load_ocular( 923 cls, 924 input_path: str, 925 event_type="cells", 926 cell_data_files=( 927 "rc-final1.rds", 928 "rc-final2.rds", 929 "rc-final3.rds", 930 "rc-final4.rds", 931 "ocular_interesting.rds", 932 ), 933 others_data_files=( 934 "others-final1.rds", 935 "others-final2.rds", 936 "others-final3.rds", 937 "others-final4.rds", 938 ), 939 atlas_data_files=( 940 "ocular_interesting.rds", 941 "ocular_not_interesting.rds", 942 ), 943 drop_common_events=True, 944 log=None, 945 ) -> Self: 946 """ 947 948 :param input_path: 949 :param event_type: 950 :param cell_data_files: 951 :param others_data_files: 952 :param atlas_data_files: 953 :param drop_common_events: 954 :param log: 955 :return: 956 """ 957 if pyreadr is None: 958 raise ModuleNotFoundError( 959 "pyreadr not installed. Install pyreadr directly " 960 "or install csi-images with [rds] option to resolve." 
961 ) 962 # Check if the input path is a directory or a file 963 if os.path.isfile(input_path): 964 data_files = [os.path.basename(input_path)] 965 input_path = os.path.dirname(input_path) 966 if event_type == "cells": 967 data_files = cell_data_files 968 elif event_type == "others": 969 data_files = others_data_files 970 else: 971 raise ValueError("Invalid event type.") 972 973 # Load the data from the OCULAR files 974 file_data = {} 975 for file in data_files: 976 file_path = os.path.join(input_path, file) 977 if not os.path.isfile(file_path): 978 if log is not None: 979 log.warning(f"{file} not found for in {input_path}") 980 continue 981 file_data[file] = pyreadr.read_r(file_path) 982 # Get the DataFrame associated with None (pyreadr dict quirk) 983 file_data[file] = file_data[file][None] 984 if len(file_data[file]) == 0: 985 # File gets dropped from the dict 986 file_data.pop(file) 987 if log is not None: 988 log.warning(f"{file} has no cells") 989 continue 990 991 if log is not None: 992 log.debug(f"{file} has {len(file_data[file])} cells") 993 994 # Drop common cells if requested and in this file 995 if ( 996 file in atlas_data_files 997 and drop_common_events 998 and "catalogue_classification" in file_data[file] 999 ): 1000 common_cell_indices = ( 1001 file_data[file]["catalogue_classification"] == "common_cell" 1002 ) 1003 if log is not None: 1004 log.debug( 1005 f"Dropping {int(pd.Series.sum(common_cell_indices))}" 1006 f"common cells from {file}" 1007 ) 1008 file_data[file] = file_data[file][common_cell_indices == False] 1009 1010 if len(file_data[file]) == 0: 1011 # File gets dropped from the dict 1012 file_data.pop(file) 1013 if log is not None: 1014 log.warning(f"{file} has no cells after dropping common cells") 1015 continue 1016 1017 # Extract frame_id and cell_id 1018 # DAPI- events already have frame_id cell_id outside rowname 1019 if event_type == "cells" and "frame_id" not in file_data[file].columns: 1020 file_data[file]["rowname"] = file_data[file]["rowname"].astype("str") 1021 # get frame_id cell_id from rownames column and split into two columns 1022 split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True) 1023 if len(split_res.columns) != 2: 1024 log.warning( 1025 f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}' 1026 ) 1027 # then assign it back to the dataframe 1028 file_data[file][["frame_id", "cell_id"]] = split_res.astype("int") 1029 # reset indexes since they can cause NaN values in concat 1030 file_data[file] = file_data[file].reset_index(drop=True) 1031 1032 # Merge the data from all files 1033 if len(file_data) == 0: 1034 return EventArray() 1035 elif len(file_data) == 1: 1036 data = [file_data[file] for file in file_data.keys()][0] 1037 else: 1038 data = pd.concat(file_data.values()) 1039 1040 if log is not None: 1041 log.debug(f"Gathered a total of {len(data)} events") 1042 1043 # Others is missing the "slide_id". 
Insert it right before "frame_id" column 1044 if event_type == "others" and "slide_id" not in data.columns: 1045 if os.path.basename(input_path) == "ocular": 1046 slide_id = os.path.basename(os.path.dirname(input_path)) 1047 else: 1048 slide_id = "UNKNOWN" 1049 data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id) 1050 1051 # Sort according to ascending cell_id to keep the original, which is in manual_df 1052 data = data.sort_values(by=["cell_id"], ascending=True) 1053 # Filter out duplicates by x & y 1054 data = data.assign( 1055 unique_id=data["slide_id"] 1056 + "_" 1057 + data["frame_id"].astype(str) 1058 + "_" 1059 + data["cellx"].astype(int).astype(str) 1060 + "_" 1061 + data["celly"].astype(int).astype(str) 1062 ) 1063 data = data.drop_duplicates(subset=["unique_id"], keep="first") 1064 # Normal unique_id is with cell_id 1065 data = data.assign( 1066 unique_id=data["slide_id"] 1067 + "_" 1068 + data["frame_id"].astype(str) 1069 + "_" 1070 + data["cell_id"].astype(str) 1071 ) 1072 data = data.reset_index(drop=True) 1073 # All columns up to "slide_id" are features; drop the "slide_id" 1074 features = data.loc[:, :"slide_id"].iloc[:, :-1] 1075 data = data.loc[:, "slide_id":] 1076 # Grab the info columns 1077 info = data[["slide_id", "frame_id", "cellx", "celly"]] 1078 info.columns = ["slide_id", "tile", "x", "y"] 1079 info = info.assign( 1080 roi=0, # OCULAR only works on 1 ROI, as far as known 1081 size=25, # Static, for later montaging 1082 ) 1083 info = info[["slide_id", "tile", "roi", "x", "y", "size"]] 1084 # Metadata has duplicate columns for later convenience 1085 metadata = data 1086 # Certain columns tend to be problematic with mixed data formats... 1087 for col in ["TRITC", "CY5", "FITC"]: 1088 if col in metadata: 1089 labels = { 1090 "False": False, 1091 "True": True, 1092 "FALSE": False, 1093 "TRUE": True, 1094 } 1095 metadata[col] = metadata[col].map(labels).astype(bool) 1096 for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]: 1097 if col in metadata: 1098 metadata[col] = metadata[col].fillna(-1).astype(int) 1099 return EventArray(info, metadata, features)
class EventArray
A class that holds many events' data as three row-aligned DataFrames (info, metadata, and features), making it easy to analyze and manipulate many events at once. A columnar counterpart to the single-object Event class.
EventArray(info: pd.DataFrame = None, metadata: pd.DataFrame = None, features: pd.DataFrame = None)
info must be a DataFrame with exactly the columns "slide_id", "tile", "roi", "x", "y", "size", which are coerced to string and unsigned-integer dtypes on construction; metadata and features, if provided, must have the same number of rows as info.
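To make the layout concrete, here is a minimal sketch of building an EventArray by hand; the slide ID, coordinates, and the "dapi_mean" feature column are invented for illustration.

import pandas as pd

from csi_images.csi_events import EventArray

# One row per event; columns must match EventArray.INFO_COLUMNS exactly
info = pd.DataFrame(
    {
        "slide_id": ["EXAMPLE01", "EXAMPLE01"],  # hypothetical slide ID
        "tile": [0, 5],
        "roi": [0, 0],
        "x": [120, 888],
        "y": [64, 412],
        "size": [12, 12],
    }
)
# Optional tables, row-aligned with info
metadata = pd.DataFrame({"cell_id": [1, 2]})
features = pd.DataFrame({"dapi_mean": [1034.2, 871.9]})  # hypothetical feature

events = EventArray(info, metadata, features)
assert len(events) == 2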
get_sort_order(self, by: str | list[str], ascending: bool | list[bool] = True)
Get the sort order for the EventArray by column(s) in the info, metadata, or features DataFrames.
Parameters
- by: name(s) of the column(s) to sort by.
- ascending: whether to sort in ascending order; may be a list of booleans matching by.
Returns
the row order (a pandas Index) to sort by.
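For example, reusing the events array from the constructor sketch above:

# Row order for descending event size; apply it to any row-aligned table
order = events.get_sort_order("size", ascending=False)
largest_first_info = events.info.loc[order]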
sort(self, by: str | list[str], ascending: bool | list[bool] = True) -> Self
Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
Parameters
- by: name(s) of the column(s) to sort by.
- ascending: whether to sort in ascending order; may be a list of booleans matching by.
Returns
a new, sorted EventArray with a reset index.
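A sketch of sorting by columns that live in different tables; "dapi_mean" is the hypothetical feature column from the constructor example:

# Sort by slide, then by descending feature value; returns a new EventArray
events = events.sort(["slide_id", "dapi_mean"], ascending=[True, False])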
get(self, column_names: int | str | list[int] | list[str]) -> pd.DataFrame
Get a DataFrame with the specified columns from the EventArray, regardless of which of the three tables each column lives in. Returns a copy, not a view.
Parameters
- column_names: the name(s) of the columns to get.
Returns
a DataFrame with the specified columns.
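For example, mixing info and feature columns, continuing the running example:

# Columns are looked up in info, then metadata, then features
positions = events.get(["x", "y", "dapi_mean"])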
rows(self, rows) -> Self
Get a subset of the EventArray's rows based on a boolean or integer index. Returns a copy, not a view.
Parameters
- rows: the rows to select, as a 1D boolean or integer list, array, or Series.
Returns
a new EventArray with the subset of events, reindexed from 0.
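A short sketch of both indexing styles, continuing the running example:

# Boolean mask: keep events with size above a threshold
big_events = events.rows(events.info["size"] > 10)
# Integer labels also work (equal to positions for the default RangeIndex)
first_event = events.rows([0])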
copy(self) -> Self
Create a deep copy of the EventArray.
Returns
a deep copy of the EventArray.
add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None
Add metadata columns to the EventArray, removing the need to check whether .metadata is None first. Existing metadata columns with the same names are overwritten; see the combined sketch after add_features().
Parameters
- new_metadata: the metadata to add; must match the EventArray's length.
add_features(self, new_features: pd.Series | pd.DataFrame) -> None
Add feature columns to the EventArray, removing the need to check whether .features is None first. Existing feature columns with the same names are overwritten.
Parameters
- new_features: the features to add; must match the EventArray's length.
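A combined sketch for add_metadata() and add_features(), continuing the running example; "batch" and "area_um2" are hypothetical columns:

import numpy as np

# Neither table needs to exist beforehand; same-named columns are overwritten
events.add_metadata(pd.Series(["batch1"] * len(events), name="batch"))
events.add_features(pd.DataFrame({"area_um2": np.zeros(len(events))}))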
classmethod merge(cls, events: list[Self]) -> Self
Combine a list of EventArrays into a single EventArray, skipping empty ones.
Parameters
- events: the list of EventArrays to combine.
Returns
a new, merged EventArray.
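For example, concatenating per-tile results (here simulated by merging the running example with a copy of itself):

combined = EventArray.merge([events, events.copy()])
assert len(combined) == 2 * len(events)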
to_events(self, scans: Scan | list[Scan], ignore_missing_scans=True, ignore_metadata=False, ignore_features=False) -> list[Event]
Convert the EventArray into a list of Event objects.
Parameters
- scans: the scan(s) that the events belong to, auto-matched to rows by slide_id. Pass an empty list with ignore_missing_scans=True if you don't care about scan metadata.
- ignore_missing_scans: whether to create placeholder scans for events whose scan is not found; if False, a missing scan raises a ValueError.
- ignore_metadata: whether to skip attaching metadata to the events.
- ignore_features: whether to skip attaching features to the events.
Returns
a list of Event objects.
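A sketch without real scan metadata, continuing the running example; placeholder scans are generated per event:

# With no matching Scan objects, placeholder scans are created
event_objects = events.to_events([], ignore_missing_scans=True)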
classmethod from_events(cls, events: list[Event]) -> Self
Create an EventArray from a list of Event objects. All events must carry metadata and features of the same type and shape.
Parameters
- events: the list of events to convert.
Returns
a new EventArray; empty if events is None or empty.
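Round-tripping the list from the to_events() sketch above:

# Event objects back into a columnar EventArray
round_tripped = EventArray.from_events(event_objects)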
to_dataframe(self) -> pd.DataFrame
Convert all the data in the EventArray to a single DataFrame, prefixing column names with "info_", "metadata_", or "features_" according to their table of origin.
Returns
a DataFrame with all the data in the EventArray.
classmethod from_dataframe(cls, df) -> Self
Create an EventArray from a single DataFrame with "info_", "metadata_", and "features_" column prefixes, as produced by to_dataframe().
Returns
an EventArray with the prefixed columns split back into info, metadata, and features.
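A round-trip sketch, continuing the running example:

df = events.to_dataframe()  # columns become "info_x", "metadata_cell_id", ...
restored = EventArray.from_dataframe(df)
# restored == events should hold, since info dtypes are re-coerced on construction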
classmethod from_mask(cls, mask: np.ndarray, slide_id: str, tile_n: int, n_roi: int = 0, include_cell_id: bool = True, images: list[np.ndarray] = None, image_labels: list[str] = None, properties: list[str] = None) -> Self
Extract events from a labeled mask image, including metadata and features. Requires the csi_images.csi_images dependencies (the [imageio] extra).
Parameters
- mask: the mask to extract events from.
- slide_id: the slide ID the mask is from.
- tile_n: the tile number the mask is from.
- n_roi: the ROI number the mask is from.
- include_cell_id: whether to include the cell_id, or numerical mask label, as metadata in the EventArray.
- images: the intensity images to extract features from.
- image_labels: the labels for the intensity images.
- properties: list of properties to extract in addition to the defaults.
Returns
EventArray corresponding to the mask labels.
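A small sketch with a synthetic mask; requires the [imageio] extra:

import numpy as np

from csi_images.csi_events import EventArray

# Two rectangular "cells", labeled 1 and 2, on an otherwise empty tile
mask = np.zeros((100, 100), dtype=np.uint16)
mask[10:20, 10:20] = 1
mask[60:80, 40:60] = 2
tile_events = EventArray.from_mask(mask, slide_id="EXAMPLE01", tile_n=0)
# Mask labels are kept as tile_events.metadata["cell_id"] by default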
save_csv(self, output_path: str) -> bool
Save the events to a CSV file, including metadata and features.
Parameters
- output_path: the path of the CSV file to write.
Returns
True if the file exists after writing.
classmethod load_csv(cls, input_path: str) -> Self
Load events from a CSV file, including metadata and features.
Parameters
- input_path: the path of the CSV file to read.
Returns
a new EventArray.
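A CSV round-trip sketch, continuing the running example ("events.csv" is an arbitrary path):

events.save_csv("events.csv")
loaded = EventArray.load_csv("events.csv")
# Values round-trip, but metadata/feature dtypes may change through CSV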
save_hdf5(self, output_path: str) -> bool
Save the events to an HDF5 file, including metadata and features. Uses the pandas-provided HDF5 functions for simplicity and external compatibility, though the resulting files are slightly harder to view in HDFView or similar tools.
Parameters
- output_path: the path of the HDF5 file to write.
Returns
True if the file exists after writing.
classmethod load_hdf5(cls, input_path: str) -> Self
Load events from an HDF5 file, including metadata and features.
Parameters
- input_path: the path of the HDF5 file to read.
Returns
a new EventArray.
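An HDF5 round-trip sketch, continuing the running example; pandas' HDF5 support requires PyTables to be installed:

events.save_hdf5("events.h5")
loaded = EventArray.load_hdf5("events.h5")
# loaded == events should hold, as HDF5 preserves DataFrame dtypes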
save_ocular(self, output_path: str, event_type: str = "cells")
Save the events to OCULAR files (.csv and .rds). Relies on the DataFrames originating from an OCULAR file (same columns; duplicated metadata/info). Requires pyreadr (the [rds] extra).
Parameters
- output_path: the directory to write the OCULAR files into.
- event_type: "cells" or "others", selecting the rc-final or others-final file stubs.
classmethod load_ocular(cls, input_path: str, event_type="cells", cell_data_files=("rc-final1.rds", "rc-final2.rds", "rc-final3.rds", "rc-final4.rds", "ocular_interesting.rds"), others_data_files=("others-final1.rds", "others-final2.rds", "others-final3.rds", "others-final4.rds"), atlas_data_files=("ocular_interesting.rds", "ocular_not_interesting.rds"), drop_common_events=True, log=None) -> Self
Load events from OCULAR .rds files, including metadata and features. Requires pyreadr (the [rds] extra).
Parameters
- input_path: the OCULAR output directory, or the path of a single .rds file within it.
- event_type: "cells" or "others", selecting which default file set to load.
- cell_data_files: the file names to load when event_type is "cells".
- others_data_files: the file names to load when event_type is "others".
- atlas_data_files: the file names whose events are subject to common-event dropping.
- drop_common_events: whether to drop events classified as "common_cell" in atlas files.
- log: an optional logger for progress and warning messages.
Returns
a new EventArray, with OCULAR columns duplicated into metadata for convenience.
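A loading sketch; the directory path is hypothetical:

# Load OCULAR "cells" results from a slide's ocular/ output directory
cells = EventArray.load_ocular("/path/to/slide/ocular")
# DAPI-negative events are stored separately as "others"
others = EventArray.load_ocular("/path/to/slide/ocular", event_type="others")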