csi_images.csi_events
Contains the Event class, which represents a single event in a scan. The Event class optionally holds metadata and features. Lists of events with similar metadata or features can be combined into DataFrames for analysis.
The Event class holds the position of the event in the frame, which can be converted to positions in the scanner or slide coordinate frames. See the csi_images.csi_scans documentation page for more information on the coordinate systems.
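A quick sketch of the intended flow, hedged: the `Scan.make_placeholder` and `Tile` signatures are inferred from this module's source below, and the placeholder scan stands in for one loaded from a real scan file.

```python
from csi_images.csi_scans import Scan
from csi_images.csi_tiles import Tile
from csi_images.csi_events import Event, EventArray

# A placeholder scan stands in for one loaded from a real scan file
scan = Scan.make_placeholder("SLIDE-001", 100, 0)
events = [
    Event(scan, Tile(scan, 100, 0), x=512, y=384),
    Event(scan, Tile(scan, 100, 0), x=48, y=900),
]
# Events combine into a DataFrame-backed EventArray for bulk analysis
array = EventArray.from_events(events)
print(len(array), list(array.info.columns))  # 2 ['slide_id', 'tile', 'roi', 'x', 'y']
```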
1""" 2Contains the Event class, which represents a single event in a scan. 3The Event class optionally holds metadata and features. Lists of events with 4similar metadata or features can be combined into DataFrames for analysis. 5 6The Event class holds the position of the event in the frame, which can be converted 7to the position in the scanner or slide coordinate positions. See the 8csi_utils.csi_scans documentation page for more information on the coordinate systems. 9""" 10 11import os 12import math 13import warnings 14from typing import Self, Iterable, Hashable, Sequence 15 16import numpy as np 17import pandas as pd 18 19from .csi_scans import Scan 20from .csi_tiles import Tile 21from .csi_frames import Frame 22 23# Optional dependencies; will raise errors in particular functions if not installed 24try: 25 from .csi_images import extract_mask_info 26except ImportError: 27 extract_mask_info = None 28try: 29 import pyreadr 30except ImportError: 31 pyreadr = None 32 33 34class Event: 35 """ 36 A class that represents a single event in a scan, making it easy to evaluate 37 singular events. Required metadata is exposed as attributes, and optional 38 metadata and features are stored as DataFrames. 39 """ 40 41 SCAN_TO_SLIDE_TRANSFORM = { 42 # Axioscan zero is in the top-right corner instead of top-left 43 Scan.Type.AXIOSCAN7: np.array( 44 [ 45 [1, 0, 75000], 46 [0, 1, 0], 47 [0, 0, 1], 48 ] 49 ), 50 # BZScanner coordinates are a special kind of messed up: 51 # - The slide is upside-down. 52 # - The slide is oriented vertically, with the barcode at the bottom. 53 # - Tiles are numbered from the top-right 54 Scan.Type.BZSCANNER: np.array( 55 [ 56 [0, -1, 75000], 57 [-1, 0, 25000], 58 [0, 0, 1], 59 ] 60 ), 61 } 62 """ 63 Homogeneous transformation matrices for converting between scanner and slide 64 coordinates. The matrices are 3x3, with the final column representing the 65 translation in micrometers (um). For more information, see 66 [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations). 67 68 Transformations are nominal, and accuracy is not guaranteed; this is due to 69 imperfections in slides and alignment in the scanners. Units are in micrometers. 70 """ 71 72 def __init__( 73 self, 74 scan: Scan, 75 tile: Tile, 76 x: int, 77 y: int, 78 metadata: pd.Series = None, 79 features: pd.Series = None, 80 ): 81 self.scan = scan 82 self.tile = tile 83 self.x = int(x) 84 self.y = int(y) 85 self.metadata = metadata 86 self.features = features 87 88 def __repr__(self) -> str: 89 return f"{self.scan.slide_id}-{self.tile.n}-{self.x}-{self.y}" 90 91 def __eq__(self, other) -> bool: 92 return self.__repr__() == other.__repr__() 93 94 def __lt__(self, other): 95 return self.__repr__() < other.__repr__() 96 97 def get_scan_position(self) -> tuple[float, float]: 98 """ 99 Get the position of the event in the scanner's coordinate frame. 100 :return: the scan position of the event in micrometers (um). 
101 """ 102 # Get overall pixel position 103 pixel_x = self.x + (self.scan.tile_width_px * self.tile.x) 104 pixel_y = self.y + (self.scan.tile_height_px * self.tile.y) 105 # Convert to micrometers 106 x_um = pixel_x * self.scan.pixel_size_um 107 y_um = pixel_y * self.scan.pixel_size_um 108 # Add the scan's origin in the scanner frame 109 x_um += self.scan.roi[self.tile.n_roi].origin_x_um 110 y_um += self.scan.roi[self.tile.n_roi].origin_y_um 111 return x_um, y_um 112 113 def get_slide_position(self) -> tuple[float, float]: 114 """ 115 Get the slide position of the event in micrometers (um). 116 :return: the slide position of the event. 117 """ 118 # Turn scan_position into a 3x1 vector 119 scan_position = self.get_scan_position() 120 scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]]) 121 122 # Multiply by the appropriate homogeneous matrix 123 if self.scan.scanner_id.startswith(self.scan.Type.AXIOSCAN7.value): 124 transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.AXIOSCAN7] 125 elif self.scan.scanner_id.startswith(self.scan.Type.BZSCANNER.value): 126 transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.BZSCANNER] 127 else: 128 raise ValueError(f"Scanner type {self.scan.scanner_id} not supported.") 129 slide_position = np.matmul(transform, scan_position) 130 return float(slide_position[0][0]), float(slide_position[1][0]) 131 132 def crop_images( 133 self, images: Iterable[np.ndarray], crop_size: int = 100, in_pixels: bool = True 134 ) -> list[np.ndarray]: 135 """ 136 Get the event crops from the frame images. Called "get" because it does not 137 need to extract anything; it is very quick for extracting multiple events from 138 the same tile. 139 Use this if you're interested in many events. 140 :param images: the frame images. 141 :param crop_size: the square size of the image crop to get for this event. 142 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 143 :return: image_size x image_size crops of the event in the provided frames. If 144 the event is too close to the edge, the crop will be smaller and not centered. 
145 """ 146 # Convert a crop size in micrometers to pixels 147 if not in_pixels: 148 crop_size = round(crop_size / self.scan.pixel_size_um) 149 # Find the crop bounds 150 bounds = [ 151 self.x - (crop_size // 2) + 1, 152 self.y - (crop_size // 2) + 1, 153 self.x + math.ceil(crop_size / 2) + 1, 154 self.y + math.ceil(crop_size / 2) + 1, 155 ] 156 # Determine how much the bounds violate the image size 157 displacements = [ 158 max(0, -bounds[0]), 159 max(0, -bounds[1]), 160 max(0, bounds[2] - images[0].shape[1]), 161 max(0, bounds[3] - images[0].shape[0]), 162 ] 163 # Cap off the bounds 164 bounds = [ 165 max(0, bounds[0]), 166 max(0, bounds[1]), 167 min(images[0].shape[1], bounds[2]), 168 min(images[0].shape[0], bounds[3]), 169 ] 170 171 # Crop the images 172 crops = [] 173 for image in images: 174 # Create a blank image of the right size 175 crop = np.zeros((crop_size, crop_size), dtype=image.dtype) 176 177 # Insert the cropped image into the blank image, leaving a black buffer 178 # around the edges if the crop would go beyond the original image bounds 179 crop[ 180 displacements[1] : crop_size - displacements[3], 181 displacements[0] : crop_size - displacements[2], 182 ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]] 183 crops.append(crop) 184 return crops 185 186 def extract_images( 187 self, 188 crop_size: int = 100, 189 in_pixels: bool = True, 190 input_path: str = None, 191 channels: Iterable[int | str] = None, 192 apply_gain: bool | Iterable[bool] = True, 193 ) -> list[np.ndarray]: 194 """ 195 Extract the images from the scan and tile, reading from the file. Called 196 "extract" because it must read and extract the images from file, which is slow. 197 Use this if you're interested in only a few events, as it is inefficient when 198 reading multiple events from the same tile. 199 :param crop_size: the square size of the image crop to get for this event. 200 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 201 :param input_path: the path to the input images. Defaults to None (uses the scan's path). 202 :param channels: the channels to extract images for. Defaults to all channels. 203 :param apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. Defaults to True. 204 Can be supplied as a list to apply gain to individual channels. 205 :return: a list of cropped images from the scan in the order of the channels. 206 """ 207 frames = Frame.get_frames(self.tile, channels) 208 if isinstance(apply_gain, bool): 209 apply_gain = [apply_gain] * len(frames) 210 images = [f.get_image(input_path, a) for f, a in zip(frames, apply_gain)] 211 return self.crop_images(images, crop_size, in_pixels) 212 213 @classmethod 214 def extract_images_for_list( 215 cls, 216 events: list[Self], 217 crop_size: int | list[int] = 75, 218 in_pixels: bool = True, 219 input_path: str = None, 220 channels: Iterable[int | str] = None, 221 apply_gain: bool | Iterable[bool] = True, 222 ) -> list[list[np.ndarray]]: 223 """ 224 Get the images for a list of events, ensuring that there is no wasteful reading 225 of the same tile multiple times. This function is more efficient than calling 226 extract_event_images for each event. 227 :param events: the events to extract images for. 228 :param crop_size: the square size of the image crop to get for this event. 229 Defaults to four times the size of the event. 230 :param in_pixels: whether the crop size is in pixels or micrometers. 231 Defaults to pixels, and is ignored if crop_size is None. 
        :param input_path: the path to the input images. Will only work for lists of
        events from the same scan. Defaults to None (uses the scan's path).
        :param channels: the channels to extract images for. Defaults to all channels.
        :param apply_gain: whether to apply scanner-calculated gain to the images, if
        not already applied. Defaults to True. Can be supplied as a list to apply gain
        to individual channels.
        :return: a list of lists of cropped images for each event.
        """
        # Validation
        if len(events) == 0:
            return []
        if isinstance(crop_size, int):
            crop_size = [crop_size] * len(events)

        # Get the order of the events when sorted by slide/tile
        order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))

        # Allocate the list to size
        crops = [[]] * len(events)
        last_tile = None
        images = None  # Holds large numpy arrays, so expensive to compare
        # Iterate through in slide/tile sorted order
        for i in order:
            if last_tile != events[i].tile:
                # Gather the frame images, preserving them for the next event
                frames = Frame.get_frames(events[i].tile, channels)
                if isinstance(apply_gain, bool):
                    gain_list = [apply_gain] * len(frames)
                else:
                    gain_list = apply_gain
                images = [f.get_image(input_path, a) for f, a in zip(frames, gain_list)]
                last_tile = events[i].tile
            # Use the frame images to crop the event images
            crops[i] = events[i].crop_images(images, crop_size[i], in_pixels)
        return crops


class EventArray:
    """
    A class that holds a large number of events' data, making it easy to analyze and
    manipulate many events at once. A more separated version of the Event class.
    """

    INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y"]

    def __init__(
        self,
        info: pd.DataFrame = None,
        metadata: pd.DataFrame = None,
        features: pd.DataFrame = None,
    ):
        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y"
        if info is not None:
            if list(info.columns) != self.INFO_COLUMNS:
                raise ValueError(
                    'EventArray.info must have columns "slide_id", "tile", "roi", "x", "y"'
                )
            # Copy first to avoid modifying the original
            info = info.copy()
            # Ensure that the columns are the right types
            info["slide_id"] = info["slide_id"].astype(str)
            info["tile"] = info["tile"].astype(np.uint16)
            info["roi"] = info["roi"].astype(np.uint8)
            info["x"] = info["x"].round().astype(np.uint16)
            info["y"] = info["y"].round().astype(np.uint16)
        # All DataFrames must have the same number of rows
        if metadata is not None and (info is None or len(info) != len(metadata)):
            raise ValueError(
                "If EventArray.metadata is not None, it should match rows with .info"
            )
        if features is not None and (info is None or len(info) != len(features)):
            raise ValueError(
                "If EventArray.features is not None, it should match rows with .info"
            )
        # No columns named "metadata_", "features_", or "None"
        column_names = []
        if metadata is not None:
            column_names += metadata.columns.tolist()
        if features is not None:
            column_names += features.columns.tolist()
        if any([col.lower().startswith("metadata_") for col in column_names]):
            raise ValueError("EventArray column names cannot start with 'metadata_'")
        if any([col.lower().startswith("features_") for col in column_names]):
            raise ValueError("EventArray column names cannot start with 'features_'")
        if any([col.lower() == "none" for col in column_names]):
            raise ValueError("EventArray column names cannot be 'none'")

        self.info = info
        self.metadata = metadata
        self.features = features

    def __len__(self) -> int:
        # Convenience method to get the number of events
        if self.info is None:
            return 0
        else:
            return len(self.info)

    def __eq__(self, other):
        is_equal = True
        # Parse all possibilities for info
        if isinstance(self.info, pd.DataFrame):
            if isinstance(other.info, pd.DataFrame):
                is_equal = self.info.equals(other.info)
                if not is_equal:
                    return False
            else:
                return False
        elif self.info is None:
            if other.info is not None:
                return False

        # Parse all possibilities for metadata
        if isinstance(self.metadata, pd.DataFrame):
            if isinstance(other.metadata, pd.DataFrame):
                is_equal = self.metadata.equals(other.metadata)
                if not is_equal:
                    return False
            else:
                return False
        elif self.metadata is None:
            if other.metadata is not None:
                return False

        # Parse all possibilities for features
        if isinstance(self.features, pd.DataFrame):
            if isinstance(other.features, pd.DataFrame):
                is_equal = self.features.equals(other.features)
                if not is_equal:
                    return False
            else:
                return False
        elif self.features is None:
            if other.features is not None:
                return False

        return is_equal

    def get_sort_order(
        self, by: Hashable | Sequence[Hashable], ascending: bool | Sequence[bool] = True
    ):
        """
        Get the sort order for the EventArray by a column in the info, metadata, or
        features DataFrames.
        :param by: name of the column(s) to sort by.
        :param ascending: whether to sort in ascending order; can be a list to match by.
        :return: the order of the indices to sort by.
        """
        columns = self.get(by)
        return columns.sort_values(by=by, ascending=ascending).index

    def sort(
        self,
        by: Hashable | Sequence[Hashable],
        ascending: bool | Sequence[bool] = True,
    ) -> Self:
        """
        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
        :param by: name of the column(s) to sort by.
        :param ascending: whether to sort in ascending order; can be a list to match by.
        :return: a new, sorted EventArray.
        """
        order = self.get_sort_order(by, ascending)
        info = self.info.loc[order].reset_index(drop=True)
        if self.metadata is not None:
            metadata = self.metadata.loc[order].reset_index(drop=True)
        else:
            metadata = None
        if self.features is not None:
            features = self.features.loc[order].reset_index(drop=True)
        else:
            features = None
        return EventArray(info, metadata, features)

    def get(self, column_names: Hashable | Sequence[Hashable]) -> pd.DataFrame:
        """
        Get a DataFrame with the specified columns from the EventArray, by value.
        :param column_names: the names of the columns to get.
        :return: a DataFrame with the specified columns.
409 """ 410 if isinstance(column_names, Hashable): 411 column_names = [column_names] # Drop into a list for the loop 412 columns = [] 413 for column_name in column_names: 414 if column_name in self.info.columns: 415 columns.append(self.info[column_name]) 416 elif self.metadata is not None and column_name in self.metadata.columns: 417 columns.append(self.metadata[column_name]) 418 elif self.features is not None and column_name in self.features.columns: 419 columns.append(self.features[column_name]) 420 else: 421 raise ValueError(f"Column {column_name} not found in EventArray") 422 return pd.concat(columns, axis=1) 423 424 def rows(self, rows: Sequence[Hashable]) -> Self: 425 """ 426 Get a subset of the EventArray rows based on a boolean or integer index, by value. 427 :param rows: row labels, indices, or boolean mask; anything for .loc[] 428 :return: a new EventArray with the subset of events. 429 """ 430 info = self.info.loc[rows].reset_index(drop=True) 431 if self.metadata is not None: 432 metadata = self.metadata.loc[rows].reset_index(drop=True) 433 else: 434 metadata = None 435 if self.features is not None: 436 features = self.features.loc[rows].reset_index(drop=True) 437 else: 438 features = None 439 return EventArray(info, metadata, features) 440 441 def copy(self) -> Self: 442 """ 443 Create a deep copy of the EventArray. 444 :return: a deep copy of the EventArray. 445 """ 446 return EventArray( 447 info=self.info.copy(), 448 metadata=None if self.metadata is None else self.metadata.copy(), 449 features=None if self.features is None else self.features.copy(), 450 ) 451 452 # TODO: add a "filter" convenience function that takes a column name and values to filter by 453 454 def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None: 455 """ 456 Add metadata to the EventArray. Removes the need to check if metadata is None. 457 Overwrites any existing metadata with the same column names as the new metadata. 458 :param new_metadata: the metadata to add. 459 """ 460 if len(self) != len(new_metadata): 461 raise ValueError("New metadata must match length of existing info") 462 463 if self.metadata is None: 464 self.metadata = new_metadata 465 else: 466 if isinstance(new_metadata, pd.Series): 467 self.metadata[new_metadata.name] = new_metadata 468 else: 469 # It's a DataFrame 470 self.metadata[new_metadata.columns] = new_metadata 471 472 def add_features(self, new_features: pd.Series | pd.DataFrame) -> None: 473 """ 474 Add features to the EventArray. Removes the need to check if features is None. 475 Overwrites any existing features with the same column names as the new features. 476 :param new_features: the features to add. 477 """ 478 if len(self) != len(new_features): 479 raise ValueError("New features must match length of existing info") 480 481 if self.features is None: 482 self.features = new_features 483 else: 484 if isinstance(new_features, pd.Series): 485 self.features[new_features.name] = new_features 486 else: 487 # It's a DataFrame 488 self.features[new_features.columns] = new_features 489 490 @classmethod 491 def merge(cls, events: Iterable[Self]) -> Self: 492 """ 493 Combine EventArrays in a list into a single EventArray. 494 :param events: the new list of events. 
495 """ 496 all_info = [] 497 all_metadata = [] 498 all_features = [] 499 for event_array in events: 500 # Skip empty EventArrays 501 if event_array.info is not None: 502 all_info.append(event_array.info) 503 if event_array.metadata is not None: 504 all_metadata.append(event_array.metadata) 505 if event_array.features is not None: 506 all_features.append(event_array.features) 507 if len(all_info) == 0: 508 return EventArray() 509 else: 510 all_info = pd.concat(all_info, ignore_index=True) 511 if len(all_metadata) == 0: 512 all_metadata = None 513 else: 514 all_metadata = pd.concat(all_metadata, ignore_index=True) 515 if len(all_features) == 0: 516 all_features = None 517 else: 518 all_features = pd.concat(all_features, ignore_index=True) 519 520 return EventArray(all_info, all_metadata, all_features) 521 522 def to_events( 523 self, 524 scans: Scan | Iterable[Scan], 525 ignore_missing_scans=True, 526 ignore_metadata=False, 527 ignore_features=False, 528 ) -> list[Event]: 529 """ 530 Get the events in the EventArray as a list of events. 531 :param scans: the scans that the events belong to, auto-matched by slide_id. 532 Pass None if you don't care about scan metadata (pass ignore_missing_scans). 533 :param ignore_missing_scans: whether to create blank scans for events without scans. 534 :param ignore_metadata: whether to ignore metadata or not 535 :param ignore_features: whether to ignore features or not 536 :return: 537 """ 538 if isinstance(scans, Scan): 539 scans = [scans] 540 scans = {scan.slide_id: scan for scan in scans} 541 events = [] 542 for i in range(len(self.info)): 543 # Determine the associated scan 544 slide_id = self.info["slide_id"][i] 545 if slide_id not in scans: 546 if ignore_missing_scans: 547 # Create a placeholder scan if the scan is missing 548 scan = Scan.make_placeholder( 549 slide_id, 550 self.info["tile"][i], 551 self.info["roi"][i], 552 ) 553 else: 554 raise ValueError( 555 f"Scan {self.info['slide_id'][i]} not found for event {i}." 556 ) 557 else: 558 scan = scans[slide_id] 559 560 # Prepare the metadata and features 561 if ignore_metadata or self.metadata is None: 562 metadata = None 563 else: 564 # This Series creation method is less efficient, 565 # but required for preserving dtypes 566 metadata = pd.Series( 567 {col: self.metadata.loc[i, col] for col in self.metadata.columns}, 568 dtype=object, 569 ) 570 if ignore_features or self.features is None: 571 features = None 572 else: 573 features = pd.Series( 574 {col: self.features.loc[i, col] for col in self.features.columns}, 575 dtype=object, 576 ) 577 # Create the event and append it to the list 578 events.append( 579 Event( 580 scan, 581 Tile(scan, self.info["tile"][i], self.info["roi"][i]), 582 self.info["x"][i], 583 self.info["y"][i], 584 metadata=metadata, 585 features=features, 586 ) 587 ) 588 return events 589 590 @classmethod 591 def from_events(cls, events: Iterable[Event]) -> Self: 592 """ 593 Set the events in the EventArray to a new list of events. 594 :param events: the new list of events. 
595 """ 596 info = pd.DataFrame( 597 { 598 "slide_id": [event.scan.slide_id for event in events], 599 "tile": [event.tile.n for event in events], 600 "roi": [event.tile.n_roi for event in events], 601 "x": [event.x for event in events], 602 "y": [event.y for event in events], 603 } 604 ) 605 metadata_list = [event.metadata for event in events] 606 # Iterate through and ensure that all metadata is the same shape 607 for metadata in metadata_list: 608 if type(metadata) != type(metadata_list[0]): 609 raise ValueError("All metadata must be the same type.") 610 if metadata is not None and metadata.shape != metadata_list[0].shape: 611 raise ValueError("All metadata must be the same shape.") 612 if metadata_list[0] is None: 613 metadata = None 614 else: 615 metadata = pd.DataFrame(metadata_list) 616 features_list = [event.features for event in events] 617 # Iterate through and ensure that all features are the same shape 618 for features in features_list: 619 if type(features) != type(features_list[0]): 620 raise ValueError("All features must be the same type.") 621 if features is not None and features.shape != features_list[0].shape: 622 raise ValueError("All features must be the same shape.") 623 if features_list[0] is None: 624 features = None 625 else: 626 features = pd.DataFrame(features_list) 627 return EventArray(info=info, metadata=metadata, features=features) 628 629 def to_dataframe(self) -> pd.DataFrame: 630 """ 631 Convert all the data in the EventArray to a single DataFrame. 632 :return: a DataFrame with all the data in the EventArray. 633 """ 634 # Make a copy of the info DataFrame and prepend "info_" to the column names 635 output = self.info.copy() 636 # Combine with the metadata and prepend "metadata_" to the column names 637 if self.metadata is not None: 638 metadata = self.metadata.copy() 639 metadata.columns = [f"metadata_{col}" for col in metadata.columns] 640 output = pd.concat([output, metadata], axis=1) 641 # Combine with the features and prepend "features_" to the column names 642 if self.features is not None: 643 features = self.features.copy() 644 features.columns = [f"features_{col}" for col in features.columns] 645 output = pd.concat([output, features], axis=1) 646 return output 647 648 @classmethod 649 def from_dataframe(cls, df) -> Self: 650 """ 651 From a single, special DataFrame, create an EventArray. 652 :return: a DataFrame with all the data in the EventArray. 653 """ 654 # Split the columns into info, metadata, and features and strip prefix 655 info = df[[col for col in df.columns if col in cls.INFO_COLUMNS]].copy() 656 if info.size == 0: 657 info = None 658 metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy() 659 metadata.columns = [col.replace("metadata_", "") for col in metadata.columns] 660 if metadata.size == 0: 661 metadata = None 662 features = df[[col for col in df.columns if col.startswith("features_")]].copy() 663 features.columns = [col.replace("features_", "") for col in features.columns] 664 if features.size == 0: 665 features = None 666 return cls(info=info, metadata=metadata, features=features) 667 668 @classmethod 669 def from_mask( 670 cls, 671 mask: np.ndarray, 672 slide_id: str, 673 tile_n: int, 674 n_roi: int = 0, 675 include_cell_id: bool = True, 676 images: list[np.ndarray] = None, 677 image_labels: list[str] = None, 678 properties: list[str] = None, 679 ) -> Self: 680 """ 681 Extract events from a mask DataFrame, including metadata and features. 682 :param mask: the mask to extract events from. 
        :param slide_id: the slide ID the mask is from.
        :param tile_n: the tile number the mask is from.
        :param n_roi: the ROI number the mask is from.
        :param include_cell_id: whether to include the cell_id, or numerical
        mask label, as metadata in the EventArray.
        :param images: the intensity images to extract features from.
        :param image_labels: the labels for the intensity images.
        :param properties: list of properties to extract in addition to the defaults.
        :return: EventArray corresponding to the mask labels.
        """
        if extract_mask_info is None:
            raise ModuleNotFoundError(
                "csi_images.csi_images dependencies not installed. Install csi-images "
                "with [imageio] option to resolve."
            )
        # Gather mask_info
        if images is not None and image_labels is not None:
            if len(images) != len(image_labels):
                raise ValueError("Intensity images and labels must match lengths.")

        mask_info = extract_mask_info(mask, images, image_labels, properties)

        if len(mask_info) == 0:
            return EventArray()

        # Combine provided info and mask info
        info = pd.DataFrame(
            {
                "slide_id": slide_id,
                "tile": tile_n,
                "roi": n_roi,
                "x": mask_info["x"],
                "y": mask_info["y"],
            },
        )
        # Extract a metadata column if desired
        if include_cell_id:
            metadata = pd.DataFrame({"cell_id": mask_info["id"]})
        else:
            metadata = None
        # If any additional properties were extracted, add them as features
        mask_info = mask_info.drop(columns=["id", "x", "y"], errors="ignore")
        if len(mask_info.columns) > 0:
            features = mask_info
        else:
            features = None
        return EventArray(info, metadata, features)

    def save_csv(self, output_path: str) -> bool:
        """
        Save the events to a CSV file, including metadata and features.
        :param output_path: the path of the CSV file to save.
        :return: whether the file exists after saving.
        """
        self.to_dataframe().to_csv(output_path, index=False)
        return os.path.exists(output_path)

    @classmethod
    def load_csv(cls, input_path: str) -> Self:
        """
        Load the events from a CSV file, including metadata and features.
        :param input_path: the path of the CSV file to load.
        :return: an EventArray with the loaded events.
        """
        # Load the CSV file
        df = pd.read_csv(input_path)
        return cls.from_dataframe(df)

    def save_hdf5(self, output_path: str) -> bool:
        """
        Save the events to an HDF5 file, including metadata and features.
        Uses the pandas-provided HDF5 functions for ease, and external compatibility,
        though these files are slightly harder to view in HDFView or similar.
        :param output_path: the path of the HDF5 file to save.
        :return: whether the file exists after saving.
        """
        # Open the output_path as an HDF5 file
        with pd.HDFStore(output_path) as store:
            # Store the dataframes in the HDF5 file
            if self.info is not None:
                store.put("info", self.info, index=False)
            if self.metadata is not None:
                store.put("metadata", self.metadata, index=False)
            if self.features is not None:
                store.put("features", self.features, index=False)
        return os.path.exists(output_path)

    @classmethod
    def load_hdf5(cls, input_path: str) -> Self:
        """
        Load the events from an HDF5 file, including metadata and features.
        :param input_path: the path of the HDF5 file to load.
        :return: an EventArray with the loaded events.
        """
        # Open the input_path as an HDF5 file
        with pd.HDFStore(input_path, "r") as store:
            # Load the dataframes from the HDF5 file
            info = store.get("info") if "info" in store else None
            metadata = store.get("metadata") if "metadata" in store else None
            features = store.get("features") if "features" in store else None
        return cls(info=info, metadata=metadata, features=features)

    def save_ocular(self, output_path: str, event_type: str = "cells"):
        """
        Save the events to an OCULAR file. Relies on the dataframe originating
        from an OCULAR file (same columns; duplicate metadata/info).
        :param output_path: the directory to save the OCULAR files in.
        :param event_type: "cells" or "others".
        """
        if pyreadr is None:
            raise ModuleNotFoundError(
                "pyreadr not installed. Install pyreadr directly "
                "or install csi-images with [rds] option to resolve."
            )
        if event_type == "cells":
            file_stub = "rc-final"
        elif event_type == "others":
            file_stub = "others-final"
        else:
            raise ValueError("Invalid event type. Must be cells or others.")

        # Ensure good metadata
        metadata = pd.DataFrame(
            {
                "slide_id": self.info["slide_id"],
                "frame_id": self.info["tile"],
                "cell_id": (
                    self.metadata["cell_id"]
                    if self.metadata is not None and "cell_id" in self.metadata.columns
                    else range(len(self.info))
                ),
                "cellx": self.info["x"],
                "celly": self.info["y"],
            }
        )
        if self.metadata is not None:
            metadata[self.metadata.columns] = self.metadata.copy()

        # Check for the "ocular_interesting" column
        if event_type == "cells":
            if "ocular_interesting" in metadata.columns:
                interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool)
            elif "hcpc" in metadata.columns:
                # Interesting cells don't get an hcpc designation, leaving them as -1
                interesting_rows = metadata["hcpc"].to_numpy() == -1
            else:
                interesting_rows = []
            if sum(interesting_rows) > 0:
                # Split the metadata into interesting and regular
                interesting_events = self.rows(interesting_rows)
                interesting_df = pd.concat(
                    [interesting_events.features, interesting_events.metadata], axis=1
                )
                data_events = self.rows(~interesting_rows)
                data_df = pd.concat(
                    [data_events.features, data_events.metadata], axis=1
                )
                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")

                # Drop particular columns for "interesting"
                interesting_df = interesting_df.drop(
                    [
                        "clust",
                        "hcpc",
                        "frame_id",
                        "cell_id",
                        "unique_id",
                        "ocular_interesting",
                    ],
                    axis=1,
                    errors="ignore",
                )
                # Save both .csv and .rds
                interesting_stub = os.path.join(output_path, "ocular_interesting")
                interesting_df.to_csv(f"{interesting_stub}.csv")
                # Suppress pandas FutureWarning
                with warnings.catch_warnings():
                    warnings.simplefilter(action="ignore", category=FutureWarning)
                    pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df)
            else:
                data_df = pd.concat([self.features, metadata], axis=1)
        else:
            # Get all data and reset_index (will copy it)
            data_df = pd.concat([self.features, metadata], axis=1)

        # Split based on cluster number to conform to *-final[1-4].rds
        n_clusters = max(data_df["clust"]) + 1
        split_idx = [round(i * n_clusters / 4) for i in range(5)]
        for i in range(4):
            subset = (split_idx[i] <= data_df["clust"]) & (
                data_df["clust"] < split_idx[i + 1]
            )
            data_df.loc[subset, "hcpc"] = i + 1
"hcpc"] = i + 1 879 subset = data_df[subset].reset_index(drop=True) 880 # Suppress pandas FutureWarning 881 with warnings.catch_warnings(): 882 warnings.simplefilter(action="ignore", category=FutureWarning) 883 pyreadr.write_rds( 884 os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset 885 ) 886 887 # Create new example cell strings 888 data_df["example_cell_id"] = ( 889 data_df["slide_id"] 890 + " " 891 + data_df["frame_id"].astype(str) 892 + " " 893 + data_df["cell_id"].astype(str) 894 + " " 895 + data_df["cellx"].astype(int).astype(str) 896 + " " 897 + data_df["celly"].astype(int).astype(str) 898 ) 899 # Find averagable data columns 900 if "cellcluster_id" in data_df.columns: 901 end_idx = data_df.columns.get_loc("cellcluster_id") 902 else: 903 end_idx = data_df.columns.get_loc("slide_id") 904 avg_cols = data_df.columns[:end_idx].tolist() 905 # Group by cluster and average 906 data_df = data_df.groupby("clust").agg( 907 **{col: (col, "mean") for col in avg_cols}, 908 count=("clust", "size"), # count rows in each cluster 909 example_cells=("example_cell_id", lambda x: ",".join(x)), 910 hcpc=("hcpc", lambda x: x.iloc[0]), 911 ) 912 data_df = data_df.reset_index() # Do NOT drop, index is "clust" 913 # Create new columns 914 metadata = pd.DataFrame( 915 { 916 "count": data_df["count"], 917 "example_cells": data_df["example_cells"], 918 "clust": data_df["clust"].astype(int), 919 "hcpc": data_df["hcpc"].astype(int), 920 "id": data_df["clust"].astype(int).astype(str), 921 "cccluster": "0", # Dummy value 922 "ccdistance": 0.0, # Dummy value 923 "rownum": list(range(len(data_df))), 924 "framegroup": 0, # Dummy value 925 } 926 ) 927 # Need to pad the features to 761 columns, as per OCULAR report needs 928 additional_columns = range(len(avg_cols), 761) 929 if len(additional_columns) > 0: 930 padding = pd.DataFrame( 931 np.zeros((len(data_df), len(additional_columns))), 932 columns=[f"pad{i}" for i in additional_columns], 933 ) 934 data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1) 935 else: 936 data_df = pd.concat([data_df[avg_cols], metadata], axis=1) 937 938 # Save the cluster data 939 data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv")) 940 # Suppress pandas FutureWarning 941 with warnings.catch_warnings(): 942 warnings.simplefilter(action="ignore", category=FutureWarning) 943 pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df) 944 945 @classmethod 946 def load_ocular( 947 cls, 948 input_path: str, 949 event_type="cells", 950 cell_data_files=( 951 "rc-final1.rds", 952 "rc-final2.rds", 953 "rc-final3.rds", 954 "rc-final4.rds", 955 "ocular_interesting.rds", 956 ), 957 others_data_files=( 958 "others-final1.rds", 959 "others-final2.rds", 960 "others-final3.rds", 961 "others-final4.rds", 962 ), 963 atlas_data_files=( 964 "ocular_interesting.rds", 965 "ocular_not_interesting.rds", 966 ), 967 drop_common_events=True, 968 log=None, 969 ) -> Self: 970 """ 971 972 :param input_path: 973 :param event_type: 974 :param cell_data_files: 975 :param others_data_files: 976 :param atlas_data_files: 977 :param drop_common_events: 978 :param log: 979 :return: 980 """ 981 if pyreadr is None: 982 raise ModuleNotFoundError( 983 "pyreadr not installed. Install pyreadr directly " 984 "or install csi-images with [rds] option to resolve." 
985 ) 986 # Check if the input path is a directory or a file 987 if os.path.isfile(input_path): 988 data_files = [os.path.basename(input_path)] 989 input_path = os.path.dirname(input_path) 990 if event_type == "cells": 991 data_files = cell_data_files 992 elif event_type == "others": 993 data_files = others_data_files 994 else: 995 raise ValueError("Invalid event type.") 996 997 # Load the data from the OCULAR files 998 file_data = {} 999 for file in data_files: 1000 file_path = os.path.join(input_path, file) 1001 if not os.path.isfile(file_path): 1002 if log is not None: 1003 log.warning(f"{file} not found for in {input_path}") 1004 continue 1005 file_data[file] = pyreadr.read_r(file_path) 1006 # Get the DataFrame associated with None (pyreadr dict quirk) 1007 file_data[file] = file_data[file][None] 1008 if len(file_data[file]) == 0: 1009 # File gets dropped from the dict 1010 file_data.pop(file) 1011 if log is not None: 1012 log.warning(f"{file} has no cells") 1013 continue 1014 1015 if log is not None: 1016 log.debug(f"{file} has {len(file_data[file])} cells") 1017 1018 # Drop common cells if requested and in this file 1019 if ( 1020 file in atlas_data_files 1021 and drop_common_events 1022 and "catalogue_classification" in file_data[file] 1023 ): 1024 common_cell_indices = ( 1025 file_data[file]["catalogue_classification"] == "common_cell" 1026 ) 1027 if log is not None: 1028 log.debug( 1029 f"Dropping {int(pd.Series.sum(common_cell_indices))}" 1030 f"common cells from {file}" 1031 ) 1032 file_data[file] = file_data[file][common_cell_indices == False] 1033 1034 if len(file_data[file]) == 0: 1035 # File gets dropped from the dict 1036 file_data.pop(file) 1037 if log is not None: 1038 log.warning(f"{file} has no cells after dropping common cells") 1039 continue 1040 1041 # Extract frame_id and cell_id 1042 # DAPI- events already have frame_id cell_id outside rowname 1043 if event_type == "cells" and "frame_id" not in file_data[file].columns: 1044 file_data[file]["rowname"] = file_data[file]["rowname"].astype("str") 1045 # get frame_id cell_id from rownames column and split into two columns 1046 split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True) 1047 if len(split_res.columns) != 2: 1048 log.warning( 1049 f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}' 1050 ) 1051 # then assign it back to the dataframe 1052 file_data[file][["frame_id", "cell_id"]] = split_res.astype("int") 1053 # reset indexes since they can cause NaN values in concat 1054 file_data[file] = file_data[file].reset_index(drop=True) 1055 1056 # Merge the data from all files 1057 if len(file_data) == 0: 1058 return EventArray() 1059 elif len(file_data) == 1: 1060 data = [file_data[file] for file in file_data.keys()][0] 1061 else: 1062 data = pd.concat(file_data.values()) 1063 1064 if log is not None: 1065 log.debug(f"Gathered a total of {len(data)} events") 1066 1067 # Others is missing the "slide_id". 
        if event_type == "others" and "slide_id" not in data.columns:
            if os.path.basename(input_path) == "ocular":
                slide_id = os.path.basename(os.path.dirname(input_path))
            else:
                slide_id = "UNKNOWN"
            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)

        # Sort according to ascending cell_id to keep the original, which is in manual_df
        data = data.sort_values(by=["cell_id"], ascending=True)
        # Filter out duplicates by x & y
        data = data.assign(
            unique_id=data["slide_id"]
            + "_"
            + data["frame_id"].astype(str)
            + "_"
            + data["cellx"].astype(int).astype(str)
            + "_"
            + data["celly"].astype(int).astype(str)
        )
        data = data.drop_duplicates(subset=["unique_id"], keep="first")
        # Normal unique_id is with cell_id
        data = data.assign(
            unique_id=data["slide_id"]
            + "_"
            + data["frame_id"].astype(str)
            + "_"
            + data["cell_id"].astype(str)
        )
        data = data.reset_index(drop=True)
        # All columns up to "slide_id" are features; drop the "slide_id"
        features = data.loc[:, :"slide_id"].iloc[:, :-1]
        data = data.loc[:, "slide_id":]
        # Grab the info columns
        info = data[["slide_id", "frame_id", "cellx", "celly"]]
        info.columns = ["slide_id", "tile", "x", "y"]
        info = info.assign(roi=0)  # OCULAR only works on 1 ROI, as far as known
        info = info[["slide_id", "tile", "roi", "x", "y"]]
        # Metadata has duplicate columns for later convenience
        metadata = data
        # Certain columns tend to be problematic with mixed data formats...
        for col in ["TRITC", "CY5", "FITC"]:
            if col in metadata:
                labels = {
                    "False": False,
                    "True": True,
                    "FALSE": False,
                    "TRUE": True,
                }
                metadata[col] = metadata[col].map(labels).astype(bool)
        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
            if col in metadata:
                metadata[col] = metadata[col].fillna(-1).astype(int)
        return EventArray(info, metadata, features)
```
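The save/load helpers mirror to_dataframe/from_dataframe; a short persistence sketch using the CSV methods above (HDF5 works the same way through save_hdf5/load_hdf5):

```python
import os
import tempfile

import pandas as pd

from csi_images.csi_events import EventArray

info = pd.DataFrame(
    {"slide_id": ["SLIDE-001"], "tile": [100], "roi": [0], "x": [512], "y": [384]}
)
events = EventArray(info=info)

with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "events.csv")
    events.save_csv(path)  # flattens via to_dataframe()
    loaded = EventArray.load_csv(path)
assert loaded == events
```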
class Event(scan: Scan, tile: Tile, x: int, y: int, metadata: pd.Series = None, features: pd.Series = None)
A class that represents a single event in a scan, making it easy to evaluate singular events. Required metadata is exposed as attributes, and optional metadata and features are stored as DataFrames.
SCAN_TO_SLIDE_TRANSFORM

Homogeneous transformation matrices for converting between scanner and slide coordinates. The matrices are 3x3, with the final column representing the translation in micrometers (um). For more information, see [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations).
Transformations are nominal, and accuracy is not guaranteed; this is due to imperfections in slides and alignment in the scanners. Units are in micrometers.
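As a minimal sketch of how these matrices are applied (mirroring get_slide_position below), a scan position becomes a homogeneous column vector and is multiplied through; the input coordinates here are made up:

```python
import numpy as np

# BZScanner matrix from SCAN_TO_SLIDE_TRANSFORM: swaps/negates axes and translates
transform = np.array(
    [
        [0, -1, 75000],
        [-1, 0, 25000],
        [0, 0, 1],
    ]
)

# A scan position in micrometers, written as a homogeneous column vector
scan_position = np.array([[12000.0], [3400.0], [1.0]])
slide_position = transform @ scan_position
print(float(slide_position[0][0]), float(slide_position[1][0]))
# -> 71600.0 13000.0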
def get_scan_position(self) -> tuple[float, float]
Get the position of the event in the scanner's coordinate frame.
Returns
the scan position of the event in micrometers (um).
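The conversion is plain arithmetic; a self-contained sketch with made-up scan values (the pixel size, tile dimensions, grid position, and ROI origin are all assumptions for illustration):

```python
# Assumed values for illustration only
pixel_size_um = 0.325            # scan.pixel_size_um
tile_width_px, tile_height_px = 1360, 1024
tile_grid_x, tile_grid_y = 3, 2  # tile.x, tile.y
origin_x_um, origin_y_um = 1000.0, 2000.0  # ROI origin in the scanner frame
x, y = 500, 300                  # event position within the tile, in pixels

# Overall pixel position in the scan, then micrometers, then add the ROI origin
x_um = (x + tile_width_px * tile_grid_x) * pixel_size_um + origin_x_um
y_um = (y + tile_height_px * tile_grid_y) * pixel_size_um + origin_y_um
print(x_um, y_um)  # -> 2488.5 2763.1
```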
def get_slide_position(self) -> tuple[float, float]
Get the slide position of the event in micrometers (um).
Returns
the slide position of the event.
def crop_images(self, images: Sequence[np.ndarray], crop_size: int = 100, in_pixels: bool = True) -> list[np.ndarray]
Get the event crops from the frame images. Called "get" because it does not need to extract anything; it is very quick for extracting multiple events from the same tile. Use this if you're interested in many events.
Parameters
- images: the frame images.
- crop_size: the square size of the image crop to get for this event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns
crop_size x crop_size crops of the event in the provided frames. If the event is too close to the edge, the crop is padded with black (zeros) so that the event stays centered.
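The padding behavior is easiest to see on a toy array; this standalone sketch mimics the bounds/displacement logic of crop_images on a single image rather than calling the method:

```python
import numpy as np

# Toy frame image with a bright "event" near the top-left corner
image = np.zeros((20, 20), dtype=np.uint16)
image[2, 3] = 65535  # event at x=3, y=2

crop_size = 10
x, y = 3, 2
# Intended bounds (left, top, right, bottom), mirroring crop_images
bounds = [x - crop_size // 2 + 1, y - crop_size // 2 + 1,
          x + crop_size // 2 + 1, y + crop_size // 2 + 1]
# How far the bounds fall outside the image
disp = [max(0, -bounds[0]), max(0, -bounds[1]),
        max(0, bounds[2] - image.shape[1]), max(0, bounds[3] - image.shape[0])]
# Clip the bounds, then paste into a zero (black) canvas at the same offset
bounds = [max(0, bounds[0]), max(0, bounds[1]),
          min(image.shape[1], bounds[2]), min(image.shape[0], bounds[3])]
crop = np.zeros((crop_size, crop_size), dtype=image.dtype)
crop[disp[1]:crop_size - disp[3], disp[0]:crop_size - disp[2]] = \
    image[bounds[1]:bounds[3], bounds[0]:bounds[2]]
print(np.argwhere(crop == 65535))  # -> [[4 4]]: the event stays centered
```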
def extract_images(self, crop_size: int = 100, in_pixels: bool = True, input_path: str = None, channels: Iterable[int | str] = None, apply_gain: bool | Iterable[bool] = True) -> list[np.ndarray]
Extract the images from the scan and tile, reading from the file. Called "extract" because it must read and extract the images from file, which is slow. Use this if you're interested in only a few events, as it is inefficient when reading multiple events from the same tile.
Parameters
- crop_size: the square size of the image crop to get for this event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
- input_path: the path to the input images. Defaults to None (uses the scan's path).
- channels: the channels to extract images for. Defaults to all channels.
- apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. Defaults to True. Can be supplied as a list to apply gain to individual channels.
Returns
a list of cropped images from the scan in the order of the channels.
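A minimal sketch for a single event, again assuming a loaded Scan object `scan` and hypothetical values:

    event = Event(scan, Tile(scan, 100, 0), x=512, y=512)
    # 25 um square crops, one per channel, with scanner gain applied
    crops = event.extract_images(crop_size=25, in_pixels=False)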
214 @classmethod 215 def extract_images_for_list( 216 cls, 217 events: list[Self], 218 crop_size: int | list[int] = 75, 219 in_pixels: bool = True, 220 input_path: str = None, 221 channels: Iterable[int | str] = None, 222 apply_gain: bool | Iterable[bool] = True, 223 ) -> list[list[np.ndarray]]: 224 """ 225 Get the images for a list of events, ensuring that there is no wasteful reading 226 of the same tile multiple times. This function is more efficient than calling 227 extract_event_images for each event. 228 :param events: the events to extract images for. 229 :param crop_size: the square size of the image crop to get for this event. 230 Defaults to four times the size of the event. 231 :param in_pixels: whether the crop size is in pixels or micrometers. 232 Defaults to pixels, and is ignored if crop_size is None. 233 :param input_path: the path to the input images. Will only work for lists of events 234 from the same scan. Defaults to None (uses the scan's path). 235 :param channels: the channels to extract images for. Defaults to all channels. 236 :param apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. Defaults to True. 237 Can be supplied as a list to apply gain to individual channels. 238 :return: a list of lists of cropped images for each event. 239 """ 240 # Validation 241 if len(events) == 0: 242 return [] 243 if isinstance(crop_size, int): 244 crop_size = [crop_size] * len(events) 245 246 # Get the order of the events when sorted by slide/tile 247 order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__())) 248 249 # Allocate the list to size 250 crops = [[]] * len(events) 251 last_tile = None 252 images = None # Holds large numpy arrays, so expensive to compare 253 # Iterate through in slide/tile sorted order 254 for i in order: 255 if last_tile != events[i].tile: 256 # Gather the frame images, preserving them for the next event 257 frames = Frame.get_frames(events[i].tile, channels) 258 if isinstance(apply_gain, bool): 259 gain_list = [apply_gain] * len(frames) 260 else: 261 gain_list = apply_gain 262 images = [f.get_image(input_path, a) for f, a in zip(frames, gain_list)] 263 last_tile = events[i].tile 264 # Use the frame images to crop the event images 265 crops[i] = events[i].crop_images(images, crop_size[i], in_pixels) 266 return crops
Get the images for a list of events, ensuring that there is no wasteful reading of the same tile multiple times. This function is more efficient than calling extract_images() for each event.
Parameters
- events: the events to extract images for.
- crop_size: the square size of the image crop to get for each event; may be a single int or a list with one size per event. Defaults to 75 pixels.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
- input_path: the path to the input images. Will only work for lists of events from the same scan. Defaults to None (uses the scan's path).
- channels: the channels to extract images for. Defaults to all channels.
- apply_gain: whether to apply scanner-calculated gain to the images, if not already applied. Defaults to True. Can be supplied as a list to apply gain to individual channels.
Returns
a list of lists of cropped images for each event.
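A batched-extraction sketch; events that share a tile trigger only one read of that tile's frames (scan and coordinates are hypothetical):

    events = [
        Event(scan, Tile(scan, 100, 0), 512, 512),
        Event(scan, Tile(scan, 100, 0), 30, 40),    # same tile: no extra read
        Event(scan, Tile(scan, 101, 0), 256, 256),
    ]
    all_crops = Event.extract_images_for_list(events, crop_size=75)
    # all_crops[i] holds the per-channel crops for events[i], in input order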
269class EventArray: 270 """ 271 A class that holds a large number of events' data, making it easy to analyze and 272 manipulate many events at once. A more separated version of the Event class. 273 """ 274 275 INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y"] 276 277 def __init__( 278 self, 279 info: pd.DataFrame = None, 280 metadata: pd.DataFrame = None, 281 features: pd.DataFrame = None, 282 ): 283 # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y" 284 if info is not None: 285 if list(info.columns) != self.INFO_COLUMNS: 286 raise ValueError( 287 'EventArray.info must have columns "slide_id", "tile", "roi", "x", "y"' 288 ) 289 # Copy first to avoid modifying the original 290 info = info.copy() 291 # Ensure that the columns are the right types 292 info["slide_id"] = info["slide_id"].astype(str) 293 info["tile"] = info["tile"].astype(np.uint16) 294 info["roi"] = info["roi"].astype(np.uint8) 295 info["x"] = info["x"].round().astype(np.uint16) 296 info["y"] = info["y"].round().astype(np.uint16) 297 # All DataFrames must all have the same number of rows 298 if metadata is not None and (info is None or len(info) != len(metadata)): 299 raise ValueError( 300 "If EventArray.metadata is not None, it should match rows with .info" 301 ) 302 if features is not None and (info is None or len(info) != len(features)): 303 raise ValueError( 304 "If EventArray.features is not None, it should match rows with .info" 305 ) 306 # No columns named "metadata_", "features_", or "None" 307 column_names = [] 308 if metadata is not None: 309 column_names += metadata.columns.tolist() 310 if features is not None: 311 column_names += features.columns.tolist() 312 if any([col.lower().startswith("metadata_") for col in column_names]): 313 raise ValueError("EventArray column names cannot start with 'metadata_'") 314 if any([col.lower().startswith("features_") for col in column_names]): 315 raise ValueError("EventArray column names cannot start with 'features_'") 316 if any([col.lower() == "none" for col in column_names]): 317 raise ValueError("EventArray column names cannot be 'none'") 318 319 self.info = info 320 self.metadata = metadata 321 self.features = features 322 323 def __len__(self) -> int: 324 # Convenience method to get the number of events 325 if self.info is None: 326 return 0 327 else: 328 return len(self.info) 329 330 def __eq__(self, other): 331 is_equal = True 332 # Parse all possibilities for info 333 if isinstance(self.info, pd.DataFrame): 334 if isinstance(other.info, pd.DataFrame): 335 is_equal = self.info.equals(other.info) 336 if not is_equal: 337 return False 338 else: 339 return False 340 elif self.info is None: 341 if other.info is not None: 342 return False 343 344 # Parse all possibilities for metadata 345 if isinstance(self.metadata, pd.DataFrame): 346 if isinstance(other.metadata, pd.DataFrame): 347 is_equal = self.metadata.equals(other.metadata) 348 if not is_equal: 349 return False 350 else: 351 return False 352 elif self.metadata is None: 353 if other.metadata is not None: 354 return False 355 356 # Parse all possibilities for features 357 if isinstance(self.features, pd.DataFrame): 358 if isinstance(other.features, pd.DataFrame): 359 is_equal = self.features.equals(other.features) 360 if not is_equal: 361 return False 362 else: 363 return False 364 elif self.features is None: 365 if other.features is not None: 366 return False 367 368 return is_equal 369 370 def get_sort_order( 371 self, by: Hashable | Sequence[Hashable], ascending: bool | Sequence[bool] = 
True 372 ): 373 """ 374 Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames. 375 :param by: name of the column(s) to sort by. 376 :param ascending: whether to sort in ascending order; can be a list to match by 377 :return: the order of the indices to sort by. 378 """ 379 columns = self.get(by) 380 return columns.sort_values(by=by, ascending=ascending).index 381 382 def sort( 383 self, 384 by: Hashable | Sequence[Hashable], 385 ascending: bool | Sequence[bool] = True, 386 ) -> Self: 387 """ 388 Sort the EventArray by column(s) in the info, metadata, or features DataFrames. 389 :param by: name of the column(s) to sort by. 390 :param ascending: whether to sort in ascending order; can be a list to match by 391 :return: a new, sorted EventArray. 392 """ 393 order = self.get_sort_order(by, ascending) 394 info = self.info.loc[order].reset_index(drop=True) 395 if self.metadata is not None: 396 metadata = self.metadata.loc[order].reset_index(drop=True) 397 else: 398 metadata = None 399 if self.features is not None: 400 features = self.features.loc[order].reset_index(drop=True) 401 else: 402 features = None 403 return EventArray(info, metadata, features) 404 405 def get(self, column_names: Hashable | Sequence[Hashable]) -> pd.DataFrame: 406 """ 407 Get a DataFrame with the specified columns from the EventArray, by value. 408 :param column_names: the names of the columns to get. 409 :return: a DataFrame with the specified columns. 410 """ 411 if isinstance(column_names, Hashable): 412 column_names = [column_names] # Drop into a list for the loop 413 columns = [] 414 for column_name in column_names: 415 if column_name in self.info.columns: 416 columns.append(self.info[column_name]) 417 elif self.metadata is not None and column_name in self.metadata.columns: 418 columns.append(self.metadata[column_name]) 419 elif self.features is not None and column_name in self.features.columns: 420 columns.append(self.features[column_name]) 421 else: 422 raise ValueError(f"Column {column_name} not found in EventArray") 423 return pd.concat(columns, axis=1) 424 425 def rows(self, rows: Sequence[Hashable]) -> Self: 426 """ 427 Get a subset of the EventArray rows based on a boolean or integer index, by value. 428 :param rows: row labels, indices, or boolean mask; anything for .loc[] 429 :return: a new EventArray with the subset of events. 430 """ 431 info = self.info.loc[rows].reset_index(drop=True) 432 if self.metadata is not None: 433 metadata = self.metadata.loc[rows].reset_index(drop=True) 434 else: 435 metadata = None 436 if self.features is not None: 437 features = self.features.loc[rows].reset_index(drop=True) 438 else: 439 features = None 440 return EventArray(info, metadata, features) 441 442 def copy(self) -> Self: 443 """ 444 Create a deep copy of the EventArray. 445 :return: a deep copy of the EventArray. 446 """ 447 return EventArray( 448 info=self.info.copy(), 449 metadata=None if self.metadata is None else self.metadata.copy(), 450 features=None if self.features is None else self.features.copy(), 451 ) 452 453 # TODO: add a "filter" convenience function that takes a column name and values to filter by 454 455 def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None: 456 """ 457 Add metadata to the EventArray. Removes the need to check if metadata is None. 458 Overwrites any existing metadata with the same column names as the new metadata. 459 :param new_metadata: the metadata to add. 
460 """ 461 if len(self) != len(new_metadata): 462 raise ValueError("New metadata must match length of existing info") 463 464 if self.metadata is None: 465 self.metadata = new_metadata 466 else: 467 if isinstance(new_metadata, pd.Series): 468 self.metadata[new_metadata.name] = new_metadata 469 else: 470 # It's a DataFrame 471 self.metadata[new_metadata.columns] = new_metadata 472 473 def add_features(self, new_features: pd.Series | pd.DataFrame) -> None: 474 """ 475 Add features to the EventArray. Removes the need to check if features is None. 476 Overwrites any existing features with the same column names as the new features. 477 :param new_features: the features to add. 478 """ 479 if len(self) != len(new_features): 480 raise ValueError("New features must match length of existing info") 481 482 if self.features is None: 483 self.features = new_features 484 else: 485 if isinstance(new_features, pd.Series): 486 self.features[new_features.name] = new_features 487 else: 488 # It's a DataFrame 489 self.features[new_features.columns] = new_features 490 491 @classmethod 492 def merge(cls, events: Iterable[Self]) -> Self: 493 """ 494 Combine EventArrays in a list into a single EventArray. 495 :param events: the new list of events. 496 """ 497 all_info = [] 498 all_metadata = [] 499 all_features = [] 500 for event_array in events: 501 # Skip empty EventArrays 502 if event_array.info is not None: 503 all_info.append(event_array.info) 504 if event_array.metadata is not None: 505 all_metadata.append(event_array.metadata) 506 if event_array.features is not None: 507 all_features.append(event_array.features) 508 if len(all_info) == 0: 509 return EventArray() 510 else: 511 all_info = pd.concat(all_info, ignore_index=True) 512 if len(all_metadata) == 0: 513 all_metadata = None 514 else: 515 all_metadata = pd.concat(all_metadata, ignore_index=True) 516 if len(all_features) == 0: 517 all_features = None 518 else: 519 all_features = pd.concat(all_features, ignore_index=True) 520 521 return EventArray(all_info, all_metadata, all_features) 522 523 def to_events( 524 self, 525 scans: Scan | Iterable[Scan], 526 ignore_missing_scans=True, 527 ignore_metadata=False, 528 ignore_features=False, 529 ) -> list[Event]: 530 """ 531 Get the events in the EventArray as a list of events. 532 :param scans: the scans that the events belong to, auto-matched by slide_id. 533 Pass None if you don't care about scan metadata (pass ignore_missing_scans). 534 :param ignore_missing_scans: whether to create blank scans for events without scans. 535 :param ignore_metadata: whether to ignore metadata or not 536 :param ignore_features: whether to ignore features or not 537 :return: 538 """ 539 if isinstance(scans, Scan): 540 scans = [scans] 541 scans = {scan.slide_id: scan for scan in scans} 542 events = [] 543 for i in range(len(self.info)): 544 # Determine the associated scan 545 slide_id = self.info["slide_id"][i] 546 if slide_id not in scans: 547 if ignore_missing_scans: 548 # Create a placeholder scan if the scan is missing 549 scan = Scan.make_placeholder( 550 slide_id, 551 self.info["tile"][i], 552 self.info["roi"][i], 553 ) 554 else: 555 raise ValueError( 556 f"Scan {self.info['slide_id'][i]} not found for event {i}." 
557 ) 558 else: 559 scan = scans[slide_id] 560 561 # Prepare the metadata and features 562 if ignore_metadata or self.metadata is None: 563 metadata = None 564 else: 565 # This Series creation method is less efficient, 566 # but required for preserving dtypes 567 metadata = pd.Series( 568 {col: self.metadata.loc[i, col] for col in self.metadata.columns}, 569 dtype=object, 570 ) 571 if ignore_features or self.features is None: 572 features = None 573 else: 574 features = pd.Series( 575 {col: self.features.loc[i, col] for col in self.features.columns}, 576 dtype=object, 577 ) 578 # Create the event and append it to the list 579 events.append( 580 Event( 581 scan, 582 Tile(scan, self.info["tile"][i], self.info["roi"][i]), 583 self.info["x"][i], 584 self.info["y"][i], 585 metadata=metadata, 586 features=features, 587 ) 588 ) 589 return events 590 591 @classmethod 592 def from_events(cls, events: Iterable[Event]) -> Self: 593 """ 594 Set the events in the EventArray to a new list of events. 595 :param events: the new list of events. 596 """ 597 info = pd.DataFrame( 598 { 599 "slide_id": [event.scan.slide_id for event in events], 600 "tile": [event.tile.n for event in events], 601 "roi": [event.tile.n_roi for event in events], 602 "x": [event.x for event in events], 603 "y": [event.y for event in events], 604 } 605 ) 606 metadata_list = [event.metadata for event in events] 607 # Iterate through and ensure that all metadata is the same shape 608 for metadata in metadata_list: 609 if type(metadata) != type(metadata_list[0]): 610 raise ValueError("All metadata must be the same type.") 611 if metadata is not None and metadata.shape != metadata_list[0].shape: 612 raise ValueError("All metadata must be the same shape.") 613 if metadata_list[0] is None: 614 metadata = None 615 else: 616 metadata = pd.DataFrame(metadata_list) 617 features_list = [event.features for event in events] 618 # Iterate through and ensure that all features are the same shape 619 for features in features_list: 620 if type(features) != type(features_list[0]): 621 raise ValueError("All features must be the same type.") 622 if features is not None and features.shape != features_list[0].shape: 623 raise ValueError("All features must be the same shape.") 624 if features_list[0] is None: 625 features = None 626 else: 627 features = pd.DataFrame(features_list) 628 return EventArray(info=info, metadata=metadata, features=features) 629 630 def to_dataframe(self) -> pd.DataFrame: 631 """ 632 Convert all the data in the EventArray to a single DataFrame. 633 :return: a DataFrame with all the data in the EventArray. 634 """ 635 # Make a copy of the info DataFrame and prepend "info_" to the column names 636 output = self.info.copy() 637 # Combine with the metadata and prepend "metadata_" to the column names 638 if self.metadata is not None: 639 metadata = self.metadata.copy() 640 metadata.columns = [f"metadata_{col}" for col in metadata.columns] 641 output = pd.concat([output, metadata], axis=1) 642 # Combine with the features and prepend "features_" to the column names 643 if self.features is not None: 644 features = self.features.copy() 645 features.columns = [f"features_{col}" for col in features.columns] 646 output = pd.concat([output, features], axis=1) 647 return output 648 649 @classmethod 650 def from_dataframe(cls, df) -> Self: 651 """ 652 From a single, special DataFrame, create an EventArray. 653 :return: a DataFrame with all the data in the EventArray. 
654 """ 655 # Split the columns into info, metadata, and features and strip prefix 656 info = df[[col for col in df.columns if col in cls.INFO_COLUMNS]].copy() 657 if info.size == 0: 658 info = None 659 metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy() 660 metadata.columns = [col.replace("metadata_", "") for col in metadata.columns] 661 if metadata.size == 0: 662 metadata = None 663 features = df[[col for col in df.columns if col.startswith("features_")]].copy() 664 features.columns = [col.replace("features_", "") for col in features.columns] 665 if features.size == 0: 666 features = None 667 return cls(info=info, metadata=metadata, features=features) 668 669 @classmethod 670 def from_mask( 671 cls, 672 mask: np.ndarray, 673 slide_id: str, 674 tile_n: int, 675 n_roi: int = 0, 676 include_cell_id: bool = True, 677 images: list[np.ndarray] = None, 678 image_labels: list[str] = None, 679 properties: list[str] = None, 680 ) -> Self: 681 """ 682 Extract events from a mask DataFrame, including metadata and features. 683 :param mask: the mask to extract events from. 684 :param slide_id: the slide ID the mask is from. 685 :param tile_n: the tile number the mask is from. 686 :param n_roi: the ROI number the mask is from. 687 :param include_cell_id: whether to include the cell_id, or numerical 688 mask label, as metadata in the EventArray. 689 :param images: the intensity images to extract features from. 690 :param image_labels: the labels for the intensity images. 691 :param properties: list of properties to extract in addition to the defaults: 692 :return: EventArray corresponding to the mask labels. 693 """ 694 if extract_mask_info is None: 695 raise ModuleNotFoundError( 696 "csi_images.csi_images dependencies not installed. Install csi-images " 697 "with [imageio] option to resolve." 698 ) 699 # Gather mask_info 700 if images is not None and image_labels is not None: 701 if len(images) != len(image_labels): 702 raise ValueError("Intensity images and labels must match lengths.") 703 704 mask_info = extract_mask_info(mask, images, image_labels, properties) 705 706 if len(mask_info) == 0: 707 return EventArray() 708 709 # Combine provided info and mask info 710 info = pd.DataFrame( 711 { 712 "slide_id": slide_id, 713 "tile": tile_n, 714 "roi": n_roi, 715 "x": mask_info["x"], 716 "y": mask_info["y"], 717 }, 718 ) 719 # Extract a metadata column if desired 720 if include_cell_id: 721 metadata = pd.DataFrame({"cell_id": mask_info["id"]}) 722 else: 723 metadata = None 724 # If any additional properties were extracted, add them as features 725 mask_info = mask_info.drop(columns=["id", "x", "y"], errors="ignore") 726 if len(mask_info.columns) > 0: 727 features = mask_info 728 else: 729 features = None 730 return EventArray(info, metadata, features) 731 732 def save_csv(self, output_path: str) -> bool: 733 """ 734 Save the events to an CSV file, including metadata and features. 735 :param output_path: 736 :return: 737 """ 738 self.to_dataframe().to_csv(output_path, index=False) 739 return os.path.exists(output_path) 740 741 @classmethod 742 def load_csv(cls, input_path: str) -> Self: 743 """ 744 Load the events from an CSV file, including metadata and features. 745 :param input_path: 746 :return: 747 """ 748 # Load the CSV file 749 df = pd.read_csv(input_path) 750 return cls.from_dataframe(df) 751 752 def save_hdf5(self, output_path: str) -> bool: 753 """ 754 Save the events to an HDF5 file, including metadata and features. 
755 Uses the pandas-provided HDF5 functions for ease, and external compatibility, 756 though these files are slightly harder to view in HDFView or similar. 757 :param output_path: 758 :return: 759 """ 760 # Open the output_path as an HDF5 file 761 with pd.HDFStore(output_path) as store: 762 # Store the dataframes in the HDF5 file 763 if self.info is not None: 764 store.put("info", self.info, index=False) 765 if self.metadata is not None: 766 store.put("metadata", self.metadata, index=False) 767 if self.features is not None: 768 store.put("features", self.features, index=False) 769 return os.path.exists(output_path) 770 771 @classmethod 772 def load_hdf5(cls, input_path: str) -> Self: 773 """ 774 Load the events from an HDF5 file, including metadata and features. 775 :param input_path: 776 :return: 777 """ 778 # Open the input_path as an HDF5 file 779 with pd.HDFStore(input_path, "r") as store: 780 # Load the dataframes from the HDF5 file 781 info = store.get("info") if "info" in store else None 782 metadata = store.get("metadata") if "metadata" in store else None 783 features = store.get("features") if "features" in store else None 784 return cls(info=info, metadata=metadata, features=features) 785 786 def save_ocular(self, output_path: str, event_type: str = "cells"): 787 """ 788 Save the events to an OCULAR file. Relies on the dataframe originating 789 from an OCULAR file (same columns; duplicate metadata/info). 790 :param output_path: 791 :param event_type: 792 :return: 793 """ 794 if pyreadr is None: 795 raise ModuleNotFoundError( 796 "pyreadr not installed. Install pyreadr directly " 797 "or install csi-images with [rds] option to resolve." 798 ) 799 if event_type == "cells": 800 file_stub = "rc-final" 801 elif event_type == "others": 802 file_stub = "others-final" 803 else: 804 raise ValueError("Invalid event type. 
Must be cells or others.") 805 806 # Ensure good metadata 807 metadata = pd.DataFrame( 808 { 809 "slide_id": self.info["slide_id"], 810 "frame_id": self.info["tile"], 811 "cell_id": ( 812 self.metadata["cell_id"] 813 if "cell_id" in self.metadata.columns 814 else range(len(self.info)) 815 ), 816 "cellx": self.info["x"], 817 "celly": self.info["y"], 818 } 819 ) 820 if self.metadata is not None: 821 metadata[self.metadata.columns] = self.metadata.copy() 822 823 # Check for the "ocular_interesting" column 824 if event_type == "cells": 825 if "ocular_interesting" in metadata.columns: 826 interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool) 827 elif "hcpc" in metadata.columns: 828 # Interesting cells don't get an hcpc designation, leaving them as -1 829 interesting_rows = ( 830 metadata["hcpc"].to_numpy() == -1 831 ) # interesting cells 832 else: 833 interesting_rows = [] 834 if sum(interesting_rows) > 0: 835 # Split the metadata into interesting and regular 836 interesting_events = self.rows(interesting_rows) 837 interesting_df = pd.concat( 838 [interesting_events.features, interesting_events.metadata], axis=1 839 ) 840 data_events = self.rows(~interesting_rows) 841 data_df = pd.concat( 842 [data_events.features, data_events.metadata], axis=1 843 ) 844 data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore") 845 846 # Drop particular columns for "interesting" 847 interesting_df = interesting_df.drop( 848 [ 849 "clust", 850 "hcpc", 851 "frame_id", 852 "cell_id", 853 "unique_id", 854 "ocular_interesting", 855 ], 856 axis=1, 857 errors="ignore", 858 ) 859 # Save both .csv and .rds 860 interesting_stub = os.path.join(output_path, "ocular_interesting") 861 interesting_df.to_csv(f"{interesting_stub}.csv") 862 # Suppress pandas FutureWarning 863 with warnings.catch_warnings(): 864 warnings.simplefilter(action="ignore", category=FutureWarning) 865 pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df) 866 else: 867 data_df = pd.concat([self.features, metadata], axis=1) 868 else: 869 # Get all data and reset_index (will copy it) 870 data_df = pd.concat([self.features, metadata], axis=1) 871 872 # Split based on cluster number to conform to *-final[1-4].rds 873 n_clusters = max(data_df["clust"]) + 1 874 split_idx = [round(i * n_clusters / 4) for i in range(5)] 875 for i in range(4): 876 subset = (split_idx[i] <= data_df["clust"]) & ( 877 data_df["clust"] < split_idx[i + 1] 878 ) 879 data_df.loc[subset, "hcpc"] = i + 1 880 subset = data_df[subset].reset_index(drop=True) 881 # Suppress pandas FutureWarning 882 with warnings.catch_warnings(): 883 warnings.simplefilter(action="ignore", category=FutureWarning) 884 pyreadr.write_rds( 885 os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset 886 ) 887 888 # Create new example cell strings 889 data_df["example_cell_id"] = ( 890 data_df["slide_id"] 891 + " " 892 + data_df["frame_id"].astype(str) 893 + " " 894 + data_df["cell_id"].astype(str) 895 + " " 896 + data_df["cellx"].astype(int).astype(str) 897 + " " 898 + data_df["celly"].astype(int).astype(str) 899 ) 900 # Find averagable data columns 901 if "cellcluster_id" in data_df.columns: 902 end_idx = data_df.columns.get_loc("cellcluster_id") 903 else: 904 end_idx = data_df.columns.get_loc("slide_id") 905 avg_cols = data_df.columns[:end_idx].tolist() 906 # Group by cluster and average 907 data_df = data_df.groupby("clust").agg( 908 **{col: (col, "mean") for col in avg_cols}, 909 count=("clust", "size"), # count rows in each cluster 910 example_cells=("example_cell_id", 
lambda x: ",".join(x)), 911 hcpc=("hcpc", lambda x: x.iloc[0]), 912 ) 913 data_df = data_df.reset_index() # Do NOT drop, index is "clust" 914 # Create new columns 915 metadata = pd.DataFrame( 916 { 917 "count": data_df["count"], 918 "example_cells": data_df["example_cells"], 919 "clust": data_df["clust"].astype(int), 920 "hcpc": data_df["hcpc"].astype(int), 921 "id": data_df["clust"].astype(int).astype(str), 922 "cccluster": "0", # Dummy value 923 "ccdistance": 0.0, # Dummy value 924 "rownum": list(range(len(data_df))), 925 "framegroup": 0, # Dummy value 926 } 927 ) 928 # Need to pad the features to 761 columns, as per OCULAR report needs 929 additional_columns = range(len(avg_cols), 761) 930 if len(additional_columns) > 0: 931 padding = pd.DataFrame( 932 np.zeros((len(data_df), len(additional_columns))), 933 columns=[f"pad{i}" for i in additional_columns], 934 ) 935 data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1) 936 else: 937 data_df = pd.concat([data_df[avg_cols], metadata], axis=1) 938 939 # Save the cluster data 940 data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv")) 941 # Suppress pandas FutureWarning 942 with warnings.catch_warnings(): 943 warnings.simplefilter(action="ignore", category=FutureWarning) 944 pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df) 945 946 @classmethod 947 def load_ocular( 948 cls, 949 input_path: str, 950 event_type="cells", 951 cell_data_files=( 952 "rc-final1.rds", 953 "rc-final2.rds", 954 "rc-final3.rds", 955 "rc-final4.rds", 956 "ocular_interesting.rds", 957 ), 958 others_data_files=( 959 "others-final1.rds", 960 "others-final2.rds", 961 "others-final3.rds", 962 "others-final4.rds", 963 ), 964 atlas_data_files=( 965 "ocular_interesting.rds", 966 "ocular_not_interesting.rds", 967 ), 968 drop_common_events=True, 969 log=None, 970 ) -> Self: 971 """ 972 973 :param input_path: 974 :param event_type: 975 :param cell_data_files: 976 :param others_data_files: 977 :param atlas_data_files: 978 :param drop_common_events: 979 :param log: 980 :return: 981 """ 982 if pyreadr is None: 983 raise ModuleNotFoundError( 984 "pyreadr not installed. Install pyreadr directly " 985 "or install csi-images with [rds] option to resolve." 
986 ) 987 # Check if the input path is a directory or a file 988 if os.path.isfile(input_path): 989 data_files = [os.path.basename(input_path)] 990 input_path = os.path.dirname(input_path) 991 if event_type == "cells": 992 data_files = cell_data_files 993 elif event_type == "others": 994 data_files = others_data_files 995 else: 996 raise ValueError("Invalid event type.") 997 998 # Load the data from the OCULAR files 999 file_data = {} 1000 for file in data_files: 1001 file_path = os.path.join(input_path, file) 1002 if not os.path.isfile(file_path): 1003 if log is not None: 1004 log.warning(f"{file} not found for in {input_path}") 1005 continue 1006 file_data[file] = pyreadr.read_r(file_path) 1007 # Get the DataFrame associated with None (pyreadr dict quirk) 1008 file_data[file] = file_data[file][None] 1009 if len(file_data[file]) == 0: 1010 # File gets dropped from the dict 1011 file_data.pop(file) 1012 if log is not None: 1013 log.warning(f"{file} has no cells") 1014 continue 1015 1016 if log is not None: 1017 log.debug(f"{file} has {len(file_data[file])} cells") 1018 1019 # Drop common cells if requested and in this file 1020 if ( 1021 file in atlas_data_files 1022 and drop_common_events 1023 and "catalogue_classification" in file_data[file] 1024 ): 1025 common_cell_indices = ( 1026 file_data[file]["catalogue_classification"] == "common_cell" 1027 ) 1028 if log is not None: 1029 log.debug( 1030 f"Dropping {int(pd.Series.sum(common_cell_indices))}" 1031 f"common cells from {file}" 1032 ) 1033 file_data[file] = file_data[file][common_cell_indices == False] 1034 1035 if len(file_data[file]) == 0: 1036 # File gets dropped from the dict 1037 file_data.pop(file) 1038 if log is not None: 1039 log.warning(f"{file} has no cells after dropping common cells") 1040 continue 1041 1042 # Extract frame_id and cell_id 1043 # DAPI- events already have frame_id cell_id outside rowname 1044 if event_type == "cells" and "frame_id" not in file_data[file].columns: 1045 file_data[file]["rowname"] = file_data[file]["rowname"].astype("str") 1046 # get frame_id cell_id from rownames column and split into two columns 1047 split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True) 1048 if len(split_res.columns) != 2: 1049 log.warning( 1050 f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}' 1051 ) 1052 # then assign it back to the dataframe 1053 file_data[file][["frame_id", "cell_id"]] = split_res.astype("int") 1054 # reset indexes since they can cause NaN values in concat 1055 file_data[file] = file_data[file].reset_index(drop=True) 1056 1057 # Merge the data from all files 1058 if len(file_data) == 0: 1059 return EventArray() 1060 elif len(file_data) == 1: 1061 data = [file_data[file] for file in file_data.keys()][0] 1062 else: 1063 data = pd.concat(file_data.values()) 1064 1065 if log is not None: 1066 log.debug(f"Gathered a total of {len(data)} events") 1067 1068 # Others is missing the "slide_id". 
Insert it right before "frame_id" column 1069 if event_type == "others" and "slide_id" not in data.columns: 1070 if os.path.basename(input_path) == "ocular": 1071 slide_id = os.path.basename(os.path.dirname(input_path)) 1072 else: 1073 slide_id = "UNKNOWN" 1074 data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id) 1075 1076 # Sort according to ascending cell_id to keep the original, which is in manual_df 1077 data = data.sort_values(by=["cell_id"], ascending=True) 1078 # Filter out duplicates by x & y 1079 data = data.assign( 1080 unique_id=data["slide_id"] 1081 + "_" 1082 + data["frame_id"].astype(str) 1083 + "_" 1084 + data["cellx"].astype(int).astype(str) 1085 + "_" 1086 + data["celly"].astype(int).astype(str) 1087 ) 1088 data = data.drop_duplicates(subset=["unique_id"], keep="first") 1089 # Normal unique_id is with cell_id 1090 data = data.assign( 1091 unique_id=data["slide_id"] 1092 + "_" 1093 + data["frame_id"].astype(str) 1094 + "_" 1095 + data["cell_id"].astype(str) 1096 ) 1097 data = data.reset_index(drop=True) 1098 # All columns up to "slide_id" are features; drop the "slide_id" 1099 features = data.loc[:, :"slide_id"].iloc[:, :-1] 1100 data = data.loc[:, "slide_id":] 1101 # Grab the info columns 1102 info = data[["slide_id", "frame_id", "cellx", "celly"]] 1103 info.columns = ["slide_id", "tile", "x", "y"] 1104 info = info.assign(roi=0) # OCULAR only works on 1 ROI, as far as known 1105 info = info[["slide_id", "tile", "roi", "x", "y"]] 1106 # Metadata has duplicate columns for later convenience 1107 metadata = data 1108 # Certain columns tend to be problematic with mixed data formats... 1109 for col in ["TRITC", "CY5", "FITC"]: 1110 if col in metadata: 1111 labels = { 1112 "False": False, 1113 "True": True, 1114 "FALSE": False, 1115 "TRUE": True, 1116 } 1117 metadata[col] = metadata[col].map(labels).astype(bool) 1118 for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]: 1119 if col in metadata: 1120 metadata[col] = metadata[col].fillna(-1).astype(int) 1121 return EventArray(info, metadata, features)
A class that holds a large number of events' data in columnar form, making it easy to analyze and manipulate many events at once. The info, metadata, and features that an Event holds together are kept as separate, row-aligned DataFrames.
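A minimal construction sketch; the info columns and their order are fixed, while metadata and features are optional but must match info row-for-row (all values hypothetical):

    import pandas as pd
    from csi_images.csi_events import EventArray

    info = pd.DataFrame(
        {
            "slide_id": ["SLIDE01", "SLIDE01"],
            "tile": [100, 101],
            "roi": [0, 0],
            "x": [512, 256],
            "y": [512, 256],
        }
    )
    features = pd.DataFrame({"dapi_mean": [0.5, 0.7]})
    events = EventArray(info, metadata=None, features=features)
    assert len(events) == 2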
277 def __init__( 278 self, 279 info: pd.DataFrame = None, 280 metadata: pd.DataFrame = None, 281 features: pd.DataFrame = None, 282 ): 283 # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y" 284 if info is not None: 285 if list(info.columns) != self.INFO_COLUMNS: 286 raise ValueError( 287 'EventArray.info must have columns "slide_id", "tile", "roi", "x", "y"' 288 ) 289 # Copy first to avoid modifying the original 290 info = info.copy() 291 # Ensure that the columns are the right types 292 info["slide_id"] = info["slide_id"].astype(str) 293 info["tile"] = info["tile"].astype(np.uint16) 294 info["roi"] = info["roi"].astype(np.uint8) 295 info["x"] = info["x"].round().astype(np.uint16) 296 info["y"] = info["y"].round().astype(np.uint16) 297 # All DataFrames must all have the same number of rows 298 if metadata is not None and (info is None or len(info) != len(metadata)): 299 raise ValueError( 300 "If EventArray.metadata is not None, it should match rows with .info" 301 ) 302 if features is not None and (info is None or len(info) != len(features)): 303 raise ValueError( 304 "If EventArray.features is not None, it should match rows with .info" 305 ) 306 # No columns named "metadata_", "features_", or "None" 307 column_names = [] 308 if metadata is not None: 309 column_names += metadata.columns.tolist() 310 if features is not None: 311 column_names += features.columns.tolist() 312 if any([col.lower().startswith("metadata_") for col in column_names]): 313 raise ValueError("EventArray column names cannot start with 'metadata_'") 314 if any([col.lower().startswith("features_") for col in column_names]): 315 raise ValueError("EventArray column names cannot start with 'features_'") 316 if any([col.lower() == "none" for col in column_names]): 317 raise ValueError("EventArray column names cannot be 'none'") 318 319 self.info = info 320 self.metadata = metadata 321 self.features = features
370 def get_sort_order( 371 self, by: Hashable | Sequence[Hashable], ascending: bool | Sequence[bool] = True 372 ): 373 """ 374 Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames. 375 :param by: name of the column(s) to sort by. 376 :param ascending: whether to sort in ascending order; can be a list to match by 377 :return: the order of the indices to sort by. 378 """ 379 columns = self.get(by) 380 return columns.sort_values(by=by, ascending=ascending).index
Get the sort order for the EventArray by column(s) in the info, metadata, or features DataFrames.
Parameters
- by: name of the column(s) to sort by.
- ascending: whether to sort in ascending order; may be a list matching the columns in by.
Returns
the row index of the EventArray in sorted order, suitable for .loc[].
382 def sort( 383 self, 384 by: Hashable | Sequence[Hashable], 385 ascending: bool | Sequence[bool] = True, 386 ) -> Self: 387 """ 388 Sort the EventArray by column(s) in the info, metadata, or features DataFrames. 389 :param by: name of the column(s) to sort by. 390 :param ascending: whether to sort in ascending order; can be a list to match by 391 :return: a new, sorted EventArray. 392 """ 393 order = self.get_sort_order(by, ascending) 394 info = self.info.loc[order].reset_index(drop=True) 395 if self.metadata is not None: 396 metadata = self.metadata.loc[order].reset_index(drop=True) 397 else: 398 metadata = None 399 if self.features is not None: 400 features = self.features.loc[order].reset_index(drop=True) 401 else: 402 features = None 403 return EventArray(info, metadata, features)
Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
Parameters
- by: name of the column(s) to sort by.
- ascending: whether to sort in ascending order; may be a list matching the columns in by.
Returns
a new, sorted EventArray.
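For example, continuing the hypothetical two-row array above:

    # Sort by a feature column, descending; `by` may name any column from
    # the info, metadata, or features DataFrames.
    events = events.sort("dapi_mean", ascending=False)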
405 def get(self, column_names: Hashable | Sequence[Hashable]) -> pd.DataFrame: 406 """ 407 Get a DataFrame with the specified columns from the EventArray, by value. 408 :param column_names: the names of the columns to get. 409 :return: a DataFrame with the specified columns. 410 """ 411 if isinstance(column_names, Hashable): 412 column_names = [column_names] # Drop into a list for the loop 413 columns = [] 414 for column_name in column_names: 415 if column_name in self.info.columns: 416 columns.append(self.info[column_name]) 417 elif self.metadata is not None and column_name in self.metadata.columns: 418 columns.append(self.metadata[column_name]) 419 elif self.features is not None and column_name in self.features.columns: 420 columns.append(self.features[column_name]) 421 else: 422 raise ValueError(f"Column {column_name} not found in EventArray") 423 return pd.concat(columns, axis=1)
Get a DataFrame with the specified columns from the EventArray, by value (the result is a copy).
Parameters
- column_names: the names of the columns to get.
Returns
a DataFrame with the specified columns.
425 def rows(self, rows: Sequence[Hashable]) -> Self: 426 """ 427 Get a subset of the EventArray rows based on a boolean or integer index, by value. 428 :param rows: row labels, indices, or boolean mask; anything for .loc[] 429 :return: a new EventArray with the subset of events. 430 """ 431 info = self.info.loc[rows].reset_index(drop=True) 432 if self.metadata is not None: 433 metadata = self.metadata.loc[rows].reset_index(drop=True) 434 else: 435 metadata = None 436 if self.features is not None: 437 features = self.features.loc[rows].reset_index(drop=True) 438 else: 439 features = None 440 return EventArray(info, metadata, features)
Get a subset of the EventArray's rows based on labels, indices, or a boolean mask, by value (the result is a copy).
Parameters
- rows: row labels, integer indices, or a boolean mask; anything accepted by .loc[]
Returns
a new EventArray with the subset of events.
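Together with get(), this allows boolean filtering, a stand-in for the filter convenience noted as a TODO in the source (continuing the hypothetical array above):

    # Keep only events whose dapi_mean exceeds a threshold
    mask = events.get("dapi_mean")["dapi_mean"] > 0.6
    bright = events.rows(mask)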
442 def copy(self) -> Self: 443 """ 444 Create a deep copy of the EventArray. 445 :return: a deep copy of the EventArray. 446 """ 447 return EventArray( 448 info=self.info.copy(), 449 metadata=None if self.metadata is None else self.metadata.copy(), 450 features=None if self.features is None else self.features.copy(), 451 )
Create a deep copy of the EventArray.
Returns
a deep copy of the EventArray.
455 def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None: 456 """ 457 Add metadata to the EventArray. Removes the need to check if metadata is None. 458 Overwrites any existing metadata with the same column names as the new metadata. 459 :param new_metadata: the metadata to add. 460 """ 461 if len(self) != len(new_metadata): 462 raise ValueError("New metadata must match length of existing info") 463 464 if self.metadata is None: 465 self.metadata = new_metadata 466 else: 467 if isinstance(new_metadata, pd.Series): 468 self.metadata[new_metadata.name] = new_metadata 469 else: 470 # It's a DataFrame 471 self.metadata[new_metadata.columns] = new_metadata
Add metadata to the EventArray. Removes the need to check if metadata is None. Overwrites any existing metadata with the same column names as the new metadata.
Parameters
- new_metadata: the metadata to add.
473 def add_features(self, new_features: pd.Series | pd.DataFrame) -> None: 474 """ 475 Add features to the EventArray. Removes the need to check if features is None. 476 Overwrites any existing features with the same column names as the new features. 477 :param new_features: the features to add. 478 """ 479 if len(self) != len(new_features): 480 raise ValueError("New features must match length of existing info") 481 482 if self.features is None: 483 self.features = new_features 484 else: 485 if isinstance(new_features, pd.Series): 486 self.features[new_features.name] = new_features 487 else: 488 # It's a DataFrame 489 self.features[new_features.columns] = new_features
Add features to the EventArray. Removes the need to check if features is None. Overwrites any existing features with the same column names as the new features.
Parameters
- new_features: the features to add.
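A sketch of attaching columns after construction, continuing the hypothetical two-row array; lengths must match the info DataFrame:

    # Works whether or not metadata/features already exist; same-named
    # columns are overwritten.
    events.add_metadata(pd.DataFrame({"label": ["A", "B"]}))
    events.add_features(pd.Series([120.0, 98.0], name="area_px"))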
491 @classmethod 492 def merge(cls, events: Iterable[Self]) -> Self: 493 """ 494 Combine EventArrays in a list into a single EventArray. 495 :param events: the new list of events. 496 """ 497 all_info = [] 498 all_metadata = [] 499 all_features = [] 500 for event_array in events: 501 # Skip empty EventArrays 502 if event_array.info is not None: 503 all_info.append(event_array.info) 504 if event_array.metadata is not None: 505 all_metadata.append(event_array.metadata) 506 if event_array.features is not None: 507 all_features.append(event_array.features) 508 if len(all_info) == 0: 509 return EventArray() 510 else: 511 all_info = pd.concat(all_info, ignore_index=True) 512 if len(all_metadata) == 0: 513 all_metadata = None 514 else: 515 all_metadata = pd.concat(all_metadata, ignore_index=True) 516 if len(all_features) == 0: 517 all_features = None 518 else: 519 all_features = pd.concat(all_features, ignore_index=True) 520 521 return EventArray(all_info, all_metadata, all_features)
Combine EventArrays in a list into a single EventArray.
Parameters
- events: the EventArrays to merge; empty EventArrays are skipped.
Returns
a single EventArray containing all events, concatenated in order.
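A sketch; empty EventArrays contribute nothing to the result:

    combined = EventArray.merge([events, EventArray()])
    assert len(combined) == len(events)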
523 def to_events( 524 self, 525 scans: Scan | Iterable[Scan], 526 ignore_missing_scans=True, 527 ignore_metadata=False, 528 ignore_features=False, 529 ) -> list[Event]: 530 """ 531 Get the events in the EventArray as a list of events. 532 :param scans: the scans that the events belong to, auto-matched by slide_id. 533 Pass None if you don't care about scan metadata (pass ignore_missing_scans). 534 :param ignore_missing_scans: whether to create blank scans for events without scans. 535 :param ignore_metadata: whether to ignore metadata or not 536 :param ignore_features: whether to ignore features or not 537 :return: 538 """ 539 if isinstance(scans, Scan): 540 scans = [scans] 541 scans = {scan.slide_id: scan for scan in scans} 542 events = [] 543 for i in range(len(self.info)): 544 # Determine the associated scan 545 slide_id = self.info["slide_id"][i] 546 if slide_id not in scans: 547 if ignore_missing_scans: 548 # Create a placeholder scan if the scan is missing 549 scan = Scan.make_placeholder( 550 slide_id, 551 self.info["tile"][i], 552 self.info["roi"][i], 553 ) 554 else: 555 raise ValueError( 556 f"Scan {self.info['slide_id'][i]} not found for event {i}." 557 ) 558 else: 559 scan = scans[slide_id] 560 561 # Prepare the metadata and features 562 if ignore_metadata or self.metadata is None: 563 metadata = None 564 else: 565 # This Series creation method is less efficient, 566 # but required for preserving dtypes 567 metadata = pd.Series( 568 {col: self.metadata.loc[i, col] for col in self.metadata.columns}, 569 dtype=object, 570 ) 571 if ignore_features or self.features is None: 572 features = None 573 else: 574 features = pd.Series( 575 {col: self.features.loc[i, col] for col in self.features.columns}, 576 dtype=object, 577 ) 578 # Create the event and append it to the list 579 events.append( 580 Event( 581 scan, 582 Tile(scan, self.info["tile"][i], self.info["roi"][i]), 583 self.info["x"][i], 584 self.info["y"][i], 585 metadata=metadata, 586 features=features, 587 ) 588 ) 589 return events
Get the events in the EventArray as a list of events.
Parameters
- scans: the scans that the events belong to, auto-matched by slide_id. Pass an empty list with ignore_missing_scans=True if you don't care about scan metadata; placeholder scans will be created instead.
- ignore_missing_scans: whether to create placeholder scans for events whose scan was not provided; if False, a missing scan raises a ValueError.
- ignore_metadata: whether to skip attaching metadata to the events.
- ignore_features: whether to skip attaching features to the events.
Returns
a list of Event objects, one per row of the EventArray.
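A sketch, continuing the hypothetical array above; with an empty scan list and ignore_missing_scans=True, a placeholder scan is created from each event's slide_id:

    event_list = events.to_events([], ignore_missing_scans=True)
    # each element is an Event with .scan, .tile, .x, .y plus its row's
    # metadata and features as pandas Series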
591 @classmethod 592 def from_events(cls, events: Iterable[Event]) -> Self: 593 """ 594 Set the events in the EventArray to a new list of events. 595 :param events: the new list of events. 596 """ 597 info = pd.DataFrame( 598 { 599 "slide_id": [event.scan.slide_id for event in events], 600 "tile": [event.tile.n for event in events], 601 "roi": [event.tile.n_roi for event in events], 602 "x": [event.x for event in events], 603 "y": [event.y for event in events], 604 } 605 ) 606 metadata_list = [event.metadata for event in events] 607 # Iterate through and ensure that all metadata is the same shape 608 for metadata in metadata_list: 609 if type(metadata) != type(metadata_list[0]): 610 raise ValueError("All metadata must be the same type.") 611 if metadata is not None and metadata.shape != metadata_list[0].shape: 612 raise ValueError("All metadata must be the same shape.") 613 if metadata_list[0] is None: 614 metadata = None 615 else: 616 metadata = pd.DataFrame(metadata_list) 617 features_list = [event.features for event in events] 618 # Iterate through and ensure that all features are the same shape 619 for features in features_list: 620 if type(features) != type(features_list[0]): 621 raise ValueError("All features must be the same type.") 622 if features is not None and features.shape != features_list[0].shape: 623 raise ValueError("All features must be the same shape.") 624 if features_list[0] is None: 625 features = None 626 else: 627 features = pd.DataFrame(features_list) 628 return EventArray(info=info, metadata=metadata, features=features)
Create a new EventArray from a list of events.
Parameters
- events: the events to convert; all events must have metadata and features of the same type and shape.
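And the inverse direction, round-tripping through Event objects:

    array_again = EventArray.from_events(event_list)
    assert len(array_again) == len(events)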
630 def to_dataframe(self) -> pd.DataFrame: 631 """ 632 Convert all the data in the EventArray to a single DataFrame. 633 :return: a DataFrame with all the data in the EventArray. 634 """ 635 # Make a copy of the info DataFrame and prepend "info_" to the column names 636 output = self.info.copy() 637 # Combine with the metadata and prepend "metadata_" to the column names 638 if self.metadata is not None: 639 metadata = self.metadata.copy() 640 metadata.columns = [f"metadata_{col}" for col in metadata.columns] 641 output = pd.concat([output, metadata], axis=1) 642 # Combine with the features and prepend "features_" to the column names 643 if self.features is not None: 644 features = self.features.copy() 645 features.columns = [f"features_{col}" for col in features.columns] 646 output = pd.concat([output, features], axis=1) 647 return output
Convert all the data in the EventArray to a single DataFrame.
Returns
a DataFrame with all the data in the EventArray.
649 @classmethod 650 def from_dataframe(cls, df) -> Self: 651 """ 652 From a single, special DataFrame, create an EventArray. 653 :return: a DataFrame with all the data in the EventArray. 654 """ 655 # Split the columns into info, metadata, and features and strip prefix 656 info = df[[col for col in df.columns if col in cls.INFO_COLUMNS]].copy() 657 if info.size == 0: 658 info = None 659 metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy() 660 metadata.columns = [col.replace("metadata_", "") for col in metadata.columns] 661 if metadata.size == 0: 662 metadata = None 663 features = df[[col for col in df.columns if col.startswith("features_")]].copy() 664 features.columns = [col.replace("features_", "") for col in features.columns] 665 if features.size == 0: 666 features = None 667 return cls(info=info, metadata=metadata, features=features)
Create an EventArray from a single combined DataFrame, as produced by to_dataframe(): the info columns plus "metadata_"- and "features_"-prefixed columns.
Returns
an EventArray populated from the DataFrame's columns.
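A round-trip sketch through the combined DataFrame form, which should reproduce the original exactly since dtypes are re-cast on construction:

    df = events.to_dataframe()  # info columns plus metadata_* / features_*
    same = EventArray.from_dataframe(df)
    assert same == events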
669 @classmethod 670 def from_mask( 671 cls, 672 mask: np.ndarray, 673 slide_id: str, 674 tile_n: int, 675 n_roi: int = 0, 676 include_cell_id: bool = True, 677 images: list[np.ndarray] = None, 678 image_labels: list[str] = None, 679 properties: list[str] = None, 680 ) -> Self: 681 """ 682 Extract events from a mask DataFrame, including metadata and features. 683 :param mask: the mask to extract events from. 684 :param slide_id: the slide ID the mask is from. 685 :param tile_n: the tile number the mask is from. 686 :param n_roi: the ROI number the mask is from. 687 :param include_cell_id: whether to include the cell_id, or numerical 688 mask label, as metadata in the EventArray. 689 :param images: the intensity images to extract features from. 690 :param image_labels: the labels for the intensity images. 691 :param properties: list of properties to extract in addition to the defaults: 692 :return: EventArray corresponding to the mask labels. 693 """ 694 if extract_mask_info is None: 695 raise ModuleNotFoundError( 696 "csi_images.csi_images dependencies not installed. Install csi-images " 697 "with [imageio] option to resolve." 698 ) 699 # Gather mask_info 700 if images is not None and image_labels is not None: 701 if len(images) != len(image_labels): 702 raise ValueError("Intensity images and labels must match lengths.") 703 704 mask_info = extract_mask_info(mask, images, image_labels, properties) 705 706 if len(mask_info) == 0: 707 return EventArray() 708 709 # Combine provided info and mask info 710 info = pd.DataFrame( 711 { 712 "slide_id": slide_id, 713 "tile": tile_n, 714 "roi": n_roi, 715 "x": mask_info["x"], 716 "y": mask_info["y"], 717 }, 718 ) 719 # Extract a metadata column if desired 720 if include_cell_id: 721 metadata = pd.DataFrame({"cell_id": mask_info["id"]}) 722 else: 723 metadata = None 724 # If any additional properties were extracted, add them as features 725 mask_info = mask_info.drop(columns=["id", "x", "y"], errors="ignore") 726 if len(mask_info.columns) > 0: 727 features = mask_info 728 else: 729 features = None 730 return EventArray(info, metadata, features)
Extract events from a labeled event mask, including metadata and features.
Parameters
- mask: the mask to extract events from.
- slide_id: the slide ID the mask is from.
- tile_n: the tile number the mask is from.
- n_roi: the ROI number the mask is from.
- include_cell_id: whether to include the cell_id, or numerical mask label, as metadata in the EventArray.
- images: the intensity images to extract features from.
- image_labels: the labels for the intensity images.
- properties: list of properties to extract in addition to the defaults.
Returns
EventArray corresponding to the mask labels.
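A minimal sketch with a synthetic mask; requires the [imageio] extra for extract_mask_info, and the slide/tile identifiers are hypothetical:

    import numpy as np

    mask = np.zeros((1024, 1024), dtype=np.uint16)
    mask[100:110, 200:210] = 1  # one labeled object
    tile_events = EventArray.from_mask(mask, slide_id="SLIDE01", tile_n=100)
    # tile_events.metadata["cell_id"] should hold the mask label (1)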
732 def save_csv(self, output_path: str) -> bool: 733 """ 734 Save the events to an CSV file, including metadata and features. 735 :param output_path: 736 :return: 737 """ 738 self.to_dataframe().to_csv(output_path, index=False) 739 return os.path.exists(output_path)
Save the events to a CSV file, including metadata and features.
Parameters
- output_path: the path of the CSV file to write.
Returns
whether the output file exists after writing.
741 @classmethod 742 def load_csv(cls, input_path: str) -> Self: 743 """ 744 Load the events from an CSV file, including metadata and features. 745 :param input_path: 746 :return: 747 """ 748 # Load the CSV file 749 df = pd.read_csv(input_path) 750 return cls.from_dataframe(df)
Load the events from a CSV file, including metadata and features.
Parameters
- input_path: the path of the CSV file to read.
Returns
an EventArray with the loaded events.
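A round-trip sketch with a hypothetical path; note that CSV does not preserve all dtypes, so the reloaded array may not compare exactly equal:

    events.save_csv("events.csv")
    reloaded = EventArray.load_csv("events.csv")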
752 def save_hdf5(self, output_path: str) -> bool: 753 """ 754 Save the events to an HDF5 file, including metadata and features. 755 Uses the pandas-provided HDF5 functions for ease, and external compatibility, 756 though these files are slightly harder to view in HDFView or similar. 757 :param output_path: 758 :return: 759 """ 760 # Open the output_path as an HDF5 file 761 with pd.HDFStore(output_path) as store: 762 # Store the dataframes in the HDF5 file 763 if self.info is not None: 764 store.put("info", self.info, index=False) 765 if self.metadata is not None: 766 store.put("metadata", self.metadata, index=False) 767 if self.features is not None: 768 store.put("features", self.features, index=False) 769 return os.path.exists(output_path)
Save the events to an HDF5 file, including metadata and features. Uses the pandas-provided HDF5 functions for simplicity and external compatibility, though the resulting files are slightly harder to view in HDFView or similar tools.
Parameters
- output_path: the path of the HDF5 file to write.
Returns
whether the output file exists after writing.
771 @classmethod 772 def load_hdf5(cls, input_path: str) -> Self: 773 """ 774 Load the events from an HDF5 file, including metadata and features. 775 :param input_path: 776 :return: 777 """ 778 # Open the input_path as an HDF5 file 779 with pd.HDFStore(input_path, "r") as store: 780 # Load the dataframes from the HDF5 file 781 info = store.get("info") if "info" in store else None 782 metadata = store.get("metadata") if "metadata" in store else None 783 features = store.get("features") if "features" in store else None 784 return cls(info=info, metadata=metadata, features=features)
Load the events from an HDF5 file, including metadata and features.
Parameters
- input_path: the path of the HDF5 file to read.
Returns
an EventArray with the loaded events.
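A round-trip sketch with a hypothetical path; pandas' HDF5 support requires the pytables package:

    events.save_hdf5("events.h5")
    reloaded = EventArray.load_hdf5("events.h5")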
    def save_ocular(self, output_path: str, event_type: str = "cells"):
        """
        Save the events to OCULAR files. Relies on the DataFrame originating
        from an OCULAR file (same columns; duplicate metadata/info).
        :param output_path: directory to write the OCULAR .csv and .rds files to.
        :param event_type: "cells" or "others"; determines the output file names.
        :return: None; files are written to output_path.
        """
        if pyreadr is None:
            raise ModuleNotFoundError(
                "pyreadr not installed. Install pyreadr directly "
                "or install csi-images with [rds] option to resolve."
            )
        if event_type == "cells":
            file_stub = "rc-final"
        elif event_type == "others":
            file_stub = "others-final"
        else:
            raise ValueError('Invalid event type. Must be "cells" or "others".')

        # Ensure good metadata
        metadata = pd.DataFrame(
            {
                "slide_id": self.info["slide_id"],
                "frame_id": self.info["tile"],
                "cell_id": (
                    self.metadata["cell_id"]
                    if self.metadata is not None and "cell_id" in self.metadata.columns
                    else range(len(self.info))
                ),
                "cellx": self.info["x"],
                "celly": self.info["y"],
            }
        )
        if self.metadata is not None:
            metadata[self.metadata.columns] = self.metadata.copy()

        # Check for the "ocular_interesting" column
        if event_type == "cells":
            if "ocular_interesting" in metadata.columns:
                interesting_rows = metadata["ocular_interesting"].to_numpy(dtype=bool)
            elif "hcpc" in metadata.columns:
                # Interesting cells don't get an hcpc designation, leaving them as -1
                interesting_rows = metadata["hcpc"].to_numpy() == -1
            else:
                interesting_rows = np.zeros(len(metadata), dtype=bool)
            if interesting_rows.sum() > 0:
                # Split the events into interesting and regular
                interesting_events = self.rows(interesting_rows)
                interesting_df = pd.concat(
                    [interesting_events.features, interesting_events.metadata], axis=1
                )
                data_events = self.rows(~interesting_rows)
                data_df = pd.concat(
                    [data_events.features, data_events.metadata], axis=1
                )
                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")

                # Drop particular columns for "interesting"
                interesting_df = interesting_df.drop(
                    [
                        "clust",
                        "hcpc",
                        "frame_id",
                        "cell_id",
                        "unique_id",
                        "ocular_interesting",
                    ],
                    axis=1,
                    errors="ignore",
                )
                # Save both .csv and .rds
                interesting_stub = os.path.join(output_path, "ocular_interesting")
                interesting_df.to_csv(f"{interesting_stub}.csv")
                # Suppress the pandas FutureWarning raised inside pyreadr
                with warnings.catch_warnings():
                    warnings.simplefilter(action="ignore", category=FutureWarning)
                    pyreadr.write_rds(f"{interesting_stub}.rds", interesting_df)
            else:
                data_df = pd.concat([self.features, metadata], axis=1)
        else:
            # Get all data and reset_index (will copy it)
            data_df = pd.concat([self.features, metadata], axis=1)

        # Split based on cluster number to conform to *-final[1-4].rds
        n_clusters = int(data_df["clust"].max()) + 1
        split_idx = [round(i * n_clusters / 4) for i in range(5)]
        for i in range(4):
            in_split = (split_idx[i] <= data_df["clust"]) & (
                data_df["clust"] < split_idx[i + 1]
            )
            data_df.loc[in_split, "hcpc"] = i + 1
            subset = data_df[in_split].reset_index(drop=True)
            # Suppress the pandas FutureWarning raised inside pyreadr
            with warnings.catch_warnings():
                warnings.simplefilter(action="ignore", category=FutureWarning)
                pyreadr.write_rds(
                    os.path.join(output_path, f"{file_stub}{i + 1}.rds"), subset
                )

        # Create new example cell strings
        data_df["example_cell_id"] = (
            data_df["slide_id"]
            + " "
            + data_df["frame_id"].astype(str)
            + " "
            + data_df["cell_id"].astype(str)
            + " "
            + data_df["cellx"].astype(int).astype(str)
            + " "
            + data_df["celly"].astype(int).astype(str)
        )
        # Find averageable data columns (everything before the metadata block)
        if "cellcluster_id" in data_df.columns:
            end_idx = data_df.columns.get_loc("cellcluster_id")
        else:
            end_idx = data_df.columns.get_loc("slide_id")
        avg_cols = data_df.columns[:end_idx].tolist()
        # Group by cluster and average
        data_df = data_df.groupby("clust").agg(
            **{col: (col, "mean") for col in avg_cols},
            count=("clust", "size"),  # count rows in each cluster
            example_cells=("example_cell_id", lambda x: ",".join(x)),
            hcpc=("hcpc", lambda x: x.iloc[0]),
        )
        data_df = data_df.reset_index()  # Do NOT drop; the index is "clust"
        # Create new columns
        metadata = pd.DataFrame(
            {
                "count": data_df["count"],
                "example_cells": data_df["example_cells"],
                "clust": data_df["clust"].astype(int),
                "hcpc": data_df["hcpc"].astype(int),
                "id": data_df["clust"].astype(int).astype(str),
                "cccluster": "0",  # Dummy value
                "ccdistance": 0.0,  # Dummy value
                "rownum": list(range(len(data_df))),
                "framegroup": 0,  # Dummy value
            }
        )
        # Pad the features to 761 columns, as per OCULAR report needs
        additional_columns = range(len(avg_cols), 761)
        if len(additional_columns) > 0:
            padding = pd.DataFrame(
                np.zeros((len(data_df), len(additional_columns))),
                columns=[f"pad{i}" for i in additional_columns],
            )
            data_df = pd.concat([data_df[avg_cols], padding, metadata], axis=1)
        else:
            data_df = pd.concat([data_df[avg_cols], metadata], axis=1)

        # Save the cluster data
        data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"))
        # Suppress the pandas FutureWarning raised inside pyreadr
        with warnings.catch_warnings():
            warnings.simplefilter(action="ignore", category=FutureWarning)
            pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)
Save the events to OCULAR files. Relies on the DataFrame originating from an OCULAR file (same columns; duplicate metadata/info).
Parameters
- output_path: directory to write the OCULAR .csv and .rds files to.
- event_type: "cells" or "others"; determines the output file names.
Returns
- None; files are written to output_path.
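As a minimal usage sketch (the paths are hypothetical; assumes the optional pyreadr dependency is installed and that the EventArray originated from an OCULAR load, so it carries the expected slide_id/clust/hcpc columns):

from csi_images.csi_events import EventArray

# Hypothetical paths; requires pyreadr (the [rds] extra).
events = EventArray.load_ocular("/path/to/slide/ocular", event_type="cells")
# Re-export: writes rc-final1.rds through rc-final4.rds, rc-final.csv/.rds,
# and ocular_interesting.csv/.rds if any events are flagged as interesting.
events.save_ocular("/path/to/output", event_type="cells")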
    @classmethod
    def load_ocular(
        cls,
        input_path: str,
        event_type="cells",
        cell_data_files=(
            "rc-final1.rds",
            "rc-final2.rds",
            "rc-final3.rds",
            "rc-final4.rds",
            "ocular_interesting.rds",
        ),
        others_data_files=(
            "others-final1.rds",
            "others-final2.rds",
            "others-final3.rds",
            "others-final4.rds",
        ),
        atlas_data_files=(
            "ocular_interesting.rds",
            "ocular_not_interesting.rds",
        ),
        drop_common_events=True,
        log=None,
    ) -> Self:
        """
        Load events from OCULAR .rds files into an EventArray.
        :param input_path: path to an OCULAR output directory or to a single .rds file.
        :param event_type: "cells" or "others"; selects the default file set.
        :param cell_data_files: file names to load when event_type is "cells".
        :param others_data_files: file names to load when event_type is "others".
        :param atlas_data_files: file names that may carry atlas (catalogue) classifications.
        :param drop_common_events: whether to drop events classified as "common_cell".
        :param log: optional logger for debug/warning messages.
        :return: an EventArray with the loaded events.
        """
        if pyreadr is None:
            raise ModuleNotFoundError(
                "pyreadr not installed. Install pyreadr directly "
                "or install csi-images with [rds] option to resolve."
            )
        # Check if the input path is a single file or a directory
        if os.path.isfile(input_path):
            data_files = [os.path.basename(input_path)]
            input_path = os.path.dirname(input_path)
        elif event_type == "cells":
            data_files = cell_data_files
        elif event_type == "others":
            data_files = others_data_files
        else:
            raise ValueError('Invalid event type. Must be "cells" or "others".')

        # Load the data from the OCULAR files
        file_data = {}
        for file in data_files:
            file_path = os.path.join(input_path, file)
            if not os.path.isfile(file_path):
                if log is not None:
                    log.warning(f"{file} not found in {input_path}")
                continue
            file_data[file] = pyreadr.read_r(file_path)
            # pyreadr returns a dict; .rds content is stored under the None key
            file_data[file] = file_data[file][None]
            if len(file_data[file]) == 0:
                # File gets dropped from the dict
                file_data.pop(file)
                if log is not None:
                    log.warning(f"{file} has no cells")
                continue

            if log is not None:
                log.debug(f"{file} has {len(file_data[file])} cells")

            # Drop common cells if requested and present in this file
            if (
                file in atlas_data_files
                and drop_common_events
                and "catalogue_classification" in file_data[file]
            ):
                common_cell_indices = (
                    file_data[file]["catalogue_classification"] == "common_cell"
                )
                if log is not None:
                    log.debug(
                        f"Dropping {int(common_cell_indices.sum())} "
                        f"common cells from {file}"
                    )
                file_data[file] = file_data[file][~common_cell_indices]

            if len(file_data[file]) == 0:
                # File gets dropped from the dict
                file_data.pop(file)
                if log is not None:
                    log.warning(f"{file} has no cells after dropping common cells")
                continue

            # Extract frame_id and cell_id
            # DAPI- events already have frame_id and cell_id outside the rowname
            if event_type == "cells" and "frame_id" not in file_data[file].columns:
                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
                # Get "frame_id cell_id" from the rowname column and split it in two
                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
                if len(split_res.columns) != 2 and log is not None:
                    log.warning(
                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
                    )
                # Assign the split columns back to the DataFrame
                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
                # Reset indices, since stale indices can cause NaN values in concat
                file_data[file] = file_data[file].reset_index(drop=True)

        # Merge the data from all files
        if len(file_data) == 0:
            return EventArray()
        elif len(file_data) == 1:
            data = next(iter(file_data.values()))
        else:
            data = pd.concat(file_data.values())

        if log is not None:
            log.debug(f"Gathered a total of {len(data)} events")

        # "Others" files are missing the "slide_id"; insert it before "frame_id"
        if event_type == "others" and "slide_id" not in data.columns:
            if os.path.basename(input_path) == "ocular":
                slide_id = os.path.basename(os.path.dirname(input_path))
            else:
                slide_id = "UNKNOWN"
            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)

        # Sort by ascending cell_id to keep the original, which is in manual_df
        data = data.sort_values(by=["cell_id"], ascending=True)
        # Filter out duplicates by x & y
        data = data.assign(
            unique_id=data["slide_id"]
            + "_"
            + data["frame_id"].astype(str)
            + "_"
            + data["cellx"].astype(int).astype(str)
            + "_"
            + data["celly"].astype(int).astype(str)
        )
        data = data.drop_duplicates(subset=["unique_id"], keep="first")
        # The normal unique_id uses cell_id instead of position
        data = data.assign(
            unique_id=data["slide_id"]
            + "_"
            + data["frame_id"].astype(str)
            + "_"
            + data["cell_id"].astype(str)
        )
        data = data.reset_index(drop=True)
        # All columns up to "slide_id" are features; drop the "slide_id"
        features = data.loc[:, :"slide_id"].iloc[:, :-1]
        data = data.loc[:, "slide_id":]
        # Grab the info columns
        info = data[["slide_id", "frame_id", "cellx", "celly"]].copy()
        info.columns = ["slide_id", "tile", "x", "y"]
        info = info.assign(roi=0)  # OCULAR only works on 1 ROI, as far as known
        info = info[["slide_id", "tile", "roi", "x", "y"]]
        # Metadata keeps duplicate columns for later convenience
        metadata = data
        # Certain columns tend to arrive with mixed data formats...
        for col in ["TRITC", "CY5", "FITC"]:
            if col in metadata:
                labels = {
                    "False": False,
                    "True": True,
                    "FALSE": False,
                    "TRUE": True,
                }
                # replace() keeps values that are already bool, unlike map()
                metadata[col] = metadata[col].replace(labels).astype(bool)
        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
            if col in metadata:
                metadata[col] = metadata[col].fillna(-1).astype(int)
        return EventArray(info, metadata, features)
Load events from OCULAR .rds files into an EventArray.
Parameters
- input_path: path to an OCULAR output directory or to a single .rds file.
- event_type: "cells" or "others"; selects the default file set.
- cell_data_files: file names to load when event_type is "cells".
- others_data_files: file names to load when event_type is "others".
- atlas_data_files: file names that may carry atlas (catalogue) classifications.
- drop_common_events: whether to drop events classified as "common_cell".
- log: optional logger for debug/warning messages.
Returns
- an EventArray with the loaded events.
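A corresponding load sketch (paths are hypothetical; the logger is optional and only used for debug/warning output):

import logging

from csi_images.csi_events import EventArray

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger("ocular")

# Load every rc-final*.rds plus ocular_interesting.rds from a directory,
# keeping atlas-classified "common cells" instead of dropping them.
cells = EventArray.load_ocular(
    "/path/to/slide/ocular",
    event_type="cells",
    drop_common_events=False,
    log=log,
)

# Or point at a single .rds file; only that file is loaded.
others = EventArray.load_ocular(
    "/path/to/slide/ocular/others-final1.rds", event_type="others"
)

print(cells.info.head())  # columns: slide_id, tile, roi, x, y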