csi_images.csi_events
Contains the Event class, which represents a single event in a scan. The Event class optionally holds metadata and features. Lists of events with similar metadata or features can be combined into DataFrames for analysis.
The Event class holds the position of the event in the frame, which can be converted to scanner or slide coordinates. See the csi_utils.csi_scans documentation page for more information on the coordinate systems.
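For orientation, a minimal usage sketch of the classes in this module is shown below. The placeholder scan and tile are built with the same calls this module uses internally (Scan.make_placeholder and Tile(scan, n, n_roi)); in practice you would use a real Scan loaded from scanner output, and with a placeholder the position helpers may not return meaningful coordinates.

```python
from csi_images.csi_scans import Scan
from csi_images.csi_tiles import Tile
from csi_images.csi_events import Event, EventArray

# Placeholder scan and tile; a real Scan from scanner output is normally used.
scan = Scan.make_placeholder("EXAMPLE-SLIDE", 100, 0)
tile = Tile(scan, 100, 0)

# An event at pixel position (512, 384) within that tile
event = Event(scan, tile, x=512, y=384)

# Positions can be expressed in scanner or slide coordinates (micrometers);
# with a real scan these use the tile size, pixel size, and ROI origin.
print(event.get_scan_position())
print(event.get_slide_position())

# Many events are easier to analyze together as an EventArray of DataFrames
array = EventArray.from_events([event])
print(array.info)
```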
```python
"""
Contains the Event class, which represents a single event in a scan.
The Event class optionally holds metadata and features. Lists of events with
similar metadata or features can be combined into DataFrames for analysis.

The Event class holds the position of the event in the frame, which can be
converted to scanner or slide coordinates. See the csi_utils.csi_scans
documentation page for more information on the coordinate systems.
"""

import os
import math
import typing

import numpy as np
import pandas as pd

import pyreadr

from .csi_scans import Scan
from .csi_tiles import Tile
from .csi_frames import Frame


class Event:
    """
    A class that represents a single event in a scan, making it easy to evaluate
    singular events. Required metadata is exposed as attributes, and optional
    metadata and features are stored as DataFrames.
    """

    SCAN_TO_SLIDE_TRANSFORM = {
        # Axioscan zero is in the top-right corner instead of top-left
        Scan.Type.AXIOSCAN7: np.array(
            [
                [1, 0, 75000],
                [0, 1, 0],
                [0, 0, 1],
            ]
        ),
        # BZScanner coordinates are a special kind of messed up:
        # - The slide is upside-down.
        # - The slide is oriented vertically, with the barcode at the bottom.
        # - Tiles are numbered from the top-right
        Scan.Type.BZSCANNER: np.array(
            [
                [0, -1, 75000],
                [-1, 0, 25000],
                [0, 0, 1],
            ]
        ),
    }
    """
    Homogeneous transformation matrices for converting between scanner and slide
    coordinates. The matrices are 3x3, with the final column representing the
    translation in micrometers (um). For more information, see
    [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations).

    Transformations are nominal, and accuracy is not guaranteed; this is due to
    imperfections in slides and alignment in the scanners. Units are in micrometers.
    """

    def __init__(
        self,
        scan: Scan,
        tile: Tile,
        x: int,
        y: int,
        size: int = 12,  # End-to-end size in pixels
        metadata: pd.Series = None,
        features: pd.Series = None,
    ):
        self.scan = scan
        self.tile = tile
        self.x = x
        self.y = y
        self.size = size
        self.metadata = metadata
        self.features = features

    def __repr__(self) -> str:
        return f"{self.scan.slide_id}-{self.tile.n}-{self.x}-{self.y}"

    def __eq__(self, other) -> bool:
        return self.__repr__() == other.__repr__()

    def __lt__(self, other):
        return self.__repr__() < other.__repr__()

    def get_scan_position(self) -> tuple[float, float]:
        """
        Get the position of the event in the scanner's coordinate frame.
        :return: the scan position of the event in micrometers (um).
        """
        # Get overall pixel position
        pixel_x = self.x + (self.scan.tile_width_px * self.tile.x)
        pixel_y = self.y + (self.scan.tile_height_px * self.tile.y)
        # Convert to micrometers
        x_um = pixel_x * self.scan.pixel_size_um
        y_um = pixel_y * self.scan.pixel_size_um
        # Add the scan's origin in the scanner frame
        x_um += self.scan.roi[self.tile.n_roi].origin_x_um
        y_um += self.scan.roi[self.tile.n_roi].origin_y_um
        return x_um, y_um

    def get_slide_position(self) -> tuple[float, float]:
        """
        Get the slide position of the event in micrometers (um).
        :return: the slide position of the event.
        """
        # Turn scan_position into a 3x1 vector
        scan_position = self.get_scan_position()
        scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]])

        # Multiply by the appropriate homogeneous matrix
        if self.scan.scanner_id.startswith(self.scan.Type.AXIOSCAN7.value):
            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.AXIOSCAN7]
        elif self.scan.scanner_id.startswith(self.scan.Type.BZSCANNER.value):
            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.BZSCANNER]
        else:
            raise ValueError(f"Scanner type {self.scan.scanner_id} not supported.")
        slide_position = np.matmul(transform, scan_position)
        return float(slide_position[0][0]), float(slide_position[1][0])

    def crop_images(
        self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True
    ) -> list[np.ndarray]:
        """
        Crop this event out of already-loaded frame images. Nothing is read from
        disk, so this is very quick when cropping multiple events from the same tile.
        Use this if you're interested in many events.
        :param images: the frame images.
        :param crop_size: the square size of the image crop to get for this event.
        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
        :return: crop_size x crop_size crops of the event in the provided frames. If
        the event is too close to the edge, the crop is padded with black and the
        event will not be centered.
        """
        # Convert a crop size in micrometers to pixels
        if not in_pixels:
            crop_size = round(crop_size / self.scan.pixel_size_um)
        # Find the crop bounds
        bounds = [
            self.x - crop_size // 2,
            self.y - crop_size // 2,
            self.x + math.ceil(crop_size / 2),
            self.y + math.ceil(crop_size / 2),
        ]
        # Determine how much the bounds violate the image size
        displacements = [
            max(0, -bounds[0]),
            max(0, -bounds[1]),
            max(0, bounds[2] - images[0].shape[1]),
            max(0, bounds[3] - images[0].shape[0]),
        ]
        # Cap off the bounds
        bounds = [
            max(0, bounds[0]),
            max(0, bounds[1]),
            min(images[0].shape[1], bounds[2]),
            min(images[0].shape[0], bounds[3]),
        ]

        # Crop the images
        cropped_images = []
        for image in images:
            # Create a blank image of the right size
            cropped_image = np.zeros((crop_size, crop_size), dtype=image.dtype)

            # Insert the cropped image into the blank image, leaving a black buffer
            # around the edges if the crop would go beyond the original image bounds
            cropped_image[
                displacements[1] : crop_size - displacements[3],
                displacements[0] : crop_size - displacements[2],
            ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]]
            cropped_images.append(cropped_image)
        return cropped_images

    def extract_images(
        self, crop_size: int = 100, in_pixels: bool = True
    ) -> list[np.ndarray]:
        """
        Extract the images from the scan and tile, reading from the file. Called
        "extract" because it must read and extract the images from file, which is slow.
        Use this if you're interested in only a few events, as it is inefficient when
        reading multiple events from the same tile.
        :param crop_size: the square size of the image crop to get for this event.
        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
        :return: a list of cropped images from the scan in the order of the channels.
        """
        frames = Frame.get_frames(self.tile)
        images = [frame.get_image() for frame in frames]
        return self.crop_images(images, crop_size, in_pixels)

    @classmethod
    def extract_images_for_list(
        cls,
        events: list[typing.Self],
        crop_size: int | list[int] = None,
        in_pixels: bool = True,
    ) -> list[list[np.ndarray]]:
        """
        Get the images for a list of events, ensuring that there is no wasteful reading
        of the same tile multiple times. This function is more efficient than calling
        extract_images for each event.
        TODO: test this function
        :param events: the events to extract images for.
        :param crop_size: the square size of the image crop to get for each event.
        Defaults to four times the size of the event.
        :param in_pixels: whether the crop size is in pixels or micrometers.
        Defaults to pixels, and is ignored if crop_size is None.
        :return: a list of lists of cropped images for each event.
        """
        if len(events) == 0:
            return []

        # Populate a crop size if none provided
        if crop_size is None:
            crop_size = [4 * event.size for event in events]
            in_pixels = True
        # Propagate a constant crop size
        elif isinstance(crop_size, int):
            crop_size = [crop_size] * len(events)

        # Sort the events by tile; use a shallow copy to avoid modifying the original
        order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))

        # Allocate the list to size
        images = [None] * len(events)
        last_tile = None
        frame_images = None  # Holds large numpy arrays, so expensive to compare
        # Iterate through in sorted order
        for i in order:
            if last_tile != events[i].tile:
                # Gather the frame images, preserving them for the next event
                frames = Frame.get_frames(events[i].tile)
                frame_images = [frame.get_image() for frame in frames]

                last_tile = events[i].tile
            # Use the frame images to crop the event images
            # Preserve the original order using order[i]
            images[i] = events[i].crop_images(frame_images, crop_size[i], in_pixels)
        return images


class EventArray:
    """
    A class that holds a large number of events' data, making it easy to analyze and
    manipulate many events at once. A more separated version of the Event class.
    """

    INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y", "size"]

    def __init__(
        self,
        info: pd.DataFrame = None,
        metadata: pd.DataFrame = None,
        features: pd.DataFrame = None,
    ):
        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y", "size"
        if info is not None and (
            not all(col in info.columns for col in self.INFO_COLUMNS)
            or len(info.columns) != 6
        ):
            raise ValueError(
                "EventArray.info must have columns 'slide_id', 'tile', 'roi', 'x', 'y', 'size'"
            )
        # All DataFrames must have the same number of rows
        if metadata is not None and (info is None or len(info) != len(metadata)):
            raise ValueError(
                "If EventArray.metadata is not None, it should match rows with .info"
            )
        if features is not None and (info is None or len(info) != len(features)):
            raise ValueError(
                "If EventArray.features is not None, it should match rows with .info"
            )
        self.info = info
        self.metadata = metadata
        self.features = features

    def __len__(self) -> int:
        # Convenience method to get the number of events
        if self.info is None:
            return 0
        else:
            return len(self.info)

    def __eq__(self, other):
        is_equal = True
        # Parse all possibilities for info
        if isinstance(self.info, pd.DataFrame):
            if isinstance(other.info, pd.DataFrame):
                is_equal = self.info.equals(other.info)
                if not is_equal:
                    return False
            else:
                return False
        elif self.info is None:
            if other.info is not None:
                return False

        # Parse all possibilities for metadata
        if isinstance(self.metadata, pd.DataFrame):
            if isinstance(other.metadata, pd.DataFrame):
                is_equal = self.metadata.equals(other.metadata)
                if not is_equal:
                    return False
            else:
                return False
        elif self.metadata is None:
            if other.metadata is not None:
                return False

        # Parse all possibilities for features
        if isinstance(self.features, pd.DataFrame):
            if isinstance(other.features, pd.DataFrame):
                is_equal = self.features.equals(other.features)
                if not is_equal:
                    return False
            else:
                return False
        elif self.features is None:
            if other.features is not None:
                return False

        return is_equal

    def sort(self, by: str | list[str], ascending: bool = True) -> typing.Self:
        """
        Sort the EventArray by a column in the info, metadata, or features DataFrames.
        :param by: name of the column to sort by.
        :param ascending: whether to sort in ascending order.
        :return: self, after sorting in place.
        """
        everything = pd.concat([self.info, self.metadata, self.features], axis=1)
        order = everything.sort_values(by=by, ascending=ascending).index
        self.info = self.info.loc[order].reset_index(drop=True)
        if self.metadata is not None:
            self.metadata = self.metadata.loc[order].reset_index(drop=True)
        if self.features is not None:
            self.features = self.features.loc[order].reset_index(drop=True)
        return self

    def add_metadata(self, new_metadata: pd.DataFrame) -> None:
        """
        Add metadata to the EventArray.
        :param new_metadata: the metadata to add.
        """
        if self.metadata is None:
            if len(self) != len(new_metadata):
                raise ValueError("New metadata does not match length of existing info")
            self.metadata = new_metadata
        else:
            # Add the new metadata columns to the existing metadata
            self.metadata = pd.concat([self.metadata, new_metadata], axis=1)

    def add_features(self, new_features: pd.DataFrame) -> None:
        """
        Add features to the EventArray.
        :param new_features: the features to add.
        """
        if self.features is None:
            if len(self) != len(new_features):
                raise ValueError("New features do not match length of existing info")
            self.features = new_features
        else:
            # Add the new feature columns to the existing features
            self.features = pd.concat([self.features, new_features], axis=1)

    @classmethod
    def from_list(cls, events: list[typing.Self]) -> typing.Self:
        """
        Combine EventArrays in a list into a single EventArray.
        :param events: the list of EventArrays to combine.
        """
        all_info = []
        all_metadata = []
        all_features = []
        for event_array in events:
            # Skip empty EventArrays
            if event_array.info is not None:
                all_info.append(event_array.info)
            if event_array.metadata is not None:
                all_metadata.append(event_array.metadata)
            if event_array.features is not None:
                all_features.append(event_array.features)
        if len(all_info) == 0:
            return EventArray()
        else:
            all_info = pd.concat(all_info, ignore_index=True)
        if len(all_metadata) == 0:
            all_metadata = None
        else:
            all_metadata = pd.concat(all_metadata, ignore_index=True)
        if len(all_features) == 0:
            all_features = None
        else:
            all_features = pd.concat(all_features, ignore_index=True)

        return EventArray(all_info, all_metadata, all_features)

    @classmethod
    def from_events(cls, events: list[Event]) -> typing.Self:
        """
        Create an EventArray from a list of events.
        :param events: the list of events to convert.
        """
        # Return an empty array if we were passed nothing
        if events is None or len(events) == 0:
            return EventArray()
        # Otherwise, grab the info
        info = pd.DataFrame(
            {
                "slide_id": [event.scan.slide_id for event in events],
                "tile": [event.tile.n for event in events],
                "roi": [event.tile.n_roi for event in events],
                "x": [event.x for event in events],
                "y": [event.y for event in events],
                "size": [event.size for event in events],
            }
        )
        metadata_list = [event.metadata for event in events]
        # Iterate through and ensure that all metadata is the same shape
        for metadata in metadata_list:
            if type(metadata) != type(metadata_list[0]):
                raise ValueError("All metadata must be the same type.")
            if metadata is not None and metadata.shape != metadata_list[0].shape:
                raise ValueError("All metadata must be the same shape.")
        if metadata_list[0] is None:
            metadata = None
        else:
            metadata = pd.DataFrame(metadata_list)
        features_list = [event.features for event in events]
        # Iterate through and ensure that all features are the same shape
        for features in features_list:
            if type(features) != type(features_list[0]):
                raise ValueError("All features must be the same type.")
            if features is not None and features.shape != features_list[0].shape:
                raise ValueError("All features must be the same shape.")
        if features_list[0] is None:
            features = None
        else:
            features = pd.DataFrame(features_list)
        return EventArray(info=info, metadata=metadata, features=features)

    def to_events(
        self,
        scans: list[Scan],
        ignore_missing_scans=True,
        ignore_metadata=False,
        ignore_features=False,
    ) -> list[Event]:
        """
        Get the events in the EventArray as a list of events.
        :param scans: the scans that the events belong to. Pass an empty list if you
        don't care about scan metadata.
        :param ignore_missing_scans: whether to create blank scans for events without scans.
        :param ignore_metadata: whether to ignore metadata or not.
        :param ignore_features: whether to ignore features or not.
        :return: a list of Event objects.
        """
        events = []
        for i in range(len(self.info)):
            # Determine the associated scan
            scan = None
            for s in scans:
                if s.slide_id == self.info["slide_id"][i]:
                    scan = s
                    break
            if scan is None:
                if ignore_missing_scans:
                    # Create a placeholder scan if the scan is missing
                    scan = Scan.make_placeholder(
                        self.info["slide_id"][i],
                        self.info["tile"][i],
                        self.info["roi"][i],
                    )
                else:
                    raise ValueError(
                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
                    )
            # Add to the list
            events.append(
                Event(
                    scan,
                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
                    self.info["x"][i],
                    self.info["y"][i],
                    size=self.info["size"][i],
                    metadata=None if ignore_metadata else self.metadata.loc[i],
                    features=None if ignore_features else self.features.loc[i],
                )
            )
        return events

    def to_dataframe(self) -> pd.DataFrame:
        """
        Convert all the data in the EventArray to a single DataFrame.
        :return: a DataFrame with all the data in the EventArray.
        """
        # Make a copy of the info DataFrame and prepend "info_" to the column names
        output = self.info.copy()
        output.columns = [f"info_{col}" for col in output.columns]
        # Combine with the metadata and prepend "metadata_" to the column names
        if self.metadata is not None:
            metadata = self.metadata.copy()
            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
            output = pd.concat([output, metadata], axis=1)
        # Combine with the features and prepend "features_" to the column names
        if self.features is not None:
            features = self.features.copy()
            features.columns = [f"features_{col}" for col in features.columns]
            output = pd.concat([output, features], axis=1)
        return output

    @classmethod
    def from_dataframe(cls, df) -> typing.Self:
        """
        From a single, combined DataFrame (as produced by to_dataframe()), create
        an EventArray.
        :param df: the DataFrame to convert.
        :return: an EventArray with the data from the DataFrame.
        """
        # Split the columns into info, metadata, and features and strip prefix
        info = df[[col for col in df.columns if col.startswith("info_")]].copy()
        info.columns = [col.replace("info_", "") for col in info.columns]
        if info.size == 0:
            info = None
        metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy()
        metadata.columns = [col.replace("metadata_", "") for col in metadata.columns]
        if metadata.size == 0:
            metadata = None
        features = df[[col for col in df.columns if col.startswith("features_")]].copy()
        features.columns = [col.replace("features_", "") for col in features.columns]
        if features.size == 0:
            features = None
        return cls(info=info, metadata=metadata, features=features)

    def save_csv(self, output_path: str) -> bool:
        """
        Save the events to a CSV file, including metadata and features.
        :param output_path: the path to save the CSV file to.
        :return: whether the file exists after saving.
        """
        self.to_dataframe().to_csv(output_path, index=False)
        return os.path.exists(output_path)

    @classmethod
    def load_csv(cls, input_path: str) -> typing.Self:
        """
        Load the events from a CSV file, including metadata and features.
        :param input_path: the path to the CSV file.
        :return: an EventArray with the loaded events.
        """
        # Load the CSV file
        df = pd.read_csv(input_path)
        return cls.from_dataframe(df)

    def save_hdf5(self, output_path: str) -> bool:
        """
        Save the events to an HDF5 file, including metadata and features.
        Uses the pandas-provided HDF5 functions for ease, and external compatibility,
        though these files are slightly harder to view in HDFView or similar.
        :param output_path: the path to save the HDF5 file to.
        :return: whether the file exists after saving.
        """
        # Open the output_path as an HDF5 file
        with pd.HDFStore(output_path) as store:
            # Store the dataframes in the HDF5 file
            if self.info is not None:
                store.put("info", self.info, index=False)
            if self.metadata is not None:
                store.put("metadata", self.metadata, index=False)
            if self.features is not None:
                store.put("features", self.features, index=False)
        return os.path.exists(output_path)

    @classmethod
    def load_hdf5(cls, input_path: str) -> typing.Self:
        """
        Load the events from an HDF5 file, including metadata and features.
        :param input_path: the path to the HDF5 file.
        :return: an EventArray with the loaded events.
        """
        # Open the input_path as an HDF5 file
        with pd.HDFStore(input_path) as store:
            # Load the dataframes from the HDF5 file
            info = store.get("info") if "info" in store else None
            metadata = store.get("metadata") if "metadata" in store else None
            features = store.get("features") if "features" in store else None
        return cls(info=info, metadata=metadata, features=features)

    @classmethod
    def load_ocular(
        cls,
        input_path: str,
        event_type="cells",
        cell_data_files=(
            "rc-final1.rds",
            "rc-final2.rds",
            "rc-final3.rds",
            "rc-final4.rds",
            "ocular_interesting.rds",
        ),
        others_data_files=(
            "others-final1.rds",
            "others-final2.rds",
            "others-final3.rds",
            "others-final4.rds",
        ),
        atlas_data_files=(
            "ocular_interesting.rds",
            "ocular_not_interesting.rds",
        ),
        merge_event_data_with_stats=True,
        filter_and_generate_morphs=True,
        drop_common_events=True,
        log=None,
    ) -> typing.Self:
        """
        Load events from OCULAR output files (.rds) into an EventArray.
        :param input_path: path to an OCULAR output directory, or to a single .rds file.
        :param event_type: "cells" or "others".
        :param cell_data_files: the files to load when event_type is "cells".
        :param others_data_files: the files to load when event_type is "others".
        :param atlas_data_files: atlas files, which may have common events dropped.
        :param merge_event_data_with_stats: currently unused.
        :param filter_and_generate_morphs: currently unused.
        :param drop_common_events: whether to drop events classified as common cells
        from atlas files.
        :param log: optional logger for progress and warning messages.
        :return: an EventArray with the loaded events.
        """
        # Check if the input path is a directory or a file
        if os.path.isfile(input_path):
            data_files = [os.path.basename(input_path)]
            input_path = os.path.dirname(input_path)
        elif event_type == "cells":
            data_files = cell_data_files
        elif event_type == "others":
            data_files = others_data_files
        else:
            raise ValueError("Invalid event type.")

        # Load the data from the OCULAR files
        file_data = {}
        for file in data_files:
            file_path = os.path.join(input_path, file)
            if not os.path.isfile(file_path):
                if log is not None:
                    log.warning(f"{file} not found in {input_path}")
                continue
            file_data[file] = pyreadr.read_r(file_path)
            # Get the DataFrame associated with None (pyreadr dict quirk)
            file_data[file] = file_data[file][None]
            if len(file_data[file]) == 0:
                # File gets dropped from the dict
                file_data.pop(file)
                if log is not None:
                    log.warning(f"{file} has no cells")
                continue

            if log is not None:
                log.debug(f"{file} has {len(file_data[file])} cells")

            # Drop common cells if requested and in this file
            if file in atlas_data_files and drop_common_events:
                common_cell_indices = (
                    file_data[file]["catalogue_classification"] == "common_cell"
                )
                if log is not None:
                    log.debug(
                        f"Dropping {int(pd.Series.sum(common_cell_indices))} "
                        f"common cells from {file}"
                    )
                file_data[file] = file_data[file][~common_cell_indices]

            if len(file_data[file]) == 0:
                # File gets dropped from the dict
                file_data.pop(file)
                if log is not None:
                    log.warning(f"{file} has no cells after dropping common cells")
                continue

            # Extract frame_id and cell_id
            # DAPI- events already have frame_id cell_id outside rowname
            if event_type == "cells":
                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
                # get frame_id cell_id from rownames column and split into two columns
                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
                if len(split_res.columns) != 2 and log is not None:
                    log.warning(
                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
                    )
                # then assign it back to the dataframe
                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
            # reset indexes since they can cause NaN values in concat
            file_data[file].reset_index(drop=True, inplace=True)

        # Merge the data from all files
        if len(file_data) == 0:
            return EventArray()
        elif len(file_data) == 1:
            data = [file_data[file] for file in file_data.keys()][0]
        else:
            data = pd.concat(file_data.values())

        if log is not None:
            log.debug(f"Gathered a total of {len(data)} events")

        # Others is missing the "slide_id". Insert it right before "frame_id" column
        if event_type == "others" and "slide_id" not in data.columns:
            if os.path.basename(input_path) == "ocular":
                slide_id = os.path.basename(os.path.dirname(input_path))
            else:
                slide_id = "UNKNOWN"
            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)

        # Sort according to ascending cell_id to keep the original, which is in manual_df
        data = data.sort_values(by=["cell_id"], ascending=True)
        # Filter out duplicates by x & y
        data = data.assign(
            unique_id=data["slide_id"]
            + "_"
            + data["frame_id"].astype(str)
            + "_"
            + data["cellx"].astype(int).astype(str)
            + "_"
            + data["celly"].astype(int).astype(str)
        )
        data = data.drop_duplicates(subset=["unique_id"], keep="first", inplace=False)
        # Filter out duplicates by cell_id
        data = data.assign(
            unique_id=data["slide_id"]
            + "_"
            + data["frame_id"].astype(str)
            + "_"
            + data["cell_id"].astype(str)
        )
        data.reset_index(drop=True, inplace=True)
        # All columns up to "slide_id" are features; drop the "slide_id"
        features = data.loc[:, :"slide_id"].iloc[:, :-1]
        data = data.loc[:, "slide_id":]
        # Grab the info columns
        info = data[["slide_id", "frame_id", "cellx", "celly"]]
        info.columns = ["slide_id", "tile", "x", "y"]
        info = info.assign(
            roi=0,  # OCULAR only works on 1 ROI, as far as known
            size=25,  # Static, for later montaging
        )
        info = info[["slide_id", "tile", "roi", "x", "y", "size"]]
        # Metadata has duplicate columns for later convenience
        metadata = data
        return EventArray(info, metadata, features)

    def save_ocular(self, output_path: str, event_type: str = "cells") -> bool:
        """
        Save the events to OCULAR files. Relies on the DataFrame originating
        from an OCULAR file (same columns; duplicate metadata/info).
        :param output_path: the directory to save the OCULAR files to.
        :param event_type: "cells" or "others".
        :return: whether the main .rds file exists after saving.
        """
        if event_type == "cells":
            file_stub = "rc-final"
        elif event_type == "others":
            file_stub = "others-final"
        else:
            raise ValueError("Invalid event type. Must be cells or others.")

        # Check for the "ocular_interesting" column
        if event_type == "cells" and "ocular_interesting" in self.metadata.columns:
            interesting_mask = self.metadata["ocular_interesting"]
            # Split the metadata into interesting and regular
            # Interesting will only have dropped columns, with no internal changes
            interesting = pd.concat(
                [self.features[interesting_mask], self.metadata[interesting_mask]],
                axis=1,
            ).reset_index(drop=True)
            # Data will get some columns changed, so copy it
            data = (
                pd.concat(
                    [self.features[~interesting_mask], self.metadata[~interesting_mask]],
                    axis=1,
                )
                .copy(deep=True)
                .reset_index(drop=True)
                .drop(columns=["ocular_interesting"])
            )

            # Drop particular columns for "interesting"
            interesting = interesting.drop(
                [
                    "clust",
                    "hcpc",
                    "frame_id",
                    "cell_id",
                    "unique_id",
                    "ocular_interesting",
                ],
                axis=1,
            )
            # Save both .csv and .rds
            interesting.to_csv(
                os.path.join(output_path, "ocular_interesting.csv"), index=False
            )
            pyreadr.write_rds(
                os.path.join(output_path, "ocular_interesting.rds"), interesting
            )
        else:
            # Get all data, copying it
            data = (
                pd.concat([self.features, self.metadata], axis=1)
                .copy(deep=True)
                .reset_index(drop=True)
            )

        # Split based on cluster number to conform to *-final[1-4].rds
        n_clusters = max(data["clust"]) + 1
        split_idx = [round(i * n_clusters / 4) for i in range(5)]
        for i in range(4):
            subset = (split_idx[i] <= data["clust"]) & (
                data["clust"] < split_idx[i + 1]
            )
            subset = data[subset].reset_index(drop=True)
            subset["hcpc"] = i + 1
            pyreadr.write_rds(
                os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
            )

        # Create new example cell strings
        data["example_cell_id"] = (
            data["slide_id"]
            + " "
            + data["frame_id"].astype(str)
            + " "
            + data["cell_id"].astype(str)
            + " "
            + data["cellx"].astype(int).astype(str)
            + " "
            + data["celly"].astype(int).astype(str)
        )
        # Find averagable data columns
        if "cellcluster_id" in data.columns:
            avg_cols = data.columns[: data.columns.get_loc("cellcluster_id")].tolist()
        else:
            avg_cols = data.columns[: data.columns.get_loc("slide_id")].tolist()
        # Group by cluster and average
        data = data.groupby("clust").agg(
            **{col: (col, "mean") for col in avg_cols},
            count=("clust", "size"),  # count rows in each cluster
            example_cells=("example_cell_id", lambda x: ",".join(x)),
            hcpc=("hcpc", lambda x: x.iloc[0]),
        )
        data = data.reset_index()  # Do NOT drop, index is "clust"
        # Create new columns
        metadata = pd.DataFrame(
            {
                "count": data["count"],
                "example_cells": data["example_cells"],
                "clust": data["clust"].astype(int),
                "hcpc": data["hcpc"].astype(int),
                "id": data["clust"].astype(int).astype(str),
                "cccluster": "0",  # Dummy value
                "ccdistance": 0.0,  # Dummy value
                "rownum": list(range(len(data))),
                "framegroup": 0,  # Dummy value
            }
        )
        data = pd.concat([data.loc[:, avg_cols], metadata], axis=1)
        # Save the data
        data.to_csv(os.path.join(output_path, f"{file_stub}.csv"), index=False)
        pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data)
        return os.path.exists(os.path.join(output_path, f"{file_stub}.rds"))
```
```python
class Event
```
A class that represents a single event in a scan, making it easy to evaluate singular events. Required metadata is exposed as attributes, and optional metadata and features are stored as DataFrames.
```python
def __init__(
    self,
    scan: Scan,
    tile: Tile,
    x: int,
    y: int,
    size: int = 12,  # End-to-end size in pixels
    metadata: pd.Series = None,
    features: pd.Series = None,
)
```
SCAN_TO_SLIDE_TRANSFORM

Homogeneous transformation matrices for converting between scanner and slide coordinates. The matrices are 3x3, with the final column representing the translation in micrometers (um). For more information, see [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations).
Transformations are nominal, and accuracy is not guaranteed; this is due to imperfections in slides and alignment in the scanners. Units are in micrometers.
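As a quick numerical sketch (mirroring what get_slide_position does), a scan position is written as a homogeneous column vector and multiplied by one of these matrices; the scan coordinates below are made up:

```python
import numpy as np

# Nominal scan-to-slide transforms copied from SCAN_TO_SLIDE_TRANSFORM above
axioscan7 = np.array([[1, 0, 75000], [0, 1, 0], [0, 0, 1]])
bzscanner = np.array([[0, -1, 75000], [-1, 0, 25000], [0, 0, 1]])

# A made-up scan position (x, y) in micrometers, as a homogeneous column vector
scan_position = np.array([[12000.0], [34000.0], [1.0]])

# Axioscan 7: x is translated by 75,000 um, y is unchanged
print((axioscan7 @ scan_position).ravel()[:2])  # [87000. 34000.]

# BZScanner: axes are swapped and flipped, then translated
print((bzscanner @ scan_position).ravel()[:2])  # [41000. 13000.]
```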
```python
def get_scan_position(self) -> tuple[float, float]
```
Get the position of the event in the scanner's coordinate frame.
Returns
the scan position of the event in micrometers (um).
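The returned value is the event's pixel position within the whole ROI, scaled by the pixel size and shifted by the ROI origin. A worked sketch with made-up scan parameters (the names mirror the attributes used by get_scan_position):

```python
# Made-up scan geometry for illustration only
tile_width_px, tile_height_px = 2000, 2000   # scan.tile_width_px / tile_height_px
pixel_size_um = 0.5                          # scan.pixel_size_um
origin_x_um, origin_y_um = 1000.0, 2000.0    # scan.roi[n].origin_x_um / origin_y_um

tile_x, tile_y = 3, 2        # tile grid indices (tile.x, tile.y)
event_x, event_y = 512, 384  # event position within the tile, in pixels

x_um = (event_x + tile_width_px * tile_x) * pixel_size_um + origin_x_um
y_um = (event_y + tile_height_px * tile_y) * pixel_size_um + origin_y_um
print(x_um, y_um)  # 4256.0 4192.0
```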
```python
def get_slide_position(self) -> tuple[float, float]
```
Get the slide position of the event in micrometers (um).
Returns
the slide position of the event.
```python
def crop_images(
    self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True
) -> list[np.ndarray]
```
Crop this event out of already-loaded frame images. Nothing is read from disk, so this is very quick when cropping multiple events from the same tile. Use this if you're interested in many events.
Parameters
- images: the frame images.
- crop_size: the square size of the image crop to get for this event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns
crop_size x crop_size crops of the event in the provided frames. If the event is too close to the edge, the crop is padded with black and the event will not be centered.
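A sketch of the intended many-events usage, assuming events_in_tile is a list of Event objects that share a single tile; the frame images are read once and then cropped repeatedly (Frame.get_frames and get_image are the same calls extract_images uses):

```python
from csi_images.csi_frames import Frame

# Read each channel of the shared tile once
frames = Frame.get_frames(events_in_tile[0].tile)
images = [frame.get_image() for frame in frames]

# Crop every event from the already-loaded images; no further disk reads
crops_per_event = [
    event.crop_images(images, crop_size=50, in_pixels=False)  # 50 um square crops
    for event in events_in_tile
]
# crops_per_event[i][c] is the crop of event i in channel c
```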
```python
def extract_images(
    self, crop_size: int = 100, in_pixels: bool = True
) -> list[np.ndarray]
```
Extract the images from the scan and tile, reading from the file. Called "extract" because it must read and extract the images from file, which is slow. Use this if you're interested in only a few events, as it is inefficient when reading multiple events from the same tile.
Parameters
- crop_size: the square size of the image crop to get for this event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns
a list of cropped images from the scan in the order of the channels.
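For a single event the call is direct; note that every call re-reads the tile's frames from disk (event is assumed to be an existing Event):

```python
# One crop per channel, 100 x 100 pixels
crops = event.extract_images(crop_size=100)

# The same, but specifying the crop size in micrometers
crops_um = event.extract_images(crop_size=75, in_pixels=False)
```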
```python
@classmethod
def extract_images_for_list(
    cls,
    events: list[typing.Self],
    crop_size: int | list[int] = None,
    in_pixels: bool = True,
) -> list[list[np.ndarray]]
```
Get the images for a list of events, ensuring that there is no wasteful reading of the same tile multiple times. This function is more efficient than calling extract_images for each event. TODO: test this function
Parameters
- events: the events to extract images for.
- crop_size: the square size of the image crop to get for each event. Defaults to four times the size of the event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels, and is ignored if crop_size is None.
Returns
a list of lists of cropped images for each event.
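A usage sketch for events spread across several tiles; events is assumed to be an existing list of Event objects, and frames are only read once per tile:

```python
# Default crop size: four times each event's size, in pixels
crops = Event.extract_images_for_list(events)

# Or a constant crop size in micrometers for all events
crops = Event.extract_images_for_list(events, crop_size=50, in_pixels=False)
# crops[i][c] is the crop of events[i] in channel c
```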
```python
class EventArray
```
A class that holds a large number of events' data, making it easy to analyze and manipulate many events at once. A more separated version of the Event class.
Must be cells or others.") 767 768 # Check for the "ocular_interesting" column 769 if event_type == "cells" and "ocular_interesting" in self.metadata.columns: 770 interesting = self.metadata["ocular_interesting"] 771 # Split the metadata into interesting and regular 772 # Interesting will only have dropped columns, with no internal changes 773 interesting = pd.concat( 774 [self.features[interesting], self.metadata[interesting]], axis=1 775 ).reset_index(drop=True) 776 # Data will get some columns changed, so copy it 777 data = ( 778 pd.concat( 779 [self.features[~interesting], self.metadata[~interesting]], axis=1 780 ) 781 .copy(deep=True) 782 .reset_index(drop=True) 783 .drop(columns=["ocular_interesting"]) 784 ) 785 786 # Drop particular columns for "interesting" 787 interesting = interesting.drop( 788 [ 789 "clust", 790 "hcpc", 791 "frame_id", 792 "cell_id", 793 "unique_id", 794 "ocular_interesting", 795 ], 796 axis=1, 797 ) 798 # Save both .csv and .rds 799 interesting.to_csv( 800 os.path.join(output_path, "ocular_interesting.csv"), index=False 801 ) 802 pyreadr.write_rds( 803 os.path.join(output_path, "ocular_interesting.rds"), interesting 804 ) 805 else: 806 # Get all data, copying it 807 data = ( 808 pd.concat([self.features, self.metadata], axis=1) 809 .copy(deep=True) 810 .reset_index(drop=True) 811 ) 812 813 # Split based on cluster number to conform to *-final[1-4].rds 814 n_clusters = max(data["clust"]) + 1 815 split_idx = [round(i * n_clusters / 4) for i in range(5)] 816 for i in range(4): 817 subset = (split_idx[i] <= data["clust"]) & ( 818 data["clust"] < split_idx[i + 1] 819 ) 820 subset = data[subset].reset_index(drop=True) 821 subset["hcpc"] = i + 1 822 pyreadr.write_rds( 823 os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset 824 ) 825 826 # Create new example cell strings 827 data["example_cell_id"] = ( 828 data["slide_id"] 829 + " " 830 + data["frame_id"].astype(str) 831 + " " 832 + data["cell_id"].astype(str) 833 + " " 834 + data["cellx"].astype(int).astype(str) 835 + " " 836 + data["celly"].astype(int).astype(str) 837 ) 838 # Find averagable data columns 839 if "cellcluster_id" in data.columns: 840 avg_cols = data.columns[: data.columns.get_loc("cellcluster_id")].tolist() 841 else: 842 avg_cols = data.columns[: data.columns.get_loc("slide_id")].tolist() 843 # Group by cluster and average 844 data = data.groupby("clust").agg( 845 **{col: (col, "mean") for col in avg_cols}, 846 count=("clust", "size"), # count rows in each cluster 847 example_cells=("example_cell_id", lambda x: ",".join(x)), 848 hcpc=("hcpc", lambda x: x.iloc[0]), 849 ) 850 data = data.reset_index() # Do NOT drop, index is "clust" 851 # Create new columns 852 metadata = pd.DataFrame( 853 { 854 "count": data["count"], 855 "example_cells": data["example_cells"], 856 "clust": data["clust"].astype(int), 857 "hcpc": data["hcpc"].astype(int), 858 "id": data["clust"].astype(int).astype(str), 859 "cccluster": "0", # Dummy value 860 "ccdistance": 0.0, # Dummy value 861 "rownum": list(range(len(data))), 862 "framegroup": 0, # Dummy value 863 } 864 ) 865 data = pd.concat([data.loc[:, avg_cols], metadata], axis=1) 866 # Save the data 867 data.to_csv(os.path.join(output_path, f"{file_stub}.csv"), index=False) 868 pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data)
A class that holds data for a large number of events, making it easy to analyze and manipulate many events at once. It stores the same information as a list of Event objects, but split into separate info, metadata, and features DataFrames.
255 def __init__( 256 self, 257 info: pd.DataFrame = None, 258 metadata: pd.DataFrame = None, 259 features: pd.DataFrame = None, 260 ): 261 # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y", "size" 262 if info is not None and ( 263 not all(col in info.columns for col in self.INFO_COLUMNS) 264 or len(info.columns) != 6 265 ): 266 raise ValueError( 267 "EventArray.info must have columns 'slide_id', 'tile', 'roi', 'x', 'y', 'size'" 268 ) 269 # All DataFrames must all have the same number of rows 270 if metadata is not None and (info is None or len(info) != len(metadata)): 271 raise ValueError( 272 "If EventArray.metadata is not None, it should match rows with .info" 273 ) 274 if features is not None and (info is None or len(info) != len(features)): 275 raise ValueError( 276 "If EventArray.features is not None, it should match rows with .info" 277 ) 278 self.info = info 279 self.metadata = metadata 280 self.features = features
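As a quick orientation, here is a minimal sketch of constructing an EventArray directly from DataFrames; the slide ID, coordinates, and metadata column below are made up for illustration:

    import pandas as pd
    from csi_images.csi_events import EventArray

    # info must contain exactly these six columns
    info = pd.DataFrame({
        "slide_id": ["SLIDE001", "SLIDE001"],
        "tile": [10, 11],
        "roi": [0, 0],
        "x": [512, 48],
        "y": [256, 900],
        "size": [12, 12],
    })
    # Optional metadata/features must have one row per event
    metadata = pd.DataFrame({"label": ["interesting", "common"]})
    events = EventArray(info=info, metadata=metadata)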
329 def sort(self, by: str | list[str], ascending: bool = True) -> typing.Self: 330 """ 331 Sort the EventArray by a column in the info, metadata, or features DataFrames. 332 :param by: name of the column to sort by. 333 :param ascending: whether to sort in ascending order. 334 :return: 335 """ 336 everything = pd.concat([self.info, self.metadata, self.features], axis=1) 337 order = everything.sort_values(by=by, ascending=ascending).index 338 self.info = self.info.loc[order].reset_index(drop=True) 339 if self.metadata is not None: 340 self.metadata = self.metadata.loc[order].reset_index(drop=True) 341 if self.features is not None: 342 self.features = self.features.loc[order].reset_index(drop=True) 343 return self
Sort the EventArray by a column in the info, metadata, or features DataFrames.
Parameters
- by: name of the column to sort by.
- ascending: whether to sort in ascending order.
Returns
the EventArray itself, sorted in place.
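For example, with the EventArray built above:

    # Order events by tile, then x position (both are info columns)
    events.sort(by=["tile", "x"])
    # Or by a metadata/features column, descending
    events.sort(by="label", ascending=False)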
345 def add_metadata(self, new_metadata: pd.DataFrame) -> None: 346 """ 347 Add metadata to the EventArray. 348 :param new_metadata: the metadata to add. 349 """ 350 if self.metadata is None: 351 if len(self) != len(new_metadata): 352 raise ValueError("New metadata does not match length of existing info") 353 self.metadata = new_metadata 354 else: 355 # Add the new metadata columns to the existing metadata 356 self.metadata = pd.concat([self.metadata, new_metadata], axis=1)
Add metadata to the EventArray.
Parameters
- new_metadata: the metadata to add.
358 def add_features(self, new_features: pd.DataFrame) -> None: 359 """ 360 Add features to the EventArray. 361 :param new_features: the features to add. 362 """ 363 if self.features is None: 364 if len(self) != len(new_features): 365 raise ValueError("New features do not match length of existing info") 366 self.features = new_features 367 else: 368 # Add the new feature columns to the existing features 369 self.features = pd.concat([self.features, new_features], axis=1)
Add features to the EventArray.
Parameters
- new_features: the features to add.
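A short sketch of both add_metadata() and add_features(), continuing the example above (column names are illustrative):

    # Each new DataFrame needs one row per existing event
    events.add_metadata(pd.DataFrame({"reviewer": ["A", "B"]}))
    events.add_features(pd.DataFrame({"mean_intensity": [0.42, 0.77]}))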
371 @classmethod 372 def from_list(cls, events: list[typing.Self]) -> typing.Self: 373 """ 374 Combine EventArrays in a list into a single EventArray. 375 :param events: the new list of events. 376 """ 377 all_info = [] 378 all_metadata = [] 379 all_features = [] 380 for event_array in events: 381 # Skip empty EventArrays 382 if event_array.info is not None: 383 all_info.append(event_array.info) 384 if event_array.metadata is not None: 385 all_metadata.append(event_array.metadata) 386 if event_array.features is not None: 387 all_features.append(event_array.features) 388 if len(all_info) == 0: 389 return EventArray() 390 else: 391 all_info = pd.concat(all_info, ignore_index=True) 392 if len(all_metadata) == 0: 393 all_metadata = None 394 else: 395 all_metadata = pd.concat(all_metadata, ignore_index=True) 396 if len(all_features) == 0: 397 all_features = None 398 else: 399 all_features = pd.concat(all_features, ignore_index=True) 400 401 return EventArray(all_info, all_metadata, all_features)
Combine EventArrays in a list into a single EventArray.
Parameters
- events: the list of EventArrays to combine.
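For example, to merge per-tile results (the variable names below are hypothetical EventArrays):

    combined = EventArray.from_list([events_tile_0, events_tile_1])
    # Empty EventArrays in the list are skipped; an empty list yields an empty EventArray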
403 @classmethod 404 def from_events(cls, events: list[Event]) -> typing.Self: 405 """ 406 Set the events in the EventArray to a new list of events. 407 :param events: the new list of events. 408 """ 409 # Return an empty array if we were passed nothing 410 if events is None or len(events) == 0: 411 return EventArray() 412 # Otherwise, grab the info 413 info = pd.DataFrame( 414 { 415 "slide_id": [event.scan.slide_id for event in events], 416 "tile": [event.tile.n for event in events], 417 "roi": [event.tile.n_roi for event in events], 418 "x": [event.x for event in events], 419 "y": [event.y for event in events], 420 "size": [event.size for event in events], 421 } 422 ) 423 metadata_list = [event.metadata for event in events] 424 # Iterate through and ensure that all metadata is the same shape 425 for metadata in metadata_list: 426 if type(metadata) != type(metadata_list[0]): 427 raise ValueError("All metadata must be the same type.") 428 if metadata is not None and metadata.shape != metadata_list[0].shape: 429 raise ValueError("All metadata must be the same shape.") 430 if metadata_list[0] is None: 431 metadata = None 432 else: 433 metadata = pd.DataFrame(metadata_list) 434 features_list = [event.features for event in events] 435 # Iterate through and ensure that all features are the same shape 436 for features in features_list: 437 if type(features) != type(features_list[0]): 438 raise ValueError("All features must be the same type.") 439 if features is not None and features.shape != features_list[0].shape: 440 raise ValueError("All features must be the same shape.") 441 if features_list[0] is None: 442 features = None 443 else: 444 features = pd.DataFrame(features_list) 445 return EventArray(info=info, metadata=metadata, features=features)
Create an EventArray from a list of Event objects.
Parameters
- events: the list of Event objects to convert.
447 def to_events( 448 self, 449 scans: list[Scan], 450 ignore_missing_scans=True, 451 ignore_metadata=False, 452 ignore_features=False, 453 ) -> list[Event]: 454 """ 455 Get the events in the EventArray as a list of events. 456 :param scans: the scans that the events belong to. Pass an empty list if you 457 don't care about scan metadata. 458 :param ignore_missing_scans: whether to create blank scans for events without scans. 459 :param ignore_metadata: whether to ignore metadata or not 460 :param ignore_features: whether to ignore features or not 461 :return: 462 """ 463 events = [] 464 for i in range(len(self.info)): 465 # Determine the associated scan 466 scan = None 467 for s in scans: 468 if s.slide_id == self.info["slide_id"][i]: 469 scan = s 470 break 471 if scan is None: 472 if ignore_missing_scans: 473 # Create a placeholder scan if the scan is missing 474 scan = Scan.make_placeholder( 475 self.info["slide_id"][i], 476 self.info["tile"][i], 477 self.info["roi"][i], 478 ) 479 else: 480 raise ValueError( 481 f"Scan {self.info['slide_id'][i]} not found for event {i}." 482 ) 483 # Add to the list 484 events.append( 485 Event( 486 scan, 487 Tile(scan, self.info["tile"][i], self.info["roi"][i]), 488 self.info["x"][i], 489 self.info["y"][i], 490 size=self.info["size"][i], 491 metadata=None if ignore_metadata else self.metadata.loc[i], 492 features=None if ignore_features else self.features.loc[i], 493 ) 494 ) 495 return events
Get the events in the EventArray as a list of events.
Parameters
- scans: the scans that the events belong to. Pass an empty list if you don't care about scan metadata.
- ignore_missing_scans: whether to create placeholder scans for events whose scan is not provided (otherwise, a missing scan raises a ValueError).
- ignore_metadata: whether to skip attaching metadata to the events.
- ignore_features: whether to skip attaching features to the events.
Returns
a list of Event objects.
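A hedged round-trip sketch for from_events()/to_events(), assuming `scan` and `tile` are Scan and Tile objects you have already loaded:

    from csi_images.csi_events import Event, EventArray

    single = Event(scan, tile, x=512, y=256)
    array = EventArray.from_events([single])
    # This array has no metadata or features, so skip reattaching them
    restored = array.to_events([scan], ignore_metadata=True, ignore_features=True)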
497 def to_dataframe(self) -> pd.DataFrame: 498 """ 499 Convert all the data in the EventArray to a single DataFrame. 500 :return: a DataFrame with all the data in the EventArray. 501 """ 502 # Make a copy of the info DataFrame and prepend "info_" to the column names 503 output = self.info.copy() 504 output.columns = [f"info_{col}" for col in output.columns] 505 # Combine with the metadata and prepend "metadata_" to the column names 506 if self.metadata is not None: 507 metadata = self.metadata.copy() 508 metadata.columns = [f"metadata_{col}" for col in metadata.columns] 509 output = pd.concat([output, metadata], axis=1) 510 # Combine with the features and prepend "features_" to the column names 511 if self.features is not None: 512 features = self.features.copy() 513 features.columns = [f"features_{col}" for col in features.columns] 514 output = pd.concat([output, features], axis=1) 515 return output
Convert all the data in the EventArray to a single DataFrame.
Returns
a DataFrame with all the data in the EventArray.
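For example, with the EventArray sketched earlier, the combined DataFrame's columns are prefixed by their source:

    df = events.to_dataframe()
    print(list(df.columns))
    # e.g. ['info_slide_id', 'info_tile', 'info_roi', 'info_x', 'info_y', 'info_size',
    #       'metadata_label', 'metadata_reviewer', 'features_mean_intensity']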
517 @classmethod 518 def from_dataframe(cls, df) -> typing.Self: 519 """ 520 From a single, special DataFrame, create an EventArray. 521 :return: an EventArray with the data from the DataFrame. 522 """ 523 # Split the columns into info, metadata, and features and strip prefix 524 info = df[[col for col in df.columns if col.startswith("info_")]].copy() 525 info.columns = [col.replace("info_", "") for col in info.columns] 526 if info.size == 0: 527 info = None 528 metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy() 529 metadata.columns = [col.replace("metadata_", "") for col in metadata.columns] 530 if metadata.size == 0: 531 metadata = None 532 features = df[[col for col in df.columns if col.startswith("features_")]].copy() 533 features.columns = [col.replace("features_", "") for col in features.columns] 534 if features.size == 0: 535 features = None 536 return cls(info=info, metadata=metadata, features=features)
From a single DataFrame with "info_", "metadata_", and "features_" column prefixes (as produced by to_dataframe()), create an EventArray.
Returns
an EventArray with the data from the DataFrame.
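Reading such a DataFrame back (for example, the one produced in the to_dataframe() example above) reconstructs the EventArray:

    # Round-trips with to_dataframe(): info/metadata/features are recovered by prefix
    same_events = EventArray.from_dataframe(df)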
538 def save_csv(self, output_path: str) -> bool: 539 """ 540 Save the events to a CSV file, including metadata and features. 541 :param output_path: 542 :return: 543 """ 544 self.to_dataframe().to_csv(output_path, index=False) 545 return os.path.exists(output_path)
Save the events to a CSV file, including metadata and features.
Parameters
- output_path: path of the CSV file to write.
Returns
True if the file exists after writing.
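For example (the path is illustrative):

    ok = events.save_csv("events.csv")  # returns True if the file now exists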
547 @classmethod 548 def load_csv(cls, input_path: str) -> typing.Self: 549 """ 550 Load the events from a CSV file, including metadata and features. 551 :param input_path: 552 :return: 553 """ 554 # Load the CSV file 555 df = pd.read_csv(input_path) 556 return cls.from_dataframe(df)
Load the events from a CSV file, including metadata and features.
Parameters
- input_path: path of the CSV file to read.
Returns
an EventArray loaded from the CSV file.
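Loading the same file back (path again illustrative):

    events = EventArray.load_csv("events.csv")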
558 def save_hdf5(self, output_path: str) -> bool: 559 """ 560 Save the events to an HDF5 file, including metadata and features. 561 Uses the pandas-provided HDF5 functions for ease, and external compatibility, 562 though these files are slightly harder to view in HDFView or similar. 563 :param output_path: 564 :return: 565 """ 566 # Open the output_path as an HDF5 file 567 with pd.HDFStore(output_path) as store: 568 # Store the dataframes in the HDF5 file 569 if self.info is not None: 570 store.put("info", self.info, index=False) 571 if self.metadata is not None: 572 store.put("metadata", self.metadata, index=False) 573 if self.features is not None: 574 store.put("features", self.features, index=False) 575 return os.path.exists(output_path)
Save the events to an HDF5 file, including metadata and features. Uses the pandas-provided HDF5 functions for ease of use and external compatibility, though the resulting files are slightly harder to view in HDFView or similar tools.
Parameters
- output_path: path of the HDF5 file to write.
Returns
True if the file exists after writing.
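For example (pandas' HDF5 support requires the optional PyTables package; the path is illustrative):

    ok = events.save_hdf5("events.h5")  # returns True if the file now exists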
577 @classmethod 578 def load_hdf5(cls, input_path: str) -> typing.Self: 579 """ 580 Load the events from an HDF5 file, including metadata and features. 581 :param input_path: 582 :return: 583 """ 584 # Open the input_path as an HDF5 file 585 with pd.HDFStore(input_path) as store: 586 # Load the dataframes from the HDF5 file 587 info = store.get("info") if "info" in store else None 588 metadata = store.get("metadata") if "metadata" in store else None 589 features = store.get("features") if "features" in store else None 590 return cls(info=info, metadata=metadata, features=features)
Load the events from an HDF5 file, including metadata and features.
Parameters
- input_path: path of the HDF5 file to read.
Returns
an EventArray loaded from the HDF5 file.
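And the corresponding load:

    events = EventArray.load_hdf5("events.h5")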
592 @classmethod 593 def load_ocular( 594 cls, 595 input_path: str, 596 event_type="cells", 597 cell_data_files=( 598 "rc-final1.rds", 599 "rc-final2.rds", 600 "rc-final3.rds", 601 "rc-final4.rds", 602 "ocular_interesting.rds", 603 ), 604 others_data_files=( 605 "others-final1.rds", 606 "others-final2.rds", 607 "others-final3.rds", 608 "others-final4.rds", 609 ), 610 atlas_data_files=( 611 "ocular_interesting.rds", 612 "ocular_not_interesting.rds", 613 ), 614 merge_event_data_with_stats=True, 615 filter_and_generate_morphs=True, 616 drop_common_events=True, 617 log=None, 618 ) -> typing.Self: 619 """ 620 621 :param input_path: 622 :param event_type: 623 :param cell_data_files: 624 :param others_data_files: 625 :param atlas_data_files: 626 :param merge_event_data_with_stats: 627 :param filter_and_generate_morphs: 628 :param drop_common_events: 629 :param log: 630 :return: 631 """ 632 # Check if the input path is a directory or a file 633 if os.path.isfile(input_path): 634 data_files = [os.path.basename(input_path)] 635 input_path = os.path.dirname(input_path) 636 if event_type == "cells": 637 data_files = cell_data_files 638 elif event_type == "others": 639 data_files = others_data_files 640 else: 641 raise ValueError("Invalid event type.") 642 643 # Load the data from the OCULAR files 644 file_data = {} 645 for file in data_files: 646 file_path = os.path.join(input_path, file) 647 if not os.path.isfile(file_path): 648 if log is not None: 649 log.warning(f"{file} not found for in {input_path}") 650 continue 651 file_data[file] = pyreadr.read_r(file_path) 652 # Get the DataFrame associated with None (pyreadr dict quirk) 653 file_data[file] = file_data[file][None] 654 if len(file_data[file]) == 0: 655 # File gets dropped from the dict 656 file_data.pop(file) 657 if log is not None: 658 log.warning(f"{file} has no cells") 659 continue 660 661 if log is not None: 662 log.debug(f"{file} has {len(file_data[file])} cells") 663 664 # Drop common cells if requested and in this file 665 if file in atlas_data_files and drop_common_events: 666 common_cell_indices = ( 667 file_data[file]["catalogue_classification"] == "common_cell" 668 ) 669 if log is not None: 670 log.debug( 671 f"Dropping {int(pd.Series.sum(common_cell_indices))}" 672 f"common cells from {file}" 673 ) 674 file_data[file] = file_data[file][common_cell_indices == False] 675 676 if len(file_data[file]) == 0: 677 # File gets dropped from the dict 678 file_data.pop(file) 679 if log is not None: 680 log.warning(f"{file} has no cells after dropping common cells") 681 continue 682 683 # Extract frame_id and cell_id 684 # DAPI- events already have frame_id cell_id outside rowname 685 if event_type == "cells": 686 file_data[file]["rowname"] = file_data[file]["rowname"].astype("str") 687 # get frame_id cell_id from rownames column and split into two columns 688 split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True) 689 if len(split_res.columns) != 2: 690 log.warning( 691 f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}' 692 ) 693 # then assign it back to the dataframe 694 file_data[file][["frame_id", "cell_id"]] = split_res.astype("int") 695 # reset indexes since they can cause NaN values in concat 696 file_data[file].reset_index(drop=True, inplace=True) 697 698 # Merge the data from all files 699 if len(file_data) == 0: 700 return EventArray() 701 elif len(file_data) == 1: 702 data = [file_data[file] for file in file_data.keys()][0] 703 else: 704 data = pd.concat(file_data.values()) 705 706 if log is not None: 
707 log.debug(f"Gathered a total of {len(data)} events") 708 709 # Others is missing the "slide_id". Insert it right before "frame_id" column 710 if event_type == "others" and "slide_id" not in data.columns: 711 if os.path.basename(input_path) == "ocular": 712 slide_id = os.path.basename(os.path.dirname(input_path)) 713 else: 714 slide_id = "UNKNOWN" 715 data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id) 716 717 # Sort according to ascending cell_id to keep the original, which is in manual_df 718 data = data.sort_values(by=["cell_id"], ascending=True) 719 # Filter out duplicates by x & y 720 data = data.assign( 721 unique_id=data["slide_id"] 722 + "_" 723 + data["frame_id"].astype(str) 724 + "_" 725 + data["cellx"].astype(int).astype(str) 726 + "_" 727 + data["celly"].astype(int).astype(str) 728 ) 729 data = data.drop_duplicates(subset=["unique_id"], keep="first", inplace=False) 730 # Filter out duplicates by cell_id 731 data = data.assign( 732 unique_id=data["slide_id"] 733 + "_" 734 + data["frame_id"].astype(str) 735 + "_" 736 + data["cell_id"].astype(str) 737 ) 738 data.reset_index(drop=True, inplace=True) 739 # All columns up to "slide_id" are features; drop the "slide_id" 740 features = data.loc[:, :"slide_id"].iloc[:, :-1] 741 data = data.loc[:, "slide_id":] 742 # Grab the info columns 743 info = data[["slide_id", "frame_id", "cellx", "celly"]] 744 info.columns = ["slide_id", "tile", "x", "y"] 745 info = info.assign( 746 roi=0, # OCULAR only works on 1 ROI, as far as known 747 size=25, # Static, for later montaging 748 ) 749 info = info[["slide_id", "tile", "roi", "x", "y", "size"]] 750 # Metadata has duplicate columns for later convenience 751 metadata = data 752 return EventArray(info, metadata, features)
Load events from OCULAR output files (.rds) into an EventArray.
Parameters
- input_path: path to the OCULAR output directory containing the .rds files.
- event_type: "cells" or "others"; selects which set of data files to load.
- cell_data_files: file names to load when event_type is "cells".
- others_data_files: file names to load when event_type is "others".
- atlas_data_files: atlas file names; when drop_common_events is True, events classified as common cells in these files are dropped.
- merge_event_data_with_stats:
- filter_and_generate_morphs:
- drop_common_events: whether to drop events classified as common cells when loading the atlas files.
- log: optional logger used for warning and debug messages.
Returns
an EventArray with the loaded events.
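A hedged usage sketch; the directory path below is illustrative of an OCULAR output folder:

    # Load classified cells from an OCULAR results directory
    cells = EventArray.load_ocular("/path/to/slide/ocular")
    # Or load the "others" events from the same directory
    others = EventArray.load_ocular("/path/to/slide/ocular", event_type="others")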
754 def save_ocular(self, output_path: str, event_type: str = "cells") -> bool: 755 """ 756 Save the events to an OCULAR file. Relies on the dataframe originating 757 from an OCULAR file (same columns; duplicate metadata/info). 758 :param output_path: 759 :return: 760 """ 761 if event_type == "cells": 762 file_stub = "rc-final" 763 elif event_type == "others": 764 file_stub = "others-final" 765 else: 766 raise ValueError("Invalid event type. Must be cells or others.") 767 768 # Check for the "ocular_interesting" column 769 if event_type == "cells" and "ocular_interesting" in self.metadata.columns: 770 interesting = self.metadata["ocular_interesting"] 771 # Split the metadata into interesting and regular 772 # Interesting will only have dropped columns, with no internal changes 773 interesting = pd.concat( 774 [self.features[interesting], self.metadata[interesting]], axis=1 775 ).reset_index(drop=True) 776 # Data will get some columns changed, so copy it 777 data = ( 778 pd.concat( 779 [self.features[~interesting], self.metadata[~interesting]], axis=1 780 ) 781 .copy(deep=True) 782 .reset_index(drop=True) 783 .drop(columns=["ocular_interesting"]) 784 ) 785 786 # Drop particular columns for "interesting" 787 interesting = interesting.drop( 788 [ 789 "clust", 790 "hcpc", 791 "frame_id", 792 "cell_id", 793 "unique_id", 794 "ocular_interesting", 795 ], 796 axis=1, 797 ) 798 # Save both .csv and .rds 799 interesting.to_csv( 800 os.path.join(output_path, "ocular_interesting.csv"), index=False 801 ) 802 pyreadr.write_rds( 803 os.path.join(output_path, "ocular_interesting.rds"), interesting 804 ) 805 else: 806 # Get all data, copying it 807 data = ( 808 pd.concat([self.features, self.metadata], axis=1) 809 .copy(deep=True) 810 .reset_index(drop=True) 811 ) 812 813 # Split based on cluster number to conform to *-final[1-4].rds 814 n_clusters = max(data["clust"]) + 1 815 split_idx = [round(i * n_clusters / 4) for i in range(5)] 816 for i in range(4): 817 subset = (split_idx[i] <= data["clust"]) & ( 818 data["clust"] < split_idx[i + 1] 819 ) 820 subset = data[subset].reset_index(drop=True) 821 subset["hcpc"] = i + 1 822 pyreadr.write_rds( 823 os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset 824 ) 825 826 # Create new example cell strings 827 data["example_cell_id"] = ( 828 data["slide_id"] 829 + " " 830 + data["frame_id"].astype(str) 831 + " " 832 + data["cell_id"].astype(str) 833 + " " 834 + data["cellx"].astype(int).astype(str) 835 + " " 836 + data["celly"].astype(int).astype(str) 837 ) 838 # Find averagable data columns 839 if "cellcluster_id" in data.columns: 840 avg_cols = data.columns[: data.columns.get_loc("cellcluster_id")].tolist() 841 else: 842 avg_cols = data.columns[: data.columns.get_loc("slide_id")].tolist() 843 # Group by cluster and average 844 data = data.groupby("clust").agg( 845 **{col: (col, "mean") for col in avg_cols}, 846 count=("clust", "size"), # count rows in each cluster 847 example_cells=("example_cell_id", lambda x: ",".join(x)), 848 hcpc=("hcpc", lambda x: x.iloc[0]), 849 ) 850 data = data.reset_index() # Do NOT drop, index is "clust" 851 # Create new columns 852 metadata = pd.DataFrame( 853 { 854 "count": data["count"], 855 "example_cells": data["example_cells"], 856 "clust": data["clust"].astype(int), 857 "hcpc": data["hcpc"].astype(int), 858 "id": data["clust"].astype(int).astype(str), 859 "cccluster": "0", # Dummy value 860 "ccdistance": 0.0, # Dummy value 861 "rownum": list(range(len(data))), 862 "framegroup": 0, # Dummy value 863 } 864 ) 865 data = 
pd.concat([data.loc[:, avg_cols], metadata], axis=1) 866 # Save the data 867 data.to_csv(os.path.join(output_path, f"{file_stub}.csv"), index=False) 868 pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data)
Save the events to OCULAR-format files (.csv and .rds). Relies on the DataFrame originating from an OCULAR file (same columns; duplicate metadata/info).
Parameters
- output_path: directory to write the OCULAR files into.
- event_type: "cells" or "others"; determines the output file names.
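A usage sketch, assuming the EventArray was originally loaded with load_ocular() (the output directory is illustrative):

    # Writes rc-final1.rds ... rc-final4.rds, rc-final.csv, and rc-final.rds
    # (plus ocular_interesting.csv/.rds when that column is present)
    events.save_ocular("/path/to/output/ocular")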