csi_images.csi_events
Contains the Event class, which represents a single event in a scan. The Event class optionally holds metadata and features. Lists of events with similar metadata or features can be combined into DataFrames for analysis.
The Event class holds the position of the event in the frame, which can be converted to positions in the scanner or slide coordinate systems. See the csi_images.csi_scans documentation page for more information on the coordinate systems.
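As a minimal sketch of the intended workflow (the slide ID and coordinates are hypothetical, and Scan.make_placeholder is used, as in EventArray.to_events, so that no scan files are needed):

    from csi_images.csi_scans import Scan
    from csi_images.csi_tiles import Tile
    from csi_images.csi_events import Event, EventArray

    scan = Scan.make_placeholder("EXAMPLE-SLIDE", 0, 0)  # hypothetical slide ID
    tile = Tile(scan, 0, 0)
    events = [Event(scan, tile, 100, 200), Event(scan, tile, 300, 400)]

    array = EventArray.from_events(events)
    print(array.to_dataframe())  # one row per event: info_slide_id, info_tile, ...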
1""" 2Contains the Event class, which represents a single event in a scan. 3The Event class optionally holds metadata and features. Lists of events with 4similar metadata or features can be combined into DataFrames for analysis. 5 6The Event class holds the position of the event in the frame, which can be converted 7to the position in the scanner or slide coordinate positions. See the 8csi_utils.csi_scans documentation page for more information on the coordinate systems. 9""" 10 11import os 12import math 13import typing 14 15import numpy as np 16import pandas as pd 17 18from .csi_scans import Scan 19from .csi_tiles import Tile 20from .csi_frames import Frame 21 22# Optional dependencies; will raise errors in particular functions if not installed 23try: 24 import pyreadr 25except ImportError: 26 pyreadr = None 27 28 29class Event: 30 """ 31 A class that represents a single event in a scan, making it easy to evaluate 32 singular events. Required metadata is exposed as attributes, and optional 33 metadata and features are stored as DataFrames. 34 """ 35 36 SCAN_TO_SLIDE_TRANSFORM = { 37 # Axioscan zero is in the top-right corner instead of top-left 38 Scan.Type.AXIOSCAN7: np.array( 39 [ 40 [1, 0, 75000], 41 [0, 1, 0], 42 [0, 0, 1], 43 ] 44 ), 45 # BZScanner coordinates are a special kind of messed up: 46 # - The slide is upside-down. 47 # - The slide is oriented vertically, with the barcode at the bottom. 48 # - Tiles are numbered from the top-right 49 Scan.Type.BZSCANNER: np.array( 50 [ 51 [0, -1, 75000], 52 [-1, 0, 25000], 53 [0, 0, 1], 54 ] 55 ), 56 } 57 """ 58 Homogeneous transformation matrices for converting between scanner and slide 59 coordinates. The matrices are 3x3, with the final column representing the 60 translation in micrometers (um). For more information, see 61 [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations). 62 63 Transformations are nominal, and accuracy is not guaranteed; this is due to 64 imperfections in slides and alignment in the scanners. Units are in micrometers. 65 """ 66 67 def __init__( 68 self, 69 scan: Scan, 70 tile: Tile, 71 x: int, 72 y: int, 73 size: int = 12, # End-to-end size in pixels 74 metadata: pd.Series = None, 75 features: pd.Series = None, 76 ): 77 self.scan = scan 78 self.tile = tile 79 self.x = x 80 self.y = y 81 self.size = size 82 self.metadata = metadata 83 self.features = features 84 85 def __repr__(self) -> str: 86 return f"{self.scan.slide_id}-{self.tile.n}-{self.x}-{self.y}" 87 88 def __eq__(self, other) -> bool: 89 return self.__repr__() == other.__repr__() 90 91 def __lt__(self, other): 92 return self.__repr__() < other.__repr__() 93 94 def get_scan_position(self) -> tuple[float, float]: 95 """ 96 Get the position of the event in the scanner's coordinate frame. 97 :return: the scan position of the event in micrometers (um). 98 """ 99 # Get overall pixel position 100 pixel_x = self.x + (self.scan.tile_width_px * self.tile.x) 101 pixel_y = self.y + (self.scan.tile_height_px * self.tile.y) 102 # Convert to micrometers 103 x_um = pixel_x * self.scan.pixel_size_um 104 y_um = pixel_y * self.scan.pixel_size_um 105 # Add the scan's origin in the scanner frame 106 x_um += self.scan.roi[self.tile.n_roi].origin_x_um 107 y_um += self.scan.roi[self.tile.n_roi].origin_y_um 108 return x_um, y_um 109 110 def get_slide_position(self) -> tuple[float, float]: 111 """ 112 Get the slide position of the event in micrometers (um). 113 :return: the slide position of the event. 
114 """ 115 # Turn scan_position into a 3x1 vector 116 scan_position = self.get_scan_position() 117 scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]]) 118 119 # Multiply by the appropriate homogeneous matrix 120 if self.scan.scanner_id.startswith(self.scan.Type.AXIOSCAN7.value): 121 transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.AXIOSCAN7] 122 elif self.scan.scanner_id.startswith(self.scan.Type.BZSCANNER.value): 123 transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.BZSCANNER] 124 else: 125 raise ValueError(f"Scanner type {self.scan.scanner_id} not supported.") 126 slide_position = np.matmul(transform, scan_position) 127 return float(slide_position[0][0]), float(slide_position[1][0]) 128 129 def crop_images( 130 self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True 131 ) -> list[np.ndarray]: 132 """ 133 Get the event crops from the frame images. Called "get" because it does not 134 need to extract anything; it is very quick for extracting multiple events from 135 the same tile. 136 Use this if you're interested in many events. 137 :param images: the frame images. 138 :param crop_size: the square size of the image crop to get for this event. 139 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 140 :return: image_size x image_size crops of the event in the provided frames. If 141 the event is too close to the edge, the crop will be smaller and not centered. 142 """ 143 # Convert a crop size in micrometers to pixels 144 if not in_pixels: 145 crop_size = round(crop_size / self.scan.pixel_size_um) 146 # Find the crop bounds 147 bounds = [ 148 self.x - crop_size // 2, 149 self.y - crop_size // 2, 150 self.x + math.ceil(crop_size / 2), 151 self.y + math.ceil(crop_size / 2), 152 ] 153 # Determine how much the bounds violate the image size 154 displacements = [ 155 max(0, -bounds[0]), 156 max(0, -bounds[1]), 157 max(0, bounds[2] - images[0].shape[1]), 158 max(0, bounds[3] - images[0].shape[0]), 159 ] 160 # Cap off the bounds 161 bounds = [ 162 max(0, bounds[0]), 163 max(0, bounds[1]), 164 min(images[0].shape[1], bounds[2]), 165 min(images[0].shape[0], bounds[3]), 166 ] 167 168 # Crop the images 169 cropped_images = [] 170 for image in images: 171 # Create a blank image of the right size 172 cropped_image = np.zeros((crop_size, crop_size), dtype=image.dtype) 173 174 # Insert the cropped image into the blank image, leaving a black buffer 175 # around the edges if the crop would go beyond the original image bounds 176 cropped_image[ 177 displacements[1] : crop_size - displacements[3], 178 displacements[0] : crop_size - displacements[2], 179 ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]] 180 cropped_images.append(cropped_image) 181 return cropped_images 182 183 def extract_images( 184 self, crop_size: int = 100, in_pixels: bool = True 185 ) -> list[np.ndarray]: 186 """ 187 Extract the images from the scan and tile, reading from the file. Called 188 "extract" because it must read and extract the images from file, which is slow. 189 Use this if you're interested in only a few events, as it is inefficient when 190 reading multiple events from the same tile. 191 :param crop_size: the square size of the image crop to get for this event. 192 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 193 :return: a list of cropped images from the scan in the order of the channels. 
194 """ 195 frames = Frame.get_frames(self.tile) 196 images = [frame.get_image() for frame in frames] 197 return self.crop_images(images, crop_size, in_pixels) 198 199 @classmethod 200 def extract_images_for_list( 201 cls, 202 events: list[typing.Self], 203 crop_size: int | list[int] = None, 204 in_pixels: bool = True, 205 ) -> list[list[np.ndarray]]: 206 """ 207 Get the images for a list of events, ensuring that there is no wasteful reading 208 of the same tile multiple times. This function is more efficient than calling 209 extract_event_images for each event. 210 TODO: test this function 211 :param events: the events to extract images for. 212 :param crop_size: the square size of the image crop to get for this event. 213 Defaults to four times the size of the event. 214 :param in_pixels: whether the crop size is in pixels or micrometers. 215 Defaults to pixels, and is ignored if crop_size is None. 216 :return: a list of lists of cropped images for each event. 217 """ 218 if len(events) == 0: 219 return [] 220 221 # Populate a crop size if none provided 222 if crop_size is None: 223 crop_size = [4 * event.size for event in events] 224 in_pixels = True 225 # Propagate a constant crop size 226 elif isinstance(crop_size, int): 227 crop_size = [crop_size] * len(events) 228 229 # Sort the events by tile; use a shallow copy to avoid modifying the original 230 order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__())) 231 232 # Allocate the list to size 233 images = [None] * len(events) 234 last_tile = None 235 frame_images = None # Holds large numpy arrays, so expensive to compare 236 # Iterate through in sorted order 237 for i in order: 238 if last_tile != events[i].tile: 239 # Gather the frame images, preserving them for the next event 240 frames = Frame.get_frames(events[i].tile) 241 frame_images = [frame.get_image() for frame in frames] 242 243 last_tile = events[i].tile 244 # Use the frame images to crop the event images 245 # Preserve the original order using order[i] 246 images[i] = events[i].crop_images(frame_images, crop_size[i], in_pixels) 247 return images 248 249 250class EventArray: 251 """ 252 A class that holds a large number of events' data, making it easy to analyze and 253 manipulate many events at once. A more separated version of the Event class. 
254 """ 255 256 INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y", "size"] 257 258 def __init__( 259 self, 260 info: pd.DataFrame = None, 261 metadata: pd.DataFrame = None, 262 features: pd.DataFrame = None, 263 ): 264 # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y", "size" 265 if info is not None and ( 266 not all( 267 col in info.columns 268 for col in ["slide_id", "tile", "roi", "x", "y", "size"] 269 ) 270 or len(info.columns) != 6 271 ): 272 raise ValueError( 273 "EventArray.info must have columns 'slide_id', 'tile', 'roi', 'x', 'y', 'size'" 274 ) 275 # All DataFrames must all have the same number of rows 276 if metadata is not None and (info is None or len(info) != len(metadata)): 277 raise ValueError( 278 "If EventArray.metadata is not None, it should match rows with .info" 279 ) 280 if features is not None and (info is None or len(info) != len(features)): 281 raise ValueError( 282 "If EventArray.features is not None, it should match rows with .info" 283 ) 284 self.info = info 285 self.metadata = metadata 286 self.features = features 287 288 def __len__(self) -> int: 289 # Convenience method to get the number of events 290 if self.info is None: 291 return 0 292 else: 293 return len(self.info) 294 295 def __eq__(self, other): 296 is_equal = True 297 # Parse all possibilities for info 298 if isinstance(self.info, pd.DataFrame): 299 if isinstance(other.info, pd.DataFrame): 300 is_equal = self.info.equals(other.info) 301 if not is_equal: 302 return False 303 else: 304 return False 305 elif self.info is None: 306 if other.info is not None: 307 return False 308 309 # Parse all possibilities for metadata 310 if isinstance(self.metadata, pd.DataFrame): 311 if isinstance(other.metadata, pd.DataFrame): 312 is_equal = self.metadata.equals(other.metadata) 313 if not is_equal: 314 return False 315 else: 316 return False 317 elif self.metadata is None: 318 if other.metadata is not None: 319 return False 320 321 # Parse all possibilities for features 322 if isinstance(self.features, pd.DataFrame): 323 if isinstance(other.features, pd.DataFrame): 324 is_equal = self.features.equals(other.features) 325 if not is_equal: 326 return False 327 else: 328 return False 329 elif self.features is None: 330 if other.features is not None: 331 return False 332 333 return is_equal 334 335 def get_sort_order(self, by: str | list[str], ascending: bool | list[bool] = True): 336 """ 337 Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames. 338 :param by: name of the column(s) to sort by. 339 :param ascending: whether to sort in ascending order; can be a list to match by 340 :return: the order of the indices to sort by. 341 """ 342 columns = self.get(by) 343 return columns.sort_values(by=by, ascending=ascending).index 344 345 def sort( 346 self, by: str | list[str], ascending: bool | list[bool] = True 347 ) -> typing.Self: 348 """ 349 Sort the EventArray by column(s) in the info, metadata, or features DataFrames. 350 :param by: name of the column(s) to sort by. 351 :param ascending: whether to sort in ascending order; can be a list to match by 352 :return: a new, sorted EventArray. 
353 """ 354 order = self.get_sort_order(by, ascending) 355 info = self.info.loc[order].reset_index(drop=True) 356 if self.metadata is not None: 357 metadata = self.metadata.loc[order].reset_index(drop=True) 358 else: 359 metadata = None 360 if self.features is not None: 361 features = self.features.loc[order].reset_index(drop=True) 362 else: 363 features = None 364 return EventArray(info, metadata, features) 365 366 def get(self, column_names: int | str | list[int] | list[str]) -> pd.DataFrame: 367 """ 368 Get a DataFrame with the specified columns from the EventArray, by value. 369 :param column_names: the names of the columns to get. 370 :return: a DataFrame with the specified columns. 371 """ 372 if isinstance(column_names, int) or isinstance(column_names, str): 373 column_names = [column_names] 374 columns = [] 375 for column_name in column_names: 376 if column_name in self.info.columns: 377 columns.append(self.info[column_name]) 378 elif self.metadata is not None and column_name in self.metadata.columns: 379 columns.append(self.metadata[column_name]) 380 elif self.features is not None and column_name in self.features.columns: 381 columns.append(self.features[column_name]) 382 else: 383 raise ValueError(f"Column {column_name} not found in EventArray") 384 return pd.concat(columns, axis=1) 385 386 def rows(self, rows) -> typing.Self: 387 """ 388 Get a subset of the EventArray rows based on a boolean or integer index, by value. 389 :param rows: the indices to get as a 1D boolean/integer list/array/series 390 :return: a new EventArray with the subset of events. 391 """ 392 info = self.info.loc[rows].reset_index(drop=True) 393 if self.metadata is not None: 394 metadata = self.metadata.loc[rows].reset_index(drop=True) 395 else: 396 metadata = None 397 if self.features is not None: 398 features = self.features.loc[rows].reset_index(drop=True) 399 else: 400 features = None 401 return EventArray(info, metadata, features) 402 403 def copy(self) -> typing.Self: 404 """ 405 Create a deep copy of the EventArray. 406 :return: a deep copy of the EventArray. 407 """ 408 return EventArray( 409 info=self.info.copy(), 410 metadata=None if self.metadata is None else self.metadata.copy(), 411 features=None if self.features is None else self.features.copy(), 412 ) 413 414 def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None: 415 """ 416 Add metadata to the EventArray. Removes the need to check if metadata is None. 417 Overwrites any existing metadata with the same column names as the new metadata. 418 :param new_metadata: the metadata to add. 419 """ 420 if len(self) != len(new_metadata): 421 raise ValueError("New metadata must match length of existing info") 422 423 if self.metadata is None: 424 self.metadata = new_metadata 425 else: 426 if isinstance(new_metadata, pd.Series): 427 self.metadata[new_metadata.name] = new_metadata 428 else: 429 # It's a DataFrame 430 self.metadata[new_metadata.columns] = new_metadata 431 432 def add_features(self, new_features: pd.Series | pd.DataFrame) -> None: 433 """ 434 Add features to the EventArray. Removes the need to check if features is None. 435 Overwrites any existing features with the same column names as the new features. 436 :param new_features: the features to add. 
437 """ 438 if len(self) != len(new_features): 439 raise ValueError("New features must match length of existing info") 440 441 if self.features is None: 442 self.features = new_features 443 else: 444 if isinstance(new_features, pd.Series): 445 self.features[new_features.name] = new_features 446 else: 447 # It's a DataFrame 448 self.features[new_features.columns] = new_features 449 450 @classmethod 451 def merge(cls, events: list[typing.Self]) -> typing.Self: 452 """ 453 Combine EventArrays in a list into a single EventArray. 454 :param events: the new list of events. 455 """ 456 all_info = [] 457 all_metadata = [] 458 all_features = [] 459 for event_array in events: 460 # Skip empty EventArrays 461 if event_array.info is not None: 462 all_info.append(event_array.info) 463 if event_array.metadata is not None: 464 all_metadata.append(event_array.metadata) 465 if event_array.features is not None: 466 all_features.append(event_array.features) 467 if len(all_info) == 0: 468 return EventArray() 469 else: 470 all_info = pd.concat(all_info, ignore_index=True) 471 if len(all_metadata) == 0: 472 all_metadata = None 473 else: 474 all_metadata = pd.concat(all_metadata, ignore_index=True) 475 if len(all_features) == 0: 476 all_features = None 477 else: 478 all_features = pd.concat(all_features, ignore_index=True) 479 480 return EventArray(all_info, all_metadata, all_features) 481 482 @classmethod 483 def from_events(cls, events: list[Event]) -> typing.Self: 484 """ 485 Set the events in the EventArray to a new list of events. 486 :param events: the new list of events. 487 """ 488 # Return an empty array if we were passed nothing 489 if events is None or len(events) == 0: 490 return EventArray() 491 # Otherwise, grab the info 492 info = pd.DataFrame( 493 { 494 "slide_id": [event.scan.slide_id for event in events], 495 "tile": [event.tile.n for event in events], 496 "roi": [event.tile.n_roi for event in events], 497 "x": [event.x for event in events], 498 "y": [event.y for event in events], 499 "size": [event.size for event in events], 500 } 501 ) 502 metadata_list = [event.metadata for event in events] 503 # Iterate through and ensure that all metadata is the same shape 504 for metadata in metadata_list: 505 if type(metadata) != type(metadata_list[0]): 506 raise ValueError("All metadata must be the same type.") 507 if metadata is not None and metadata.shape != metadata_list[0].shape: 508 raise ValueError("All metadata must be the same shape.") 509 if metadata_list[0] is None: 510 metadata = None 511 else: 512 metadata = pd.DataFrame(metadata_list) 513 features_list = [event.features for event in events] 514 # Iterate through and ensure that all features are the same shape 515 for features in features_list: 516 if type(features) != type(features_list[0]): 517 raise ValueError("All features must be the same type.") 518 if features is not None and features.shape != features_list[0].shape: 519 raise ValueError("All features must be the same shape.") 520 if features_list[0] is None: 521 features = None 522 else: 523 features = pd.DataFrame(features_list) 524 return EventArray(info=info, metadata=metadata, features=features) 525 526 def to_events( 527 self, 528 scans: list[Scan], 529 ignore_missing_scans=True, 530 ignore_metadata=False, 531 ignore_features=False, 532 ) -> list[Event]: 533 """ 534 Get the events in the EventArray as a list of events. 535 :param scans: the scans that the events belong to. Pass an empty list if you 536 don't care about scan metadata. 
        :param ignore_missing_scans: whether to create placeholder scans for events
        without a matching scan.
        :param ignore_metadata: whether to ignore metadata or not
        :param ignore_features: whether to ignore features or not
        :return: the events in the EventArray as a list of Event objects.
        """
        events = []
        for i in range(len(self.info)):
            # Determine the associated scan
            scan = None
            for s in scans:
                if s.slide_id == self.info["slide_id"][i]:
                    scan = s
                    break
            if scan is None:
                if ignore_missing_scans:
                    # Create a placeholder scan if the scan is missing
                    scan = Scan.make_placeholder(
                        self.info["slide_id"][i],
                        self.info["tile"][i],
                        self.info["roi"][i],
                    )
                else:
                    raise ValueError(
                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
                    )
            # Add to the list
            events.append(
                Event(
                    scan,
                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
                    self.info["x"][i],
                    self.info["y"][i],
                    size=self.info["size"][i],
                    metadata=(
                        None
                        if ignore_metadata or self.metadata is None
                        else self.metadata.loc[i]
                    ),
                    features=(
                        None
                        if ignore_features or self.features is None
                        else self.features.loc[i]
                    ),
                )
            )
        return events

    def to_dataframe(self) -> pd.DataFrame:
        """
        Convert all the data in the EventArray to a single DataFrame.
        :return: a DataFrame with all the data in the EventArray.
        """
        # Make a copy of the info DataFrame and prepend "info_" to the column names
        output = self.info.copy()
        output.columns = [f"info_{col}" for col in output.columns]
        # Combine with the metadata and prepend "metadata_" to the column names
        if self.metadata is not None:
            metadata = self.metadata.copy()
            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
            output = pd.concat([output, metadata], axis=1)
        # Combine with the features and prepend "features_" to the column names
        if self.features is not None:
            features = self.features.copy()
            features.columns = [f"features_{col}" for col in features.columns]
            output = pd.concat([output, features], axis=1)
        return output

    @classmethod
    def from_dataframe(cls, df) -> typing.Self:
        """
        From a single, special DataFrame, create an EventArray.
        :return: an EventArray with all the data from the DataFrame.
        """
        # Split the columns into info, metadata, and features and strip the prefix
        info = df[[col for col in df.columns if col.startswith("info_")]].copy()
        info.columns = [col.removeprefix("info_") for col in info.columns]
        if info.size == 0:
            info = None
        metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy()
        metadata.columns = [col.removeprefix("metadata_") for col in metadata.columns]
        if metadata.size == 0:
            metadata = None
        features = df[[col for col in df.columns if col.startswith("features_")]].copy()
        features.columns = [col.removeprefix("features_") for col in features.columns]
        if features.size == 0:
            features = None
        return cls(info=info, metadata=metadata, features=features)

    def save_csv(self, output_path: str) -> bool:
        """
        Save the events to a CSV file, including metadata and features.
        :param output_path: the path to write the CSV file to.
        :return: whether the file exists after saving.
        """
        self.to_dataframe().to_csv(output_path, index=False)
        return os.path.exists(output_path)

    @classmethod
    def load_csv(cls, input_path: str) -> typing.Self:
        """
        Load the events from a CSV file, including metadata and features.
        :param input_path: the path of the CSV file to read.
        :return: an EventArray with the loaded events.
        """
        # Load the CSV file
        df = pd.read_csv(input_path)
        return cls.from_dataframe(df)

    def save_hdf5(self, output_path: str) -> bool:
        """
        Save the events to an HDF5 file, including metadata and features.
        Uses the pandas-provided HDF5 functions for ease, and external compatibility,
        though these files are slightly harder to view in HDFView or similar.
        :param output_path: the path to write the HDF5 file to.
        :return: whether the file exists after saving.
        """
        # Open the output_path as an HDF5 file
        with pd.HDFStore(output_path) as store:
            # Store the dataframes in the HDF5 file
            if self.info is not None:
                store.put("info", self.info, index=False)
            if self.metadata is not None:
                store.put("metadata", self.metadata, index=False)
            if self.features is not None:
                store.put("features", self.features, index=False)
        return os.path.exists(output_path)

    @classmethod
    def load_hdf5(cls, input_path: str) -> typing.Self:
        """
        Load the events from an HDF5 file, including metadata and features.
        :param input_path: the path of the HDF5 file to read.
        :return: an EventArray with the loaded events.
        """
        # Open the input_path as an HDF5 file
        with pd.HDFStore(input_path) as store:
            # Load the dataframes from the HDF5 file
            info = store.get("info") if "info" in store else None
            metadata = store.get("metadata") if "metadata" in store else None
            features = store.get("features") if "features" in store else None
        return cls(info=info, metadata=metadata, features=features)

    @classmethod
    def load_ocular(
        cls,
        input_path: str,
        event_type="cells",
        cell_data_files=(
            "rc-final1.rds",
            "rc-final2.rds",
            "rc-final3.rds",
            "rc-final4.rds",
            "ocular_interesting.rds",
        ),
        others_data_files=(
            "others-final1.rds",
            "others-final2.rds",
            "others-final3.rds",
            "others-final4.rds",
        ),
        atlas_data_files=(
            "ocular_interesting.rds",
            "ocular_not_interesting.rds",
        ),
        drop_common_events=True,
        log=None,
    ) -> typing.Self:
        """
        Load events from OCULAR output files, including metadata and features.
        :param input_path: the OCULAR directory, or a single .rds file, to load from.
        :param event_type: "cells" or "others".
        :param cell_data_files: the files to load for the "cells" event type.
        :param others_data_files: the files to load for the "others" event type.
        :param atlas_data_files: the files that may contain common (atlas) events.
        :param drop_common_events: whether to drop events classified as common cells.
        :param log: an optional logger for progress and warnings.
        :return: an EventArray with the loaded events.
        """
        if pyreadr is None:
            raise ModuleNotFoundError(
                "pyreadr not installed. Install pyreadr directly "
                "or install csi-images with [rds] option to resolve."
            )
        # Check if the input path is a directory or a file
        if os.path.isfile(input_path):
            data_files = [os.path.basename(input_path)]
            input_path = os.path.dirname(input_path)
        elif event_type == "cells":
            data_files = cell_data_files
        elif event_type == "others":
            data_files = others_data_files
        else:
            raise ValueError("Invalid event type.")

        # Load the data from the OCULAR files
        file_data = {}
        for file in data_files:
            file_path = os.path.join(input_path, file)
            if not os.path.isfile(file_path):
                if log is not None:
                    log.warning(f"{file} not found in {input_path}")
                continue
            file_data[file] = pyreadr.read_r(file_path)
            # Get the DataFrame associated with None (pyreadr dict quirk)
            file_data[file] = file_data[file][None]
            if len(file_data[file]) == 0:
                # File gets dropped from the dict
                file_data.pop(file)
                if log is not None:
                    log.warning(f"{file} has no cells")
                continue

            if log is not None:
                log.debug(f"{file} has {len(file_data[file])} cells")

            # Drop common cells if requested and in this file
            if file in atlas_data_files and drop_common_events:
                common_cell_indices = (
                    file_data[file]["catalogue_classification"] == "common_cell"
                )
                if log is not None:
                    log.debug(
                        f"Dropping {int(pd.Series.sum(common_cell_indices))} "
                        f"common cells from {file}"
                    )
                file_data[file] = file_data[file][~common_cell_indices]

            if len(file_data[file]) == 0:
                # File gets dropped from the dict
                file_data.pop(file)
                if log is not None:
                    log.warning(f"{file} has no cells after dropping common cells")
                continue

            # Extract frame_id and cell_id
            # DAPI- events already have frame_id and cell_id outside the rowname
            if event_type == "cells":
                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
                # Get frame_id and cell_id from the rowname column, split in two
                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
                if len(split_res.columns) != 2 and log is not None:
                    log.warning(
                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
                    )
                # Then assign it back to the dataframe
                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
            # Reset indices since they can cause NaN values in concat
            file_data[file] = file_data[file].reset_index(drop=True)

        # Merge the data from all files
        if len(file_data) == 0:
            return EventArray()
        elif len(file_data) == 1:
            data = list(file_data.values())[0]
        else:
            data = pd.concat(file_data.values())

        if log is not None:
            log.debug(f"Gathered a total of {len(data)} events")

        # Others are missing the "slide_id"; insert it right before the "frame_id" column
        if event_type == "others" and "slide_id" not in data.columns:
            if os.path.basename(input_path) == "ocular":
                slide_id = os.path.basename(os.path.dirname(input_path))
            else:
                slide_id = "UNKNOWN"
            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)

        # Sort according to ascending cell_id to keep the original, which is in manual_df
        data = data.sort_values(by=["cell_id"], ascending=True)
        # Filter out duplicates by x & y
        data = data.assign(
            unique_id=data["slide_id"]
            + "_"
            + data["frame_id"].astype(str)
            + "_"
            + data["cellx"].astype(int).astype(str)
            + "_"
            + data["celly"].astype(int).astype(str)
        )
        data = data.drop_duplicates(subset=["unique_id"], keep="first")
        # Normal unique_id is with cell_id
        data = data.assign(
            unique_id=data["slide_id"]
            + "_"
            + data["frame_id"].astype(str)
            + "_"
            + data["cell_id"].astype(str)
        )
        data = data.reset_index(drop=True)
        # All columns up to "slide_id" are features; drop the "slide_id"
        features = data.loc[:, :"slide_id"].iloc[:, :-1]
        data = data.loc[:, "slide_id":]
        # Grab the info columns
        info = data[["slide_id", "frame_id", "cellx", "celly"]]
        info.columns = ["slide_id", "tile", "x", "y"]
        info = info.assign(
            roi=0,  # OCULAR only works on 1 ROI, as far as known
            size=25,  # Static, for later montaging
        )
        info = info[["slide_id", "tile", "roi", "x", "y", "size"]]
        # Metadata has duplicate columns for later convenience
        metadata = data
        # Certain columns tend to be problematic with mixed data formats...
        for col in ["TRITC", "CY5", "FITC"]:
            if col in metadata:
                labels = {
                    "False": False,
                    "True": True,
                    "FALSE": False,
                    "TRUE": True,
                }
                metadata[col] = metadata[col].map(labels).astype(bool)
        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
            if col in metadata:
                metadata[col] = metadata[col].fillna(-1).astype(int)
        return EventArray(info, metadata, features)

    def save_ocular(self, output_path: str, event_type: str = "cells"):
        """
        Save the events to OCULAR files. Relies on the dataframe originating
        from an OCULAR file (same columns; duplicate metadata/info).
        :param output_path: the directory to write the OCULAR files to.
        :param event_type: "cells" or "others".
        :return:
        """
        if pyreadr is None:
            raise ModuleNotFoundError(
                "pyreadr not installed. Install pyreadr directly "
                "or install csi-images with [rds] option to resolve."
            )
        if event_type == "cells":
            file_stub = "rc-final"
        elif event_type == "others":
            file_stub = "others-final"
        else:
            raise ValueError("Invalid event type. Must be cells or others.")

        # Check for the "ocular_interesting" column
        if event_type == "cells":
            if "ocular_interesting" in self.metadata.columns:
                interesting_rows = self.metadata["ocular_interesting"].to_numpy(
                    dtype=bool
                )
            elif "hcpc" in self.metadata.columns:
                # Interesting cells don't get an hcpc designation, leaving them as -1
                interesting_rows = (
                    self.metadata["hcpc"].to_numpy() == -1
                )  # interesting cells
            else:
                interesting_rows = []
            if sum(interesting_rows) > 0:
                # Split the metadata into interesting and regular
                interesting_events = self.rows(interesting_rows)
                interesting_df = pd.concat(
                    [interesting_events.features, interesting_events.metadata], axis=1
                )
                data_events = self.rows(~interesting_rows)
                data_df = pd.concat(
                    [data_events.features, data_events.metadata], axis=1
                )
                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")

                # Drop particular columns for "interesting"
                interesting_df = interesting_df.drop(
                    [
                        "clust",
                        "hcpc",
                        "frame_id",
                        "cell_id",
                        "unique_id",
                        "ocular_interesting",
                    ],
                    axis=1,
                    errors="ignore",
                )
                # Save both .csv and .rds
                interesting_df.to_csv(
                    os.path.join(output_path, "ocular_interesting.csv"), index=False
                )
                pyreadr.write_rds(
                    os.path.join(output_path, "ocular_interesting.rds"), interesting_df
                )
            else:
                data_df = pd.concat([self.features, self.metadata], axis=1)
        else:
            # Get all data and reset_index (will copy it)
            data_df = pd.concat([self.features, self.metadata], axis=1)

        # Split based on cluster number to conform to *-final[1-4].rds
        n_clusters = max(data_df["clust"]) + 1
        split_idx = [round(i * n_clusters / 4) for i in range(5)]
        for i in range(4):
            subset = (split_idx[i] <= data_df["clust"]) & (
                data_df["clust"] < split_idx[i + 1]
            )
            data_df.loc[subset, "hcpc"] = i + 1
            subset = data_df[subset].reset_index(drop=True)
            pyreadr.write_rds(
                os.path.join(output_path, f"{file_stub}{i + 1}.rds"), subset
            )

        # Create new example cell strings
        data_df["example_cell_id"] = (
            data_df["slide_id"]
            + " "
            + data_df["frame_id"].astype(str)
            + " "
            + data_df["cell_id"].astype(str)
            + " "
            + data_df["cellx"].astype(int).astype(str)
            + " "
            + data_df["celly"].astype(int).astype(str)
        )
        # Find averageable data columns
        if "cellcluster_id" in data_df.columns:
            end_idx = data_df.columns.get_loc("cellcluster_id")
        else:
            end_idx = data_df.columns.get_loc("slide_id")
        avg_cols = data_df.columns[:end_idx].tolist()
        # Group by cluster and average
        data_df = data_df.groupby("clust").agg(
            **{col: (col, "mean") for col in avg_cols},
            count=("clust", "size"),  # count rows in each cluster
            example_cells=("example_cell_id", lambda x: ",".join(x)),
            hcpc=("hcpc", lambda x: x.iloc[0]),
        )
        data_df = data_df.reset_index()  # Do NOT drop, index is "clust"
        # Create new columns
        metadata = pd.DataFrame(
            {
                "count": data_df["count"],
                "example_cells": data_df["example_cells"],
                "clust": data_df["clust"].astype(int),
                "hcpc": data_df["hcpc"].astype(int),
                "id": data_df["clust"].astype(int).astype(str),
                "cccluster": "0",  # Dummy value
                "ccdistance": 0.0,  # Dummy value
                "rownum": list(range(len(data_df))),
                "framegroup": 0,  # Dummy value
            }
        )
        data_df = pd.concat([data_df[avg_cols], metadata], axis=1)
        # Save the cluster data
        data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"), index=False)
        pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)
class Event:
A class that represents a single event in a scan, making it easy to evaluate singular events. Required metadata is exposed as attributes, and optional metadata and features are stored as DataFrames.
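Continuing the module example above, a small sketch of how events behave; comparisons use the slide_id-tile-x-y string representation, so size and metadata do not affect equality:

    event = Event(scan, tile, x=512, y=384, size=20)
    print(event)                           # e.g. "EXAMPLE-SLIDE-0-512-384"
    event == Event(scan, tile, 512, 384)   # True: same scan, tile, and position
    sorted([Event(scan, tile, 300, 400), event])  # orders by the string form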
Event(scan: Scan, tile: Tile, x: int, y: int, size: int = 12, metadata: pd.Series = None, features: pd.Series = None)
SCAN_TO_SLIDE_TRANSFORM

Homogeneous transformation matrices for converting between scanner and slide coordinates. The matrices are 3x3, with the final column representing the translation in micrometers (um). For more information, see affine transformations.
Transformations are nominal, and accuracy is not guaranteed; this is due to imperfections in slides and alignment in the scanners. Units are in micrometers.
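As a quick worked check of the BZScanner matrix (the coordinate values here are arbitrary): a homogeneous scan position (x, y, 1) in micrometers is flipped across both axes and translated:

    import numpy as np
    from csi_images.csi_scans import Scan
    from csi_images.csi_events import Event

    transform = Event.SCAN_TO_SLIDE_TRANSFORM[Scan.Type.BZSCANNER]
    scan_xy = np.array([[10000.0], [20000.0], [1.0]])  # (x, y, 1) in um
    slide_xy = transform @ scan_xy
    # x' = -y + 75000 = 55000.0; y' = -x + 25000 = 15000.0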
get_scan_position(self) -> tuple[float, float]
Get the position of the event in the scanner's coordinate frame.
Returns
the scan position of the event in micrometers (um).
get_slide_position(self) -> tuple[float, float]
Get the slide position of the event in micrometers (um).
Returns
the slide position of the event.
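A hedged sketch of both conversions, continuing the example above (a placeholder scan may not carry real tile geometry or a recognized scanner_id, in which case get_slide_position raises ValueError; with a real Scan, the calls look like this):

    x_um, y_um = event.get_scan_position()
    # Per axis: (pixel + tile offset in pixels) * pixel_size_um + ROI origin, e.g.:
    #   x_um = (event.x + scan.tile_width_px * tile.x) * scan.pixel_size_um + origin_x
    slide_x, slide_y = event.get_slide_position()  # applies SCAN_TO_SLIDE_TRANSFORM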
crop_images(self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True) -> list[np.ndarray]
Crop the event out of pre-loaded frame images. This method does not read anything from file, so it is very quick for extracting multiple events from the same tile. Use this if you're interested in many events.
Parameters
- images: the frame images.
- crop_size: the square size of the image crop to get for this event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns
crop_size x crop_size crops of the event in the provided frames. If the event is too close to the edge, the crop will be padded with black and the event will not be centered.
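A minimal sketch with synthetic frame images standing in for real ones (the 1004x1344 shape and four channels are hypothetical), showing the zero-padding behavior near edges:

    import numpy as np
    frames = [np.zeros((1004, 1344), dtype=np.uint16) for _ in range(4)]  # stand-ins
    crops = event.crop_images(frames, crop_size=100)   # four 100x100 arrays
    # Near a corner the crop is still 100x100, zero-padded where it runs off the frame
    corner = Event(scan, tile, 5, 5).crop_images(frames, crop_size=100)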
extract_images(self, crop_size: int = 100, in_pixels: bool = True) -> list[np.ndarray]
Extract the images from the scan and tile, reading from the file. Called "extract" because it must read and extract the images from file, which is slow. Use this if you're interested in only a few events, as it is inefficient when reading multiple events from the same tile.
Parameters
- crop_size: the square size of the image crop to get for this event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns
a list of cropped images from the scan in the order of the channels.
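Assuming the event's scan and tile point at image files that are accessible on disk, usage is a single call; here a 50-micrometer square crop is requested as an example:

    channel_crops = event.extract_images(crop_size=50, in_pixels=False)
    # one crop per channel, in channel order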
@classmethod
extract_images_for_list(cls, events: list[typing.Self], crop_size: int | list[int] = None, in_pixels: bool = True) -> list[list[np.ndarray]]
Get the images for a list of events, ensuring that there is no wasteful reading of the same tile multiple times. This function is more efficient than calling extract_images for each event. TODO: test this function
Parameters
- events: the events to extract images for.
- crop_size: the square size of the image crop to get for each event. Defaults to four times the size of the event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels, and is ignored if crop_size is None.
Returns
a list of lists of cropped images for each event.
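A sketch of batch extraction under the same on-disk assumption (and noting the TODO above, that this function is marked untested); leaving crop_size as None gives each event a crop of four times its own size:

    crops_per_event = Event.extract_images_for_list(events)  # grouped by tile internally
    first_channel = [crops[0] for crops in crops_per_event]  # one crop per event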
class EventArray:

A class that holds a large number of events' data, making it easy to analyze and manipulate many events at once. A more separated version of the Event class.
631 :param input_path: 632 :return: 633 """ 634 # Load the CSV file 635 df = pd.read_csv(input_path) 636 return cls.from_dataframe(df) 637 638 def save_hdf5(self, output_path: str) -> bool: 639 """ 640 Save the events to an HDF5 file, including metadata and features. 641 Uses the pandas-provided HDF5 functions for ease, and external compatibility, 642 though these files are slightly harder to view in HDFView or similar. 643 :param output_path: 644 :return: 645 """ 646 # Open the output_path as an HDF5 file 647 with pd.HDFStore(output_path) as store: 648 # Store the dataframes in the HDF5 file 649 if self.info is not None: 650 store.put("info", self.info, index=False) 651 if self.metadata is not None: 652 store.put("metadata", self.metadata, index=False) 653 if self.features is not None: 654 store.put("features", self.features, index=False) 655 return os.path.exists(output_path) 656 657 @classmethod 658 def load_hdf5(cls, input_path: str) -> typing.Self: 659 """ 660 Load the events from an HDF5 file, including metadata and features. 661 :param input_path: 662 :return: 663 """ 664 # Open the input_path as an HDF5 file 665 with pd.HDFStore(input_path) as store: 666 # Load the dataframes from the HDF5 file 667 info = store.get("info") if "info" in store else None 668 metadata = store.get("metadata") if "metadata" in store else None 669 features = store.get("features") if "features" in store else None 670 return cls(info=info, metadata=metadata, features=features) 671 672 @classmethod 673 def load_ocular( 674 cls, 675 input_path: str, 676 event_type="cells", 677 cell_data_files=( 678 "rc-final1.rds", 679 "rc-final2.rds", 680 "rc-final3.rds", 681 "rc-final4.rds", 682 "ocular_interesting.rds", 683 ), 684 others_data_files=( 685 "others-final1.rds", 686 "others-final2.rds", 687 "others-final3.rds", 688 "others-final4.rds", 689 ), 690 atlas_data_files=( 691 "ocular_interesting.rds", 692 "ocular_not_interesting.rds", 693 ), 694 drop_common_events=True, 695 log=None, 696 ) -> typing.Self: 697 """ 698 699 :param input_path: 700 :param event_type: 701 :param cell_data_files: 702 :param others_data_files: 703 :param atlas_data_files: 704 :param drop_common_events: 705 :param log: 706 :return: 707 """ 708 if pyreadr is None: 709 raise ModuleNotFoundError( 710 "pyreadr not installed. Install pyreadr directly " 711 "or install csi-images with [rds] option to resolve." 
712 ) 713 # Check if the input path is a directory or a file 714 if os.path.isfile(input_path): 715 data_files = [os.path.basename(input_path)] 716 input_path = os.path.dirname(input_path) 717 if event_type == "cells": 718 data_files = cell_data_files 719 elif event_type == "others": 720 data_files = others_data_files 721 else: 722 raise ValueError("Invalid event type.") 723 724 # Load the data from the OCULAR files 725 file_data = {} 726 for file in data_files: 727 file_path = os.path.join(input_path, file) 728 if not os.path.isfile(file_path): 729 if log is not None: 730 log.warning(f"{file} not found in {input_path}") 731 continue 732 file_data[file] = pyreadr.read_r(file_path) 733 # Get the DataFrame associated with None (pyreadr dict quirk) 734 file_data[file] = file_data[file][None] 735 if len(file_data[file]) == 0: 736 # File gets dropped from the dict 737 file_data.pop(file) 738 if log is not None: 739 log.warning(f"{file} has no cells") 740 continue 741 742 if log is not None: 743 log.debug(f"{file} has {len(file_data[file])} cells") 744 745 # Drop common cells if requested and in this file 746 if file in atlas_data_files and drop_common_events: 747 common_cell_indices = ( 748 file_data[file]["catalogue_classification"] == "common_cell" 749 ) 750 if log is not None: 751 log.debug( 752 f"Dropping {int(common_cell_indices.sum())} " 753 f"common cells from {file}" 754 ) 755 file_data[file] = file_data[file][~common_cell_indices] 756 757 if len(file_data[file]) == 0: 758 # File gets dropped from the dict 759 file_data.pop(file) 760 if log is not None: 761 log.warning(f"{file} has no cells after dropping common cells") 762 continue 763 764 # Extract frame_id and cell_id 765 # DAPI- events already have frame_id cell_id outside rowname 766 if event_type == "cells": 767 file_data[file]["rowname"] = file_data[file]["rowname"].astype("str") 768 # get frame_id cell_id from rownames column and split into two columns 769 split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True) 770 if len(split_res.columns) != 2 and log is not None: 771 log.warning( 772 f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}' 773 ) 774 # then assign it back to the dataframe 775 file_data[file][["frame_id", "cell_id"]] = split_res.astype("int") 776 # reset indexes since they can cause NaN values in concat 777 file_data[file] = file_data[file].reset_index(drop=True) 778 779 # Merge the data from all files 780 if len(file_data) == 0: 781 return EventArray() 782 elif len(file_data) == 1: 783 data = [file_data[file] for file in file_data.keys()][0] 784 else: 785 data = pd.concat(file_data.values()) 786 787 if log is not None: 788 log.debug(f"Gathered a total of {len(data)} events") 789 790 # Others is missing the "slide_id".
Insert it right before "frame_id" column 791 if event_type == "others" and "slide_id" not in data.columns: 792 if os.path.basename(input_path) == "ocular": 793 slide_id = os.path.basename(os.path.dirname(input_path)) 794 else: 795 slide_id = "UNKNOWN" 796 data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id) 797 798 # Sort according to ascending cell_id to keep the original, which is in manual_df 799 data = data.sort_values(by=["cell_id"], ascending=True) 800 # Filter out duplicates by x & y 801 data = data.assign( 802 unique_id=data["slide_id"] 803 + "_" 804 + data["frame_id"].astype(str) 805 + "_" 806 + data["cellx"].astype(int).astype(str) 807 + "_" 808 + data["celly"].astype(int).astype(str) 809 ) 810 data = data.drop_duplicates(subset=["unique_id"], keep="first") 811 # Normal unique_id is with cell_id 812 data = data.assign( 813 unique_id=data["slide_id"] 814 + "_" 815 + data["frame_id"].astype(str) 816 + "_" 817 + data["cell_id"].astype(str) 818 ) 819 data = data.reset_index(drop=True) 820 # All columns up to "slide_id" are features; drop the "slide_id" 821 features = data.loc[:, :"slide_id"].iloc[:, :-1] 822 data = data.loc[:, "slide_id":] 823 # Grab the info columns 824 info = data[["slide_id", "frame_id", "cellx", "celly"]] 825 info.columns = ["slide_id", "tile", "x", "y"] 826 info = info.assign( 827 roi=0, # OCULAR only works on 1 ROI, as far as known 828 size=25, # Static, for later montaging 829 ) 830 info = info[["slide_id", "tile", "roi", "x", "y", "size"]] 831 # Metadata has duplicate columns for later convenience 832 metadata = data 833 # Certain columns tend to be problematic with mixed data formats... 834 for col in ["TRITC", "CY5", "FITC"]: 835 if col in metadata: 836 labels = { 837 "False": False, 838 "True": True, 839 "FALSE": False, 840 "TRUE": True, 841 } 842 metadata[col] = metadata[col].map(labels).astype(bool) 843 for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]: 844 if col in metadata: 845 metadata[col] = metadata[col].fillna(-1).astype(int) 846 return EventArray(info, metadata, features) 847 848 def save_ocular(self, output_path: str, event_type: str = "cells"): 849 """ 850 Save the events to an OCULAR file. Relies on the dataframe originating 851 from an OCULAR file (same columns; duplicate metadata/info). 852 :param output_path: 853 :param event_type: 854 :return: 855 """ 856 if event_type == "cells": 857 file_stub = "rc-final" 858 elif event_type == "others": 859 file_stub = "others-final" 860 else: 861 raise ValueError("Invalid event type. 
Must be cells or others.") 862 863 # Check for the "ocular_interesting" column 864 if event_type == "cells": 865 if "ocular_interesting" in self.metadata.columns: 866 interesting_rows = self.metadata["ocular_interesting"].to_numpy( 867 dtype=bool 868 ) 869 elif "hcpc" in self.metadata.columns: 870 # Interesting cells don't get an hcpc designation, leaving them as -1 871 interesting_rows = ( 872 self.metadata["hcpc"].to_numpy() == -1 873 ) # interesting cells 874 else: 875 interesting_rows = [] 876 if sum(interesting_rows) > 0: 877 # Split the metadata into interesting and regular 878 interesting_events = self.rows(interesting_rows) 879 interesting_df = pd.concat( 880 [interesting_events.features, interesting_events.metadata], axis=1 881 ) 882 data_events = self.rows(~interesting_rows) 883 data_df = pd.concat( 884 [data_events.features, data_events.metadata], axis=1 885 ) 886 data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore") 887 888 # Drop particular columns for "interesting" 889 interesting_df = interesting_df.drop( 890 [ 891 "clust", 892 "hcpc", 893 "frame_id", 894 "cell_id", 895 "unique_id", 896 "ocular_interesting", 897 ], 898 axis=1, 899 errors="ignore", 900 ) 901 # Save both .csv and .rds 902 interesting_df.to_csv( 903 os.path.join(output_path, "ocular_interesting.csv"), index=False 904 ) 905 pyreadr.write_rds( 906 os.path.join(output_path, "ocular_interesting.rds"), interesting_df 907 ) 908 else: 909 data_df = pd.concat([self.features, self.metadata], axis=1) 910 else: 911 # Get all data and reset_index (will copy it) 912 data_df = pd.concat([self.features, self.metadata], axis=1) 913 914 # Split based on cluster number to conform to *-final[1-4].rds 915 n_clusters = max(data_df["clust"]) + 1 916 split_idx = [round(i * n_clusters / 4) for i in range(5)] 917 for i in range(4): 918 subset = (split_idx[i] <= data_df["clust"]) & ( 919 data_df["clust"] < split_idx[i + 1] 920 ) 921 data_df.loc[subset, "hcpc"] = i + 1 922 subset = data_df[subset].reset_index(drop=True) 923 pyreadr.write_rds( 924 os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset 925 ) 926 927 # Create new example cell strings 928 data_df["example_cell_id"] = ( 929 data_df["slide_id"] 930 + " " 931 + data_df["frame_id"].astype(str) 932 + " " 933 + data_df["cell_id"].astype(str) 934 + " " 935 + data_df["cellx"].astype(int).astype(str) 936 + " " 937 + data_df["celly"].astype(int).astype(str) 938 ) 939 # Find averagable data columns 940 if "cellcluster_id" in data_df.columns: 941 end_idx = data_df.columns.get_loc("cellcluster_id") 942 else: 943 end_idx = data_df.columns.get_loc("slide_id") 944 avg_cols = data_df.columns[:end_idx].tolist() 945 # Group by cluster and average 946 data_df = data_df.groupby("clust").agg( 947 **{col: (col, "mean") for col in avg_cols}, 948 count=("clust", "size"), # count rows in each cluster 949 example_cells=("example_cell_id", lambda x: ",".join(x)), 950 hcpc=("hcpc", lambda x: x.iloc[0]), 951 ) 952 data_df = data_df.reset_index() # Do NOT drop, index is "clust" 953 # Create new columns 954 metadata = pd.DataFrame( 955 { 956 "count": data_df["count"], 957 "example_cells": data_df["example_cells"], 958 "clust": data_df["clust"].astype(int), 959 "hcpc": data_df["hcpc"].astype(int), 960 "id": data_df["clust"].astype(int).astype(str), 961 "cccluster": "0", # Dummy value 962 "ccdistance": 0.0, # Dummy value 963 "rownum": list(range(len(data_df))), 964 "framegroup": 0, # Dummy value 965 } 966 ) 967 data_df = pd.concat([data_df[avg_cols], metadata], axis=1) 968 # Save the 
cluster data 969 data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"), index=False) 970 pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)
A class that holds a large number of events' data, making it easy to analyze and manipulate many events at once. Rather than a list of Event objects, an EventArray keeps the same information split across three aligned DataFrames: info, metadata, and features.
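For illustration, here is a minimal sketch of constructing an EventArray directly; the slide ID and coordinate values are made up, and in practice the info DataFrame usually comes from from_events() or one of the load_* methods.

import pandas as pd

from csi_images.csi_events import EventArray

# Hypothetical events; all six info columns are required
info = pd.DataFrame(
    {
        "slide_id": ["SLIDE001", "SLIDE001"],
        "tile": [4, 7],
        "roi": [0, 0],
        "x": [120, 512],
        "y": [88, 300],
        "size": [12, 12],
    }
)
events = EventArray(info=info)
print(len(events))  # 2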
259 def __init__( 260 self, 261 info: pd.DataFrame = None, 262 metadata: pd.DataFrame = None, 263 features: pd.DataFrame = None, 264 ): 265 # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y", "size" 266 if info is not None and ( 267 not all( 268 col in info.columns 269 for col in ["slide_id", "tile", "roi", "x", "y", "size"] 270 ) 271 or len(info.columns) != 6 272 ): 273 raise ValueError( 274 "EventArray.info must have columns 'slide_id', 'tile', 'roi', 'x', 'y', 'size'" 275 ) 276 # All DataFrames must have the same number of rows 277 if metadata is not None and (info is None or len(info) != len(metadata)): 278 raise ValueError( 279 "If EventArray.metadata is not None, it should match rows with .info" 280 ) 281 if features is not None and (info is None or len(info) != len(features)): 282 raise ValueError( 283 "If EventArray.features is not None, it should match rows with .info" 284 ) 285 self.info = info 286 self.metadata = metadata 287 self.features = features
336 def get_sort_order(self, by: str | list[str], ascending: bool | list[bool] = True): 337 """ 338 Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames. 339 :param by: name of the column(s) to sort by. 340 :param ascending: whether to sort in ascending order; can be a list to match by 341 :return: the order of the indices to sort by. 342 """ 343 columns = self.get(by) 344 return columns.sort_values(by=by, ascending=ascending).index
Get the sort order for the EventArray by column(s) in the info, metadata, or features DataFrames.
Parameters
- by: name of the column(s) to sort by.
- ascending: whether to sort in ascending order; may be a list of booleans matching `by`.
Returns
the row indices of the EventArray, in sorted order.
346 def sort( 347 self, by: str | list[str], ascending: bool | list[bool] = True 348 ) -> typing.Self: 349 """ 350 Sort the EventArray by column(s) in the info, metadata, or features DataFrames. 351 :param by: name of the column(s) to sort by. 352 :param ascending: whether to sort in ascending order; can be a list to match by 353 :return: a new, sorted EventArray. 354 """ 355 order = self.get_sort_order(by, ascending) 356 info = self.info.loc[order].reset_index(drop=True) 357 if self.metadata is not None: 358 metadata = self.metadata.loc[order].reset_index(drop=True) 359 else: 360 metadata = None 361 if self.features is not None: 362 features = self.features.loc[order].reset_index(drop=True) 363 else: 364 features = None 365 return EventArray(info, metadata, features)
Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
Parameters
- by: name of the column(s) to sort by.
- ascending: whether to sort in ascending order; may be a list of booleans matching `by`.
Returns
a new, sorted EventArray; the original is left unchanged.
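A usage sketch, assuming a hypothetical features column named "cell_area" has been added:

# Row order without materializing a sorted copy
order = events.get_sort_order("cell_area", ascending=False)
# Or build a new, sorted EventArray directly, mixing info and feature columns
by_area = events.sort(["slide_id", "cell_area"], ascending=[True, False])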
367 def get(self, column_names: int | str | list[int] | list[str]) -> pd.DataFrame: 368 """ 369 Get a DataFrame with the specified columns from the EventArray, by value. 370 :param column_names: the names of the columns to get. 371 :return: a DataFrame with the specified columns. 372 """ 373 if isinstance(column_names, int) or isinstance(column_names, str): 374 column_names = [column_names] 375 columns = [] 376 for column_name in column_names: 377 if column_name in self.info.columns: 378 columns.append(self.info[column_name]) 379 elif self.metadata is not None and column_name in self.metadata.columns: 380 columns.append(self.metadata[column_name]) 381 elif self.features is not None and column_name in self.features.columns: 382 columns.append(self.features[column_name]) 383 else: 384 raise ValueError(f"Column {column_name} not found in EventArray") 385 return pd.concat(columns, axis=1)
Get a DataFrame with the specified columns from the EventArray. Columns may come from the info, metadata, or features DataFrames; the result is a copy (returned by value).
Parameters
- column_names: the names of the columns to get.
Returns
a DataFrame with the specified columns.
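For example, columns can be pulled from any of the three DataFrames in one call; "dapi_mean" here is a hypothetical feature column:

coords = events.get(["x", "y"])                # resolved from .info
mixed = events.get(["slide_id", "dapi_mean"])  # info + features in one DataFrame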
387 def rows(self, rows) -> typing.Self: 388 """ 389 Get a subset of the EventArray rows based on a boolean or integer index, by value. 390 :param rows: the indices to get as a 1D boolean/integer list/array/series 391 :return: a new EventArray with the subset of events. 392 """ 393 info = self.info.loc[rows].reset_index(drop=True) 394 if self.metadata is not None: 395 metadata = self.metadata.loc[rows].reset_index(drop=True) 396 else: 397 metadata = None 398 if self.features is not None: 399 features = self.features.loc[rows].reset_index(drop=True) 400 else: 401 features = None 402 return EventArray(info, metadata, features)
Get a subset of the EventArray's rows based on a boolean or integer index. The result is a copy (returned by value).
Parameters
- rows: the rows to select, as a 1D boolean mask or integer indices (list, array, or Series).
Returns
a new EventArray with the subset of events.
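A short sketch of both indexing modes:

# Boolean mask: keep events with size above 10 px (hypothetical criterion)
large = events.rows(events.get("size")["size"] > 10)
# Integer indices: take the first three events
head = events.rows([0, 1, 2])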
404 def copy(self) -> typing.Self: 405 """ 406 Create a deep copy of the EventArray. 407 :return: a deep copy of the EventArray. 408 """ 409 return EventArray( 410 info=self.info.copy(), 411 metadata=None if self.metadata is None else self.metadata.copy(), 412 features=None if self.features is None else self.features.copy(), 413 )
Create a deep copy of the EventArray.
Returns
a deep copy of the EventArray.
415 def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None: 416 """ 417 Add metadata to the EventArray. Removes the need to check if metadata is None. 418 Overwrites any existing metadata with the same column names as the new metadata. 419 :param new_metadata: the metadata to add. 420 """ 421 if len(self) != len(new_metadata): 422 raise ValueError("New metadata must match length of existing info") 423 424 if self.metadata is None: 425 self.metadata = new_metadata 426 else: 427 if isinstance(new_metadata, pd.Series): 428 self.metadata[new_metadata.name] = new_metadata 429 else: 430 # It's a DataFrame 431 self.metadata[new_metadata.columns] = new_metadata
Add metadata to the EventArray without needing to check whether .metadata is None first. Overwrites any existing metadata columns that share names with the new metadata.
Parameters
- new_metadata: the metadata to add.
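A sketch of attaching a hypothetical per-event label; the length must match the EventArray:

import pandas as pd

# "label" is a hypothetical metadata column
labels = pd.DataFrame({"label": ["candidate"] * len(events)})
events.add_metadata(labels)  # creates .metadata if it was None

Passing a one-column DataFrame rather than a bare Series keeps .metadata a DataFrame even when it starts out as None.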
433 def add_features(self, new_features: pd.Series | pd.DataFrame) -> None: 434 """ 435 Add features to the EventArray. Removes the need to check if features is None. 436 Overwrites any existing features with the same column names as the new features. 437 :param new_features: the features to add. 438 """ 439 if len(self) != len(new_features): 440 raise ValueError("New features must match length of existing info") 441 442 if self.features is None: 443 self.features = new_features 444 else: 445 if isinstance(new_features, pd.Series): 446 self.features[new_features.name] = new_features 447 else: 448 # It's a DataFrame 449 self.features[new_features.columns] = new_features
Add features to the EventArray without needing to check whether .features is None first. Overwrites any existing feature columns that share names with the new features.
Parameters
- new_features: the features to add.
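Features follow the same pattern; a sketch with hypothetical intensity columns:

import numpy as np
import pandas as pd

intensities = pd.DataFrame(
    {
        "dapi_mean": np.zeros(len(events)),
        "cy5_mean": np.zeros(len(events)),
    }
)
events.add_features(intensities)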
451 @classmethod 452 def merge(cls, events: list[typing.Self]) -> typing.Self: 453 """ 454 Combine EventArrays in a list into a single EventArray. 455 :param events: the list of EventArrays to combine. 456 """ 457 all_info = [] 458 all_metadata = [] 459 all_features = [] 460 for event_array in events: 461 # Skip empty EventArrays 462 if event_array.info is not None: 463 all_info.append(event_array.info) 464 if event_array.metadata is not None: 465 all_metadata.append(event_array.metadata) 466 if event_array.features is not None: 467 all_features.append(event_array.features) 468 if len(all_info) == 0: 469 return EventArray() 470 else: 471 all_info = pd.concat(all_info, ignore_index=True) 472 if len(all_metadata) == 0: 473 all_metadata = None 474 else: 475 all_metadata = pd.concat(all_metadata, ignore_index=True) 476 if len(all_features) == 0: 477 all_features = None 478 else: 479 all_features = pd.concat(all_features, ignore_index=True) 480 481 return EventArray(all_info, all_metadata, all_features)
Combine a list of EventArrays into a single EventArray.
Parameters
- events: the list of EventArrays to combine; empty arrays are skipped.
Returns
a single EventArray containing all of the events.
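A sketch, assuming events_a and events_b are existing EventArrays:

combined = EventArray.merge([events_a, events_b])
assert len(combined) == len(events_a) + len(events_b)

Note that pd.concat aligns columns by name, so arrays whose metadata or features columns differ will yield NaN-filled unions of those columns.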
483 @classmethod 484 def from_events(cls, events: list[Event]) -> typing.Self: 485 """ 486 Create an EventArray from a list of Event objects. 487 :param events: the list of Event objects to convert. 488 """ 489 # Return an empty array if we were passed nothing 490 if events is None or len(events) == 0: 491 return EventArray() 492 # Otherwise, grab the info 493 info = pd.DataFrame( 494 { 495 "slide_id": [event.scan.slide_id for event in events], 496 "tile": [event.tile.n for event in events], 497 "roi": [event.tile.n_roi for event in events], 498 "x": [event.x for event in events], 499 "y": [event.y for event in events], 500 "size": [event.size for event in events], 501 } 502 ) 503 metadata_list = [event.metadata for event in events] 504 # Iterate through and ensure that all metadata is the same shape 505 for metadata in metadata_list: 506 if type(metadata) != type(metadata_list[0]): 507 raise ValueError("All metadata must be the same type.") 508 if metadata is not None and metadata.shape != metadata_list[0].shape: 509 raise ValueError("All metadata must be the same shape.") 510 if metadata_list[0] is None: 511 metadata = None 512 else: 513 metadata = pd.DataFrame(metadata_list) 514 features_list = [event.features for event in events] 515 # Iterate through and ensure that all features are the same shape 516 for features in features_list: 517 if type(features) != type(features_list[0]): 518 raise ValueError("All features must be the same type.") 519 if features is not None and features.shape != features_list[0].shape: 520 raise ValueError("All features must be the same shape.") 521 if features_list[0] is None: 522 features = None 523 else: 524 features = pd.DataFrame(features_list) 525 return EventArray(info=info, metadata=metadata, features=features)
Create an EventArray from a list of Event objects. All events must have metadata and features of matching type and shape.
Parameters
- events: the list of Event objects to convert.
Returns
a new EventArray with the events' data.
527 def to_events( 528 self, 529 scans: list[Scan], 530 ignore_missing_scans=True, 531 ignore_metadata=False, 532 ignore_features=False, 533 ) -> list[Event]: 534 """ 535 Get the events in the EventArray as a list of Event objects. 536 :param scans: the scans that the events belong to. Pass an empty list if you 537 don't care about scan metadata. 538 :param ignore_missing_scans: whether to substitute placeholder scans for missing scans. 539 :param ignore_metadata: whether to skip attaching metadata to each event. 540 :param ignore_features: whether to skip attaching features to each event. 541 :return: a list of Event objects. 542 """ 543 events = [] 544 for i in range(len(self.info)): 545 # Determine the associated scan 546 scan = None 547 for s in scans: 548 if s.slide_id == self.info["slide_id"][i]: 549 scan = s 550 break 551 if scan is None: 552 if ignore_missing_scans: 553 # Create a placeholder scan if the scan is missing 554 scan = Scan.make_placeholder( 555 self.info["slide_id"][i], 556 self.info["tile"][i], 557 self.info["roi"][i], 558 ) 559 else: 560 raise ValueError( 561 f"Scan {self.info['slide_id'][i]} not found for event {i}." 562 ) 563 # Add to the list 564 events.append( 565 Event( 566 scan, 567 Tile(scan, self.info["tile"][i], self.info["roi"][i]), 568 self.info["x"][i], 569 self.info["y"][i], 570 size=self.info["size"][i], 571 metadata=None if ignore_metadata else self.metadata.loc[i], 572 features=None if ignore_features else self.features.loc[i], 573 ) 574 ) 575 return events
Get the events in the EventArray as a list of events.
Parameters
- scans: the scans that the events belong to. Pass an empty list if you don't care about scan metadata.
- ignore_missing_scans: whether to substitute placeholder scans for events whose scan is not in the list; if False, a missing scan raises a ValueError.
- ignore_metadata: whether to skip attaching metadata to each Event; must be True if the EventArray has no metadata.
- ignore_features: whether to skip attaching features to each Event; must be True if the EventArray has no features.
Returns
a list of Event objects.
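A round-trip sketch; with an empty scans list and ignore_missing_scans=True, placeholder scans are created for every event. The ignore flags are set here because this hypothetical array holds no metadata or features:

event_list = events.to_events([], ignore_metadata=True, ignore_features=True)
restored = EventArray.from_events(event_list)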
577 def to_dataframe(self) -> pd.DataFrame: 578 """ 579 Convert all the data in the EventArray to a single DataFrame. 580 :return: a DataFrame with all the data in the EventArray. 581 """ 582 # Make a copy of the info DataFrame and prepend "info_" to the column names 583 output = self.info.copy() 584 output.columns = [f"info_{col}" for col in output.columns] 585 # Combine with the metadata and prepend "metadata_" to the column names 586 if self.metadata is not None: 587 metadata = self.metadata.copy() 588 metadata.columns = [f"metadata_{col}" for col in metadata.columns] 589 output = pd.concat([output, metadata], axis=1) 590 # Combine with the features and prepend "features_" to the column names 591 if self.features is not None: 592 features = self.features.copy() 593 features.columns = [f"features_{col}" for col in features.columns] 594 output = pd.concat([output, features], axis=1) 595 return output
Convert all the data in the EventArray to a single DataFrame.
Returns
a DataFrame with all the data in the EventArray.
597 @classmethod 598 def from_dataframe(cls, df) -> typing.Self: 599 """ 600 From a single, special DataFrame, create an EventArray. 601 :return: a DataFrame with all the data in the EventArray. 602 """ 603 # Split the columns into info, metadata, and features and strip prefix 604 info = df[[col for col in df.columns if col.startswith("info_")]].copy() 605 info.columns = [col.replace("info_", "") for col in info.columns] 606 if info.size == 0: 607 info = None 608 metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy() 609 metadata.columns = [col.replace("metadata_", "") for col in metadata.columns] 610 if metadata.size == 0: 611 metadata = None 612 features = df[[col for col in df.columns if col.startswith("features_")]].copy() 613 features.columns = [col.replace("features_", "") for col in features.columns] 614 if features.size == 0: 615 features = None 616 return cls(info=info, metadata=metadata, features=features)
Create an EventArray from a single DataFrame with "info_", "metadata_", and "features_" column prefixes, as produced by to_dataframe().
Returns
an EventArray containing the DataFrame's data.
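The two methods are inverses, so a round trip should compare equal:

df = events.to_dataframe()            # columns like "info_x", "metadata_label"
same = EventArray.from_dataframe(df)  # prefixes are stripped on the way back
assert same == events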
618 def save_csv(self, output_path: str) -> bool: 619 """ 620 Save the events to a CSV file, including metadata and features. 621 :param output_path: the path of the CSV file to write. 622 :return: True if the file exists after writing. 623 """ 624 self.to_dataframe().to_csv(output_path, index=False) 625 return os.path.exists(output_path)
Save the events to a CSV file, including metadata and features.
Parameters
- output_path: the path of the CSV file to write.
Returns
True if the file exists after writing, False otherwise.
627 @classmethod 628 def load_csv(cls, input_path: str) -> typing.Self: 629 """ 630 Load the events from a CSV file, including metadata and features. 631 :param input_path: the path of the CSV file to read. 632 :return: an EventArray with the loaded events. 633 """ 634 # Load the CSV file 635 df = pd.read_csv(input_path) 636 return cls.from_dataframe(df)
Load events from a CSV file, including metadata and features.
Parameters
- input_path: the path of the CSV file to read.
Returns
an EventArray with the loaded events.
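A save/load round trip:

assert events.save_csv("events.csv")
loaded = EventArray.load_csv("events.csv")

Note that pandas re-infers dtypes when reading CSV, so numeric and string columns generally round-trip cleanly while more exotic dtypes may come back differently.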
638 def save_hdf5(self, output_path: str) -> bool: 639 """ 640 Save the events to an HDF5 file, including metadata and features. 641 Uses the pandas-provided HDF5 functions for convenience and external compatibility, 642 though these files are slightly harder to view in HDFView or similar. 643 :param output_path: the path of the HDF5 file to write. 644 :return: True if the file exists after writing. 645 """ 646 # Open the output_path as an HDF5 file 647 with pd.HDFStore(output_path) as store: 648 # Store the dataframes in the HDF5 file 649 if self.info is not None: 650 store.put("info", self.info, index=False) 651 if self.metadata is not None: 652 store.put("metadata", self.metadata, index=False) 653 if self.features is not None: 654 store.put("features", self.features, index=False) 655 return os.path.exists(output_path)
Save the events to an HDF5 file, including metadata and features. Uses the pandas-provided HDF5 functions for convenience and external compatibility, though the resulting files are slightly harder to view in HDFView or similar tools.
Parameters
- output_path: the path of the HDF5 file to write.
Returns
True if the file exists after writing, False otherwise.
657 @classmethod 658 def load_hdf5(cls, input_path: str) -> typing.Self: 659 """ 660 Load the events from an HDF5 file, including metadata and features. 661 :param input_path: the path of the HDF5 file to read. 662 :return: an EventArray with the loaded events. 663 """ 664 # Open the input_path as an HDF5 file 665 with pd.HDFStore(input_path) as store: 666 # Load the dataframes from the HDF5 file 667 info = store.get("info") if "info" in store else None 668 metadata = store.get("metadata") if "metadata" in store else None 669 features = store.get("features") if "features" in store else None 670 return cls(info=info, metadata=metadata, features=features)
Load the events from an HDF5 file, including metadata and features.
Parameters
- input_path: the path of the HDF5 file to read.
Returns
an EventArray with the loaded events.
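The HDF5 round trip mirrors the CSV one; pandas' HDF5 support requires the optional "tables" (PyTables) dependency:

assert events.save_hdf5("events.h5")
loaded = EventArray.load_hdf5("events.h5")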
672 @classmethod 673 def load_ocular( 674 cls, 675 input_path: str, 676 event_type="cells", 677 cell_data_files=( 678 "rc-final1.rds", 679 "rc-final2.rds", 680 "rc-final3.rds", 681 "rc-final4.rds", 682 "ocular_interesting.rds", 683 ), 684 others_data_files=( 685 "others-final1.rds", 686 "others-final2.rds", 687 "others-final3.rds", 688 "others-final4.rds", 689 ), 690 atlas_data_files=( 691 "ocular_interesting.rds", 692 "ocular_not_interesting.rds", 693 ), 694 drop_common_events=True, 695 log=None, 696 ) -> typing.Self: 697 """ 698 Load events from OCULAR output files (.rds), including metadata and features. 699 :param input_path: path to an OCULAR output directory or a single .rds file. 700 :param event_type: the kind of events to load; either "cells" or "others". 701 :param cell_data_files: file names to load when event_type is "cells". 702 :param others_data_files: file names to load when event_type is "others". 703 :param atlas_data_files: file names that may contain atlas-classified common events. 704 :param drop_common_events: whether to drop events classified as "common_cell". 705 :param log: optional logger for progress and warning messages. 706 :return: an EventArray with the loaded events. 707 """ 708 if pyreadr is None: 709 raise ModuleNotFoundError( 710 "pyreadr not installed. Install pyreadr directly " 711 "or install csi-images with [rds] option to resolve." 712 ) 713 # Check if the input path is a directory or a file 714 if os.path.isfile(input_path): 715 data_files = [os.path.basename(input_path)] 716 input_path = os.path.dirname(input_path) 717 if event_type == "cells": 718 data_files = cell_data_files 719 elif event_type == "others": 720 data_files = others_data_files 721 else: 722 raise ValueError("Invalid event type.") 723 724 # Load the data from the OCULAR files 725 file_data = {} 726 for file in data_files: 727 file_path = os.path.join(input_path, file) 728 if not os.path.isfile(file_path): 729 if log is not None: 730 log.warning(f"{file} not found in {input_path}") 731 continue 732 file_data[file] = pyreadr.read_r(file_path) 733 # Get the DataFrame associated with None (pyreadr dict quirk) 734 file_data[file] = file_data[file][None] 735 if len(file_data[file]) == 0: 736 # File gets dropped from the dict 737 file_data.pop(file) 738 if log is not None: 739 log.warning(f"{file} has no cells") 740 continue 741 742 if log is not None: 743 log.debug(f"{file} has {len(file_data[file])} cells") 744 745 # Drop common cells if requested and in this file 746 if file in atlas_data_files and drop_common_events: 747 common_cell_indices = ( 748 file_data[file]["catalogue_classification"] == "common_cell" 749 ) 750 if log is not None: 751 log.debug( 752 f"Dropping {int(common_cell_indices.sum())} " 753 f"common cells from {file}" 754 ) 755 file_data[file] = file_data[file][~common_cell_indices] 756 757 if len(file_data[file]) == 0: 758 # File gets dropped from the dict 759 file_data.pop(file) 760 if log is not None: 761 log.warning(f"{file} has no cells after dropping common cells") 762 continue 763 764 # Extract frame_id and cell_id 765 # DAPI- events already have frame_id cell_id outside rowname 766 if event_type == "cells": 767 file_data[file]["rowname"] = file_data[file]["rowname"].astype("str") 768 # get frame_id cell_id from rownames column and split into two columns 769 split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True) 770 if len(split_res.columns) != 2 and log is not None: 771 log.warning( 772 f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}' 773 ) 774 # then assign it back to the dataframe 775 file_data[file][["frame_id", "cell_id"]] = split_res.astype("int") 776 # reset indexes since they can cause NaN values in concat 777 file_data[file] = file_data[file].reset_index(drop=True) 778 779 # Merge the data from all files 780 if len(file_data) == 0: 781 return EventArray() 782 elif len(file_data) == 1: 783 data = [file_data[file] for file in file_data.keys()][0] 784 else: 785 data = pd.concat(file_data.values()) 786
787 if log is not None: 788 log.debug(f"Gathered a total of {len(data)} events") 789 790 # Others is missing the "slide_id". Insert it right before "frame_id" column 791 if event_type == "others" and "slide_id" not in data.columns: 792 if os.path.basename(input_path) == "ocular": 793 slide_id = os.path.basename(os.path.dirname(input_path)) 794 else: 795 slide_id = "UNKNOWN" 796 data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id) 797 798 # Sort according to ascending cell_id to keep the original, which is in manual_df 799 data = data.sort_values(by=["cell_id"], ascending=True) 800 # Filter out duplicates by x & y 801 data = data.assign( 802 unique_id=data["slide_id"] 803 + "_" 804 + data["frame_id"].astype(str) 805 + "_" 806 + data["cellx"].astype(int).astype(str) 807 + "_" 808 + data["celly"].astype(int).astype(str) 809 ) 810 data = data.drop_duplicates(subset=["unique_id"], keep="first") 811 # Normal unique_id is with cell_id 812 data = data.assign( 813 unique_id=data["slide_id"] 814 + "_" 815 + data["frame_id"].astype(str) 816 + "_" 817 + data["cell_id"].astype(str) 818 ) 819 data = data.reset_index(drop=True) 820 # All columns up to "slide_id" are features; drop the "slide_id" 821 features = data.loc[:, :"slide_id"].iloc[:, :-1] 822 data = data.loc[:, "slide_id":] 823 # Grab the info columns 824 info = data[["slide_id", "frame_id", "cellx", "celly"]] 825 info.columns = ["slide_id", "tile", "x", "y"] 826 info = info.assign( 827 roi=0, # OCULAR only works on 1 ROI, as far as known 828 size=25, # Static, for later montaging 829 ) 830 info = info[["slide_id", "tile", "roi", "x", "y", "size"]] 831 # Metadata has duplicate columns for later convenience 832 metadata = data 833 # Certain columns tend to be problematic with mixed data formats... 834 for col in ["TRITC", "CY5", "FITC"]: 835 if col in metadata: 836 labels = { 837 "False": False, 838 "True": True, 839 "FALSE": False, 840 "TRUE": True, 841 } 842 metadata[col] = metadata[col].map(labels).astype(bool) 843 for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]: 844 if col in metadata: 845 metadata[col] = metadata[col].fillna(-1).astype(int) 846 return EventArray(info, metadata, features)
Load events from OCULAR output files (.rds), including metadata and features.
Parameters
- input_path: the path to an OCULAR output directory, or to a single .rds file.
- event_type: the kind of events to load; either "cells" or "others".
- cell_data_files: the file names to load when event_type is "cells".
- others_data_files: the file names to load when event_type is "others".
- atlas_data_files: the file names that may contain atlas-classified common events.
- drop_common_events: whether to drop events classified as "common_cell" in the atlas files.
- log: an optional logger for progress and warning messages.
Returns
an EventArray with the loaded events.
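Requires the optional pyreadr dependency (installed via the [rds] extra). A sketch with a hypothetical OCULAR output directory:

cells = EventArray.load_ocular("/path/to/slide/ocular")
others = EventArray.load_ocular("/path/to/slide/ocular", event_type="others")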
848 def save_ocular(self, output_path: str, event_type: str = "cells"): 849 """ 850 Save the events to an OCULAR file. Relies on the dataframe originating 851 from an OCULAR file (same columns; duplicate metadata/info). 852 :param output_path: 853 :param event_type: 854 :return: 855 """ 856 if event_type == "cells": 857 file_stub = "rc-final" 858 elif event_type == "others": 859 file_stub = "others-final" 860 else: 861 raise ValueError("Invalid event type. Must be cells or others.") 862 863 # Check for the "ocular_interesting" column 864 if event_type == "cells": 865 if "ocular_interesting" in self.metadata.columns: 866 interesting_rows = self.metadata["ocular_interesting"].to_numpy( 867 dtype=bool 868 ) 869 elif "hcpc" in self.metadata.columns: 870 # Interesting cells don't get an hcpc designation, leaving them as -1 871 interesting_rows = ( 872 self.metadata["hcpc"].to_numpy() == -1 873 ) # interesting cells 874 else: 875 interesting_rows = [] 876 if sum(interesting_rows) > 0: 877 # Split the metadata into interesting and regular 878 interesting_events = self.rows(interesting_rows) 879 interesting_df = pd.concat( 880 [interesting_events.features, interesting_events.metadata], axis=1 881 ) 882 data_events = self.rows(~interesting_rows) 883 data_df = pd.concat( 884 [data_events.features, data_events.metadata], axis=1 885 ) 886 data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore") 887 888 # Drop particular columns for "interesting" 889 interesting_df = interesting_df.drop( 890 [ 891 "clust", 892 "hcpc", 893 "frame_id", 894 "cell_id", 895 "unique_id", 896 "ocular_interesting", 897 ], 898 axis=1, 899 errors="ignore", 900 ) 901 # Save both .csv and .rds 902 interesting_df.to_csv( 903 os.path.join(output_path, "ocular_interesting.csv"), index=False 904 ) 905 pyreadr.write_rds( 906 os.path.join(output_path, "ocular_interesting.rds"), interesting_df 907 ) 908 else: 909 data_df = pd.concat([self.features, self.metadata], axis=1) 910 else: 911 # Get all data and reset_index (will copy it) 912 data_df = pd.concat([self.features, self.metadata], axis=1) 913 914 # Split based on cluster number to conform to *-final[1-4].rds 915 n_clusters = max(data_df["clust"]) + 1 916 split_idx = [round(i * n_clusters / 4) for i in range(5)] 917 for i in range(4): 918 subset = (split_idx[i] <= data_df["clust"]) & ( 919 data_df["clust"] < split_idx[i + 1] 920 ) 921 data_df.loc[subset, "hcpc"] = i + 1 922 subset = data_df[subset].reset_index(drop=True) 923 pyreadr.write_rds( 924 os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset 925 ) 926 927 # Create new example cell strings 928 data_df["example_cell_id"] = ( 929 data_df["slide_id"] 930 + " " 931 + data_df["frame_id"].astype(str) 932 + " " 933 + data_df["cell_id"].astype(str) 934 + " " 935 + data_df["cellx"].astype(int).astype(str) 936 + " " 937 + data_df["celly"].astype(int).astype(str) 938 ) 939 # Find averagable data columns 940 if "cellcluster_id" in data_df.columns: 941 end_idx = data_df.columns.get_loc("cellcluster_id") 942 else: 943 end_idx = data_df.columns.get_loc("slide_id") 944 avg_cols = data_df.columns[:end_idx].tolist() 945 # Group by cluster and average 946 data_df = data_df.groupby("clust").agg( 947 **{col: (col, "mean") for col in avg_cols}, 948 count=("clust", "size"), # count rows in each cluster 949 example_cells=("example_cell_id", lambda x: ",".join(x)), 950 hcpc=("hcpc", lambda x: x.iloc[0]), 951 ) 952 data_df = data_df.reset_index() # Do NOT drop, index is "clust" 953 # Create new columns 954 metadata = pd.DataFrame( 
955 { 956 "count": data_df["count"], 957 "example_cells": data_df["example_cells"], 958 "clust": data_df["clust"].astype(int), 959 "hcpc": data_df["hcpc"].astype(int), 960 "id": data_df["clust"].astype(int).astype(str), 961 "cccluster": "0", # Dummy value 962 "ccdistance": 0.0, # Dummy value 963 "rownum": list(range(len(data_df))), 964 "framegroup": 0, # Dummy value 965 } 966 ) 967 data_df = pd.concat([data_df[avg_cols], metadata], axis=1) 968 # Save the cluster data 969 data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"), index=False) 970 pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)
Save the events to an OCULAR file. Relies on the dataframe originating from an OCULAR file (same columns; duplicate metadata/info).
Parameters
- output_path: the directory to write the OCULAR .rds and .csv files into.
- event_type: the kind of events to save; either "cells" or "others".
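A sketch with a hypothetical output directory; this only works if the array's metadata retains the OCULAR columns (e.g. clust, hcpc, frame_id, cell_id, cellx, celly), such as data returned by load_ocular():

events.save_ocular("/path/to/output", event_type="cells")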