csi_images.csi_events
Contains the Event class, which represents a single event in a scan. The Event class optionally holds metadata and features. Lists of events with similar metadata or features can be combined into DataFrames for analysis.
The Event class holds the position of the event in the frame, which can be converted to scanner or slide coordinate positions. See the csi_images.csi_scans documentation page for more information on the coordinate systems.
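For orientation, here is a minimal sketch of the intended workflow: build events and combine them into an EventArray. It borrows Scan.make_placeholder with the same argument shape that EventArray.to_events uses below; the exact placeholder argument meanings are an assumption here, and a real workflow would load genuine scan metadata through csi_images.csi_scans instead.

```python
from csi_images.csi_scans import Scan
from csi_images.csi_tiles import Tile
from csi_images.csi_events import Event, EventArray

# Placeholder scan metadata (argument shape borrowed from to_events below);
# real workflows would load actual scan metadata via csi_scans instead.
scan = Scan.make_placeholder("EXAMPLE_SLIDE", 0, 0)
tile = Tile(scan, 0, 0)  # tile number 0 in ROI 0

# Two events at pixel positions within that tile's frame
events = [Event(scan, tile, x=120, y=88), Event(scan, tile, x=400, y=312)]

# Combine into an EventArray for DataFrame-style analysis
array = EventArray.from_events(events)
print(array.info)  # columns: slide_id, tile, roi, x, y, size
```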
1""" 2Contains the Event class, which represents a single event in a scan. 3The Event class optionally holds metadata and features. Lists of events with 4similar metadata or features can be combined into DataFrames for analysis. 5 6The Event class holds the position of the event in the frame, which can be converted 7to the position in the scanner or slide coordinate positions. See the 8csi_utils.csi_scans documentation page for more information on the coordinate systems. 9""" 10 11import os 12import math 13import typing 14 15import numpy as np 16import pandas as pd 17 18import pyreadr 19 20from .csi_scans import Scan 21from .csi_tiles import Tile 22from .csi_frames import Frame 23 24 25class Event: 26 """ 27 A class that represents a single event in a scan, making it easy to evaluate 28 singular events. Required metadata is exposed as attributes, and optional 29 metadata and features are stored as DataFrames. 30 """ 31 32 SCAN_TO_SLIDE_TRANSFORM = { 33 # Axioscan zero is in the top-right corner instead of top-left 34 Scan.Type.AXIOSCAN7: np.array( 35 [ 36 [1, 0, 75000], 37 [0, 1, 0], 38 [0, 0, 1], 39 ] 40 ), 41 # BZScanner coordinates are a special kind of messed up: 42 # - The slide is upside-down. 43 # - The slide is oriented vertically, with the barcode at the bottom. 44 # - Tiles are numbered from the top-right 45 Scan.Type.BZSCANNER: np.array( 46 [ 47 [0, -1, 75000], 48 [-1, 0, 25000], 49 [0, 0, 1], 50 ] 51 ), 52 } 53 """ 54 Homogeneous transformation matrices for converting between scanner and slide 55 coordinates. The matrices are 3x3, with the final column representing the 56 translation in micrometers (um). For more information, see 57 [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations). 58 59 Transformations are nominal, and accuracy is not guaranteed; this is due to 60 imperfections in slides and alignment in the scanners. Units are in micrometers. 61 """ 62 63 def __init__( 64 self, 65 scan: Scan, 66 tile: Tile, 67 x: int, 68 y: int, 69 size: int = 12, # End-to-end size in pixels 70 metadata: pd.Series = None, 71 features: pd.Series = None, 72 ): 73 self.scan = scan 74 self.tile = tile 75 self.x = x 76 self.y = y 77 self.size = size 78 self.metadata = metadata 79 self.features = features 80 81 def __repr__(self) -> str: 82 return f"{self.scan.slide_id}-{self.tile.n}-{self.x}-{self.y}" 83 84 def __eq__(self, other) -> bool: 85 return self.__repr__() == other.__repr__() 86 87 def __lt__(self, other): 88 return self.__repr__() < other.__repr__() 89 90 def get_scan_position(self) -> tuple[float, float]: 91 """ 92 Get the position of the event in the scanner's coordinate frame. 93 :return: the scan position of the event in micrometers (um). 94 """ 95 # Get overall pixel position 96 pixel_x = self.x + (self.scan.tile_width_px * self.tile.x) 97 pixel_y = self.y + (self.scan.tile_height_px * self.tile.y) 98 # Convert to micrometers 99 x_um = pixel_x * self.scan.pixel_size_um 100 y_um = pixel_y * self.scan.pixel_size_um 101 # Add the scan's origin in the scanner frame 102 x_um += self.scan.roi[self.tile.n_roi].origin_x_um 103 y_um += self.scan.roi[self.tile.n_roi].origin_y_um 104 return x_um, y_um 105 106 def get_slide_position(self) -> tuple[float, float]: 107 """ 108 Get the slide position of the event in micrometers (um). 109 :return: the slide position of the event. 
110 """ 111 # Turn scan_position into a 3x1 vector 112 scan_position = self.get_scan_position() 113 scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]]) 114 115 # Multiply by the appropriate homogeneous matrix 116 if self.scan.scanner_id.startswith(self.scan.Type.AXIOSCAN7.value): 117 transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.AXIOSCAN7] 118 elif self.scan.scanner_id.startswith(self.scan.Type.BZSCANNER.value): 119 transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.BZSCANNER] 120 else: 121 raise ValueError(f"Scanner type {self.scan.scanner_id} not supported.") 122 slide_position = np.matmul(transform, scan_position) 123 return float(slide_position[0][0]), float(slide_position[1][0]) 124 125 def crop_images( 126 self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True 127 ) -> list[np.ndarray]: 128 """ 129 Get the event crops from the frame images. Called "get" because it does not 130 need to extract anything; it is very quick for extracting multiple events from 131 the same tile. 132 Use this if you're interested in many events. 133 :param images: the frame images. 134 :param crop_size: the square size of the image crop to get for this event. 135 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 136 :return: image_size x image_size crops of the event in the provided frames. If 137 the event is too close to the edge, the crop will be smaller and not centered. 138 """ 139 # Convert a crop size in micrometers to pixels 140 if not in_pixels: 141 crop_size = round(crop_size / self.scan.pixel_size_um) 142 # Find the crop bounds 143 bounds = [ 144 self.x - crop_size // 2, 145 self.y - crop_size // 2, 146 self.x + math.ceil(crop_size / 2), 147 self.y + math.ceil(crop_size / 2), 148 ] 149 # Determine how much the bounds violate the image size 150 displacements = [ 151 max(0, -bounds[0]), 152 max(0, -bounds[1]), 153 max(0, bounds[2] - images[0].shape[1]), 154 max(0, bounds[3] - images[0].shape[0]), 155 ] 156 # Cap off the bounds 157 bounds = [ 158 max(0, bounds[0]), 159 max(0, bounds[1]), 160 min(images[0].shape[1], bounds[2]), 161 min(images[0].shape[0], bounds[3]), 162 ] 163 164 # Crop the images 165 cropped_images = [] 166 for image in images: 167 # Create a blank image of the right size 168 cropped_image = np.zeros((crop_size, crop_size), dtype=image.dtype) 169 170 # Insert the cropped image into the blank image, leaving a black buffer 171 # around the edges if the crop would go beyond the original image bounds 172 cropped_image[ 173 displacements[1] : crop_size - displacements[3], 174 displacements[0] : crop_size - displacements[2], 175 ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]] 176 cropped_images.append(cropped_image) 177 return cropped_images 178 179 def extract_images( 180 self, crop_size: int = 100, in_pixels: bool = True 181 ) -> list[np.ndarray]: 182 """ 183 Extract the images from the scan and tile, reading from the file. Called 184 "extract" because it must read and extract the images from file, which is slow. 185 Use this if you're interested in only a few events, as it is inefficient when 186 reading multiple events from the same tile. 187 :param crop_size: the square size of the image crop to get for this event. 188 :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels. 189 :return: a list of cropped images from the scan in the order of the channels. 
190 """ 191 frames = Frame.get_frames(self.tile) 192 images = [frame.get_image() for frame in frames] 193 return self.crop_images(images, crop_size, in_pixels) 194 195 @classmethod 196 def extract_images_for_list( 197 cls, 198 events: list[typing.Self], 199 crop_size: int | list[int] = None, 200 in_pixels: bool = True, 201 ) -> list[list[np.ndarray]]: 202 """ 203 Get the images for a list of events, ensuring that there is no wasteful reading 204 of the same tile multiple times. This function is more efficient than calling 205 extract_event_images for each event. 206 TODO: test this function 207 :param events: the events to extract images for. 208 :param crop_size: the square size of the image crop to get for this event. 209 Defaults to four times the size of the event. 210 :param in_pixels: whether the crop size is in pixels or micrometers. 211 Defaults to pixels, and is ignored if crop_size is None. 212 :return: a list of lists of cropped images for each event. 213 """ 214 if len(events) == 0: 215 return [] 216 217 # Populate a crop size if none provided 218 if crop_size is None: 219 crop_size = [4 * event.size for event in events] 220 in_pixels = True 221 # Propagate a constant crop size 222 elif isinstance(crop_size, int): 223 crop_size = [crop_size] * len(events) 224 225 # Sort the events by tile; use a shallow copy to avoid modifying the original 226 order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__())) 227 228 # Allocate the list to size 229 images = [None] * len(events) 230 last_tile = None 231 frame_images = None # Holds large numpy arrays, so expensive to compare 232 # Iterate through in sorted order 233 for i in order: 234 if last_tile != events[i].tile: 235 # Gather the frame images, preserving them for the next event 236 frames = Frame.get_frames(events[i].tile) 237 frame_images = [frame.get_image() for frame in frames] 238 239 last_tile = events[i].tile 240 # Use the frame images to crop the event images 241 # Preserve the original order using order[i] 242 images[i] = events[i].crop_images(frame_images, crop_size[i], in_pixels) 243 return images 244 245 246class EventArray: 247 """ 248 A class that holds a large number of events' data, making it easy to analyze and 249 manipulate many events at once. A more separated version of the Event class. 
250 """ 251 252 INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y", "size"] 253 254 def __init__( 255 self, 256 info: pd.DataFrame = None, 257 metadata: pd.DataFrame = None, 258 features: pd.DataFrame = None, 259 ): 260 # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y", "size" 261 if info is not None and ( 262 not all( 263 col in info.columns 264 for col in ["slide_id", "tile", "roi", "x", "y", "size"] 265 ) 266 or len(info.columns) != 6 267 ): 268 raise ValueError( 269 "EventArray.info must have columns 'slide_id', 'tile', 'roi', 'x', 'y', 'size'" 270 ) 271 # All DataFrames must all have the same number of rows 272 if metadata is not None and (info is None or len(info) != len(metadata)): 273 raise ValueError( 274 "If EventArray.metadata is not None, it should match rows with .info" 275 ) 276 if features is not None and (info is None or len(info) != len(features)): 277 raise ValueError( 278 "If EventArray.features is not None, it should match rows with .info" 279 ) 280 self.info = info 281 self.metadata = metadata 282 self.features = features 283 284 def __len__(self) -> int: 285 # Convenience method to get the number of events 286 if self.info is None: 287 return 0 288 else: 289 return len(self.info) 290 291 def __eq__(self, other): 292 is_equal = True 293 # Parse all possibilities for info 294 if isinstance(self.info, pd.DataFrame): 295 if isinstance(other.info, pd.DataFrame): 296 is_equal = self.info.equals(other.info) 297 if not is_equal: 298 return False 299 else: 300 return False 301 elif self.info is None: 302 if other.info is not None: 303 return False 304 305 # Parse all possibilities for metadata 306 if isinstance(self.metadata, pd.DataFrame): 307 if isinstance(other.metadata, pd.DataFrame): 308 is_equal = self.metadata.equals(other.metadata) 309 if not is_equal: 310 return False 311 else: 312 return False 313 elif self.metadata is None: 314 if other.metadata is not None: 315 return False 316 317 # Parse all possibilities for features 318 if isinstance(self.features, pd.DataFrame): 319 if isinstance(other.features, pd.DataFrame): 320 is_equal = self.features.equals(other.features) 321 if not is_equal: 322 return False 323 else: 324 return False 325 elif self.features is None: 326 if other.features is not None: 327 return False 328 329 return is_equal 330 331 def get_sort_order(self, by: str | list[str], ascending: bool | list[bool] = True): 332 """ 333 Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames. 334 :param by: name of the column(s) to sort by. 335 :param ascending: whether to sort in ascending order; can be a list to match by 336 :return: the order of the indices to sort by. 337 """ 338 columns = self.get(by) 339 return columns.sort_values(by=by, ascending=ascending).index 340 341 def sort( 342 self, by: str | list[str], ascending: bool | list[bool] = True 343 ) -> typing.Self: 344 """ 345 Sort the EventArray by column(s) in the info, metadata, or features DataFrames. 346 :param by: name of the column(s) to sort by. 347 :param ascending: whether to sort in ascending order; can be a list to match by 348 :return: a new, sorted EventArray. 
349 """ 350 order = self.get_sort_order(by, ascending) 351 info = self.info.loc[order].reset_index(drop=True) 352 if self.metadata is not None: 353 metadata = self.metadata.loc[order].reset_index(drop=True) 354 else: 355 metadata = None 356 if self.features is not None: 357 features = self.features.loc[order].reset_index(drop=True) 358 else: 359 features = None 360 return EventArray(info, metadata, features) 361 362 def get(self, column_names: int | str | list[int] | list[str]) -> pd.DataFrame: 363 """ 364 Get a DataFrame with the specified columns from the EventArray, by value. 365 :param column_names: the names of the columns to get. 366 :return: a DataFrame with the specified columns. 367 """ 368 if isinstance(column_names, int) or isinstance(column_names, str): 369 column_names = [column_names] 370 columns = [] 371 for column_name in column_names: 372 if column_name in self.info.columns: 373 columns.append(self.info[column_name]) 374 elif self.metadata is not None and column_name in self.metadata.columns: 375 columns.append(column_name) 376 elif self.features is not None and column_name in self.features.columns: 377 columns.append(column_name) 378 else: 379 raise ValueError(f"Column {column_name} not found in EventArray") 380 return pd.concat(columns, axis=1) 381 382 def rows(self, rows) -> typing.Self: 383 """ 384 Get a subset of the EventArray rows based on a boolean or integer index, by value. 385 :param rows: the indices to get as a 1D boolean/integer list/array/series 386 :return: a new EventArray with the subset of events. 387 """ 388 info = self.info.loc[rows].reset_index(drop=True) 389 if self.metadata is not None: 390 metadata = self.metadata.loc[rows].reset_index(drop=True) 391 else: 392 metadata = None 393 if self.features is not None: 394 features = self.features.loc[rows].reset_index(drop=True) 395 else: 396 features = None 397 return EventArray(info, metadata, features) 398 399 def copy(self) -> typing.Self: 400 """ 401 Create a deep copy of the EventArray. 402 :return: a deep copy of the EventArray. 403 """ 404 return EventArray( 405 info=self.info.copy(), 406 metadata=None if self.metadata is None else self.metadata.copy(), 407 features=None if self.features is None else self.features.copy(), 408 ) 409 410 def add_metadata(self, new_metadata: pd.DataFrame) -> None: 411 """ 412 Add metadata to the EventArray. Removes the need to check if metadata is None. 413 Overwrites any existing metadata with the same column names as the new metadata. 414 :param new_metadata: the metadata to add. 415 """ 416 if len(self) != len(new_metadata): 417 raise ValueError("New metadata must match length of existing info") 418 419 if self.metadata is None: 420 self.metadata = new_metadata 421 else: 422 self.metadata[new_metadata.columns] = new_metadata 423 424 def add_features(self, new_features: pd.DataFrame) -> None: 425 """ 426 Add features to the EventArray. Removes the need to check if features is None. 427 Overwrites any existing features with the same column names as the new features. 428 :param new_features: the features to add. 429 """ 430 if len(self) != len(new_features): 431 raise ValueError("New features must match length of existing info") 432 433 if self.features is None: 434 self.features = new_features 435 else: 436 self.features[new_features.columns] = new_features 437 438 @classmethod 439 def merge(cls, events: list[typing.Self]) -> typing.Self: 440 """ 441 Combine EventArrays in a list into a single EventArray. 442 :param events: the new list of events. 
443 """ 444 all_info = [] 445 all_metadata = [] 446 all_features = [] 447 for event_array in events: 448 # Skip empty EventArrays 449 if event_array.info is not None: 450 all_info.append(event_array.info) 451 if event_array.metadata is not None: 452 all_metadata.append(event_array.metadata) 453 if event_array.features is not None: 454 all_features.append(event_array.features) 455 if len(all_info) == 0: 456 return EventArray() 457 else: 458 all_info = pd.concat(all_info, ignore_index=True) 459 if len(all_metadata) == 0: 460 all_metadata = None 461 else: 462 all_metadata = pd.concat(all_metadata, ignore_index=True) 463 if len(all_features) == 0: 464 all_features = None 465 else: 466 all_features = pd.concat(all_features, ignore_index=True) 467 468 return EventArray(all_info, all_metadata, all_features) 469 470 @classmethod 471 def from_events(cls, events: list[Event]) -> typing.Self: 472 """ 473 Set the events in the EventArray to a new list of events. 474 :param events: the new list of events. 475 """ 476 # Return an empty array if we were passed nothing 477 if events is None or len(events) == 0: 478 return EventArray() 479 # Otherwise, grab the info 480 info = pd.DataFrame( 481 { 482 "slide_id": [event.scan.slide_id for event in events], 483 "tile": [event.tile.n for event in events], 484 "roi": [event.tile.n_roi for event in events], 485 "x": [event.x for event in events], 486 "y": [event.y for event in events], 487 "size": [event.size for event in events], 488 } 489 ) 490 metadata_list = [event.metadata for event in events] 491 # Iterate through and ensure that all metadata is the same shape 492 for metadata in metadata_list: 493 if type(metadata) != type(metadata_list[0]): 494 raise ValueError("All metadata must be the same type.") 495 if metadata is not None and metadata.shape != metadata_list[0].shape: 496 raise ValueError("All metadata must be the same shape.") 497 if metadata_list[0] is None: 498 metadata = None 499 else: 500 metadata = pd.DataFrame(metadata_list) 501 features_list = [event.features for event in events] 502 # Iterate through and ensure that all features are the same shape 503 for features in features_list: 504 if type(features) != type(features_list[0]): 505 raise ValueError("All features must be the same type.") 506 if features is not None and features.shape != features_list[0].shape: 507 raise ValueError("All features must be the same shape.") 508 if features_list[0] is None: 509 features = None 510 else: 511 features = pd.DataFrame(features_list) 512 return EventArray(info=info, metadata=metadata, features=features) 513 514 def to_events( 515 self, 516 scans: list[Scan], 517 ignore_missing_scans=True, 518 ignore_metadata=False, 519 ignore_features=False, 520 ) -> list[Event]: 521 """ 522 Get the events in the EventArray as a list of events. 523 :param scans: the scans that the events belong to. Pass an empty list if you 524 don't care about scan metadata. 525 :param ignore_missing_scans: whether to create blank scans for events without scans. 
526 :param ignore_metadata: whether to ignore metadata or not 527 :param ignore_features: whether to ignore features or not 528 :return: 529 """ 530 events = [] 531 for i in range(len(self.info)): 532 # Determine the associated scan 533 scan = None 534 for s in scans: 535 if s.slide_id == self.info["slide_id"][i]: 536 scan = s 537 break 538 if scan is None: 539 if ignore_missing_scans: 540 # Create a placeholder scan if the scan is missing 541 scan = Scan.make_placeholder( 542 self.info["slide_id"][i], 543 self.info["tile"][i], 544 self.info["roi"][i], 545 ) 546 else: 547 raise ValueError( 548 f"Scan {self.info['slide_id'][i]} not found for event {i}." 549 ) 550 # Add to the list 551 events.append( 552 Event( 553 scan, 554 Tile(scan, self.info["tile"][i], self.info["roi"][i]), 555 self.info["x"][i], 556 self.info["y"][i], 557 size=self.info["size"][i], 558 metadata=None if ignore_metadata else self.metadata.loc[i], 559 features=None if ignore_features else self.features.loc[i], 560 ) 561 ) 562 return events 563 564 def to_dataframe(self) -> pd.DataFrame: 565 """ 566 Convert all the data in the EventArray to a single DataFrame. 567 :return: a DataFrame with all the data in the EventArray. 568 """ 569 # Make a copy of the info DataFrame and prepend "info_" to the column names 570 output = self.info.copy() 571 output.columns = [f"info_{col}" for col in output.columns] 572 # Combine with the metadata and prepend "metadata_" to the column names 573 if self.metadata is not None: 574 metadata = self.metadata.copy() 575 metadata.columns = [f"metadata_{col}" for col in metadata.columns] 576 output = pd.concat([output, metadata], axis=1) 577 # Combine with the features and prepend "features_" to the column names 578 if self.features is not None: 579 features = self.features.copy() 580 features.columns = [f"features_{col}" for col in features.columns] 581 output = pd.concat([output, features], axis=1) 582 return output 583 584 @classmethod 585 def from_dataframe(cls, df) -> typing.Self: 586 """ 587 From a single, special DataFrame, create an EventArray. 588 :return: a DataFrame with all the data in the EventArray. 589 """ 590 # Split the columns into info, metadata, and features and strip prefix 591 info = df[[col for col in df.columns if col.startswith("info_")]].copy() 592 info.columns = [col.replace("info_", "") for col in info.columns] 593 if info.size == 0: 594 info = None 595 metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy() 596 metadata.columns = [col.replace("metadata_", "") for col in metadata.columns] 597 if metadata.size == 0: 598 metadata = None 599 features = df[[col for col in df.columns if col.startswith("features_")]].copy() 600 features.columns = [col.replace("features_", "") for col in features.columns] 601 if features.size == 0: 602 features = None 603 return cls(info=info, metadata=metadata, features=features) 604 605 def save_csv(self, output_path: str) -> bool: 606 """ 607 Save the events to an CSV file, including metadata and features. 608 :param output_path: 609 :return: 610 """ 611 self.to_dataframe().to_csv(output_path, index=False) 612 return os.path.exists(output_path) 613 614 @classmethod 615 def load_csv(cls, input_path: str) -> typing.Self: 616 """ 617 Load the events from an CSV file, including metadata and features. 
618 :param input_path: 619 :return: 620 """ 621 # Load the CSV file 622 df = pd.read_csv(input_path) 623 return cls.from_dataframe(df) 624 625 def save_hdf5(self, output_path: str) -> bool: 626 """ 627 Save the events to an HDF5 file, including metadata and features. 628 Uses the pandas-provided HDF5 functions for ease, and external compatibility, 629 though these files are slightly harder to view in HDFView or similar. 630 :param output_path: 631 :return: 632 """ 633 # Open the output_path as an HDF5 file 634 with pd.HDFStore(output_path) as store: 635 # Store the dataframes in the HDF5 file 636 if self.info is not None: 637 store.put("info", self.info, index=False) 638 if self.metadata is not None: 639 store.put("metadata", self.metadata, index=False) 640 if self.features is not None: 641 store.put("features", self.features, index=False) 642 return os.path.exists(output_path) 643 644 @classmethod 645 def load_hdf5(cls, input_path: str) -> typing.Self: 646 """ 647 Load the events from an HDF5 file, including metadata and features. 648 :param input_path: 649 :return: 650 """ 651 # Open the input_path as an HDF5 file 652 with pd.HDFStore(input_path) as store: 653 # Load the dataframes from the HDF5 file 654 info = store.get("info") if "info" in store else None 655 metadata = store.get("metadata") if "metadata" in store else None 656 features = store.get("features") if "features" in store else None 657 return cls(info=info, metadata=metadata, features=features) 658 659 @classmethod 660 def load_ocular( 661 cls, 662 input_path: str, 663 event_type="cells", 664 cell_data_files=( 665 "rc-final1.rds", 666 "rc-final2.rds", 667 "rc-final3.rds", 668 "rc-final4.rds", 669 "ocular_interesting.rds", 670 ), 671 others_data_files=( 672 "others-final1.rds", 673 "others-final2.rds", 674 "others-final3.rds", 675 "others-final4.rds", 676 ), 677 atlas_data_files=( 678 "ocular_interesting.rds", 679 "ocular_not_interesting.rds", 680 ), 681 merge_event_data_with_stats=True, 682 filter_and_generate_morphs=True, 683 drop_common_events=True, 684 log=None, 685 ) -> typing.Self: 686 """ 687 688 :param input_path: 689 :param event_type: 690 :param cell_data_files: 691 :param others_data_files: 692 :param atlas_data_files: 693 :param merge_event_data_with_stats: 694 :param filter_and_generate_morphs: 695 :param drop_common_events: 696 :param log: 697 :return: 698 """ 699 # Check if the input path is a directory or a file 700 if os.path.isfile(input_path): 701 data_files = [os.path.basename(input_path)] 702 input_path = os.path.dirname(input_path) 703 if event_type == "cells": 704 data_files = cell_data_files 705 elif event_type == "others": 706 data_files = others_data_files 707 else: 708 raise ValueError("Invalid event type.") 709 710 # Load the data from the OCULAR files 711 file_data = {} 712 for file in data_files: 713 file_path = os.path.join(input_path, file) 714 if not os.path.isfile(file_path): 715 if log is not None: 716 log.warning(f"{file} not found for in {input_path}") 717 continue 718 file_data[file] = pyreadr.read_r(file_path) 719 # Get the DataFrame associated with None (pyreadr dict quirk) 720 file_data[file] = file_data[file][None] 721 if len(file_data[file]) == 0: 722 # File gets dropped from the dict 723 file_data.pop(file) 724 if log is not None: 725 log.warning(f"{file} has no cells") 726 continue 727 728 if log is not None: 729 log.debug(f"{file} has {len(file_data[file])} cells") 730 731 # Drop common cells if requested and in this file 732 if file in atlas_data_files and drop_common_events: 733 
common_cell_indices = ( 734 file_data[file]["catalogue_classification"] == "common_cell" 735 ) 736 if log is not None: 737 log.debug( 738 f"Dropping {int(pd.Series.sum(common_cell_indices))}" 739 f"common cells from {file}" 740 ) 741 file_data[file] = file_data[file][common_cell_indices == False] 742 743 if len(file_data[file]) == 0: 744 # File gets dropped from the dict 745 file_data.pop(file) 746 if log is not None: 747 log.warning(f"{file} has no cells after dropping common cells") 748 continue 749 750 # Extract frame_id and cell_id 751 # DAPI- events already have frame_id cell_id outside rowname 752 if event_type == "cells": 753 file_data[file]["rowname"] = file_data[file]["rowname"].astype("str") 754 # get frame_id cell_id from rownames column and split into two columns 755 split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True) 756 if len(split_res.columns) != 2: 757 log.warning( 758 f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}' 759 ) 760 # then assign it back to the dataframe 761 file_data[file][["frame_id", "cell_id"]] = split_res.astype("int") 762 # reset indexes since they can cause NaN values in concat 763 file_data[file] = file_data[file].reset_index(drop=True) 764 765 # Merge the data from all files 766 if len(file_data) == 0: 767 return EventArray() 768 elif len(file_data) == 1: 769 data = [file_data[file] for file in file_data.keys()][0] 770 else: 771 data = pd.concat(file_data.values()) 772 773 if log is not None: 774 log.debug(f"Gathered a total of {len(data)} events") 775 776 # Others is missing the "slide_id". Insert it right before "frame_id" column 777 if event_type == "others" and "slide_id" not in data.columns: 778 if os.path.basename(input_path) == "ocular": 779 slide_id = os.path.basename(os.path.dirname(input_path)) 780 else: 781 slide_id = "UNKNOWN" 782 data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id) 783 784 # Sort according to ascending cell_id to keep the original, which is in manual_df 785 data = data.sort_values(by=["cell_id"], ascending=True) 786 # Filter out duplicates by x & y 787 data = data.assign( 788 unique_id=data["slide_id"] 789 + "_" 790 + data["frame_id"].astype(str) 791 + "_" 792 + data["cellx"].astype(int).astype(str) 793 + "_" 794 + data["celly"].astype(int).astype(str) 795 ) 796 data = data.drop_duplicates(subset=["unique_id"], keep="first") 797 # Normal unique_id is with cell_id 798 data = data.assign( 799 unique_id=data["slide_id"] 800 + "_" 801 + data["frame_id"].astype(str) 802 + "_" 803 + data["cell_id"].astype(str) 804 ) 805 data = data.reset_index(drop=True) 806 # All columns up to "slide_id" are features; drop the "slide_id" 807 features = data.loc[:, :"slide_id"].iloc[:, :-1] 808 data = data.loc[:, "slide_id":] 809 # Grab the info columns 810 info = data[["slide_id", "frame_id", "cellx", "celly"]] 811 info.columns = ["slide_id", "tile", "x", "y"] 812 info = info.assign( 813 roi=0, # OCULAR only works on 1 ROI, as far as known 814 size=25, # Static, for later montaging 815 ) 816 info = info[["slide_id", "tile", "roi", "x", "y", "size"]] 817 # Metadata has duplicate columns for later convenience 818 metadata = data 819 return EventArray(info, metadata, features) 820 821 def save_ocular(self, output_path: str, event_type: str = "cells") -> bool: 822 """ 823 Save the events to an OCULAR file. Relies on the dataframe originating 824 from an OCULAR file (same columns; duplicate metadata/info). 
825 :param output_path: 826 :return: 827 """ 828 if event_type == "cells": 829 file_stub = "rc-final" 830 elif event_type == "others": 831 file_stub = "others-final" 832 else: 833 raise ValueError("Invalid event type. Must be cells or others.") 834 835 # Check for the "ocular_interesting" column 836 if event_type == "cells" and "ocular_interesting" in self.metadata.columns: 837 interesting = self.metadata["ocular_interesting"] 838 # Split the metadata into interesting and regular 839 # Interesting will only have dropped columns, with no internal changes 840 interesting = pd.concat( 841 [self.features[interesting], self.metadata[interesting]], axis=1 842 ).reset_index(drop=True) 843 # Data will get some columns changed; reset_index will copy it 844 data = ( 845 pd.concat( 846 [self.features[~interesting], self.metadata[~interesting]], axis=1 847 ) 848 .reset_index(drop=True) 849 .drop(columns=["ocular_interesting"]) 850 ) 851 852 # Drop particular columns for "interesting" 853 interesting = interesting.drop( 854 [ 855 "clust", 856 "hcpc", 857 "frame_id", 858 "cell_id", 859 "unique_id", 860 "ocular_interesting", 861 ], 862 axis=1, 863 errors="ignore", 864 ) 865 # Save both .csv and .rds 866 interesting.to_csv( 867 os.path.join(output_path, "ocular_interesting.csv"), index=False 868 ) 869 pyreadr.write_rds( 870 os.path.join(output_path, "ocular_interesting.rds"), interesting 871 ) 872 else: 873 # Get all data and reset_index (will copy it) 874 data = pd.concat([self.features, self.metadata], axis=1).reset_index( 875 drop=True 876 ) 877 878 # Split based on cluster number to conform to *-final[1-4].rds 879 n_clusters = max(data["clust"]) + 1 880 split_idx = [round(i * n_clusters / 4) for i in range(5)] 881 for i in range(4): 882 subset = (split_idx[i] <= data["clust"]) & ( 883 data["clust"] < split_idx[i + 1] 884 ) 885 subset = data[subset].reset_index(drop=True) 886 subset["hcpc"] = i + 1 887 pyreadr.write_rds( 888 os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset 889 ) 890 891 # Create new example cell strings 892 data["example_cell_id"] = ( 893 data["slide_id"] 894 + " " 895 + data["frame_id"].astype(str) 896 + " " 897 + data["cell_id"].astype(str) 898 + " " 899 + data["cellx"].astype(int).astype(str) 900 + " " 901 + data["celly"].astype(int).astype(str) 902 ) 903 # Find averagable data columns 904 if "cellcluster_id" in data.columns: 905 avg_cols = data.columns[: data.columns.get_loc("cellcluster_id")].tolist() 906 else: 907 avg_cols = data.columns[: data.columns.get_loc("slide_id")].tolist() 908 # Group by cluster and average 909 data = data.groupby("clust").agg( 910 **{col: (col, "mean") for col in avg_cols}, 911 count=("clust", "size"), # count rows in each cluster 912 example_cells=("example_cell_id", lambda x: ",".join(x)), 913 hcpc=("hcpc", lambda x: x.iloc[0]), 914 ) 915 data = data.reset_index() # Do NOT drop, index is "clust" 916 # Create new columns 917 metadata = pd.DataFrame( 918 { 919 "count": data["count"], 920 "example_cells": data["example_cells"], 921 "clust": data["clust"].astype(int), 922 "hcpc": data["hcpc"].astype(int), 923 "id": data["clust"].astype(int).astype(str), 924 "cccluster": "0", # Dummy value 925 "ccdistance": 0.0, # Dummy value 926 "rownum": list(range(len(data))), 927 "framegroup": 0, # Dummy value 928 } 929 ) 930 data = pd.concat([data[avg_cols], metadata], axis=1) 931 # Save the data 932 data.to_csv(os.path.join(output_path, f"{file_stub}.csv"), index=False) 933 pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data)
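Continuing the sketch above, the persistence helpers round-trip an EventArray through a single CSV table or an HDF5 file (paths are illustrative; pandas' HDF5 support additionally requires the pytables package):

```python
# CSV: one flat table; the info_/metadata_/features_ column prefixes added by
# to_dataframe() let load_csv() split the table back into three DataFrames
array.save_csv("/tmp/events.csv")
restored = EventArray.load_csv("/tmp/events.csv")

# HDF5: info, metadata, and features are stored as separate keys in one file
array.save_hdf5("/tmp/events.h5")
restored = EventArray.load_hdf5("/tmp/events.h5")
```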
class Event
A class that represents a single event in a scan, making it easy to evaluate singular events. Required metadata is exposed as attributes, and optional metadata and features are stored as DataFrames.
Event(scan: Scan, tile: Tile, x: int, y: int, size: int = 12, metadata: pd.Series = None, features: pd.Series = None)
SCAN_TO_SLIDE_TRANSFORM

Homogeneous transformation matrices for converting between scanner and slide coordinates. The matrices are 3x3, with the final column representing the translation in micrometers (um). For more information, see [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations).
Transformations are nominal, and accuracy is not guaranteed; this is due to imperfections in slides and alignment in the scanners. Units are in micrometers.
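To make the matrix convention concrete, here is a self-contained sketch of the homogeneous-coordinate math that get_slide_position performs, using the BZScanner matrix from above (the input position is illustrative):

```python
import numpy as np

# BZScanner entry of SCAN_TO_SLIDE_TRANSFORM: swap and negate the axes,
# then translate by (75000, 25000) um
transform = np.array(
    [
        [0, -1, 75000],
        [-1, 0, 25000],
        [0, 0, 1],
    ]
)

scan_x_um, scan_y_um = 1200.0, 3400.0  # illustrative scan position in um
vector = np.array([[scan_x_um], [scan_y_um], [1]])  # homogeneous 3x1 vector
slide = transform @ vector
print(float(slide[0][0]), float(slide[1][0]))  # 71600.0 23800.0
```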
def get_scan_position(self) -> tuple[float, float]
Get the position of the event in the scanner's coordinate frame.
Returns
the scan position of the event in micrometers (um).
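A worked sketch of the arithmetic with illustrative scan metadata (0.5 um pixels, 2000 px wide tiles, tile column 3); the real method reads these values from the Scan and Tile objects:

```python
pixel_size_um = 0.5              # illustrative scan.pixel_size_um
tile_width_px, tile_x = 2000, 3  # illustrative scan.tile_width_px, tile.x
origin_x_um = 1000.0             # illustrative scan.roi[0].origin_x_um

x = 150  # event x within the tile's frame, in pixels
pixel_x = x + tile_width_px * tile_x          # 6150 px across the ROI
x_um = pixel_x * pixel_size_um + origin_x_um  # 3075.0 + 1000.0 = 4075.0 um
```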
def get_slide_position(self) -> tuple[float, float]
Get the slide position of the event in micrometers (um).
Returns
the slide position of the event.
def crop_images(self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True) -> list[np.ndarray]
Crop the event out of already-loaded frame images. Does not read anything from file, so it is very quick when cropping many events from the same tile. Use this if you're interested in many events.
Parameters
- images: the frame images.
- crop_size: the square size of the image crop to get for this event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns
crop_size x crop_size crops of the event in the provided frames. If the event is too close to the edge, the crop is zero-padded (black) on the out-of-bounds sides and the event will not be centered.
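A usage sketch, assuming several events that all sit on the same tile (as in the module example above): read the tile's frames once, then crop every event from the same arrays.

```python
from csi_images.csi_frames import Frame

frames = Frame.get_frames(tile)                   # one Frame per channel
images = [frame.get_image() for frame in frames]  # loaded once, reused below
crops_per_event = [event.crop_images(images, crop_size=50) for event in events]
```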
def extract_images(self, crop_size: int = 100, in_pixels: bool = True) -> list[np.ndarray]
Extract the images from the scan and tile, reading from the file. Called "extract" because it must read and extract the images from file, which is slow. Use this if you're interested in only a few events, as it is inefficient when reading multiple events from the same tile.
Parameters
- crop_size: the square size of the image crop to get for this event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns
a list of cropped images from the scan in the order of the channels.
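For a one-off event, this convenience path reads the tile's frames from disk itself; a short sketch (channel order depends on the scan, so treating index 0 as DAPI is an assumption):

```python
crops = event.extract_images(crop_size=25, in_pixels=False)  # 25 um square
first_channel = crops[0]  # e.g. DAPI, if that is the scan's first channel
```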
@classmethod
def extract_images_for_list(cls, events: list[typing.Self], crop_size: int | list[int] = None, in_pixels: bool = True) -> list[list[np.ndarray]]
Get the images for a list of events, ensuring that there is no wasteful reading of the same tile multiple times. This function is more efficient than calling extract_images for each event. TODO: test this function
Parameters
- events: the events to extract images for.
- crop_size: the square size of the image crop to get for each event. Defaults to four times the size of the event.
- in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels, and is ignored if crop_size is None.
Returns
a list of lists of cropped images for each event.
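A usage sketch for batched extraction; each tile's frames are read from disk only once, even when the list mixes events from several tiles:

```python
all_crops = Event.extract_images_for_list(events, crop_size=100)
first_event_crops = all_crops[0]  # crops for events[0], one per channel
```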
class EventArray

A class that holds a large number of events' data, making it easy to analyze and manipulate many events at once. A more separated version of the Event class. Its info, metadata, and features DataFrames always share row count and order; the full implementation, including sorting, merging, conversion, and CSV/HDF5/OCULAR persistence helpers, appears in the module source above.
619 :param input_path: 620 :return: 621 """ 622 # Load the CSV file 623 df = pd.read_csv(input_path) 624 return cls.from_dataframe(df) 625 626 def save_hdf5(self, output_path: str) -> bool: 627 """ 628 Save the events to an HDF5 file, including metadata and features. 629 Uses the pandas-provided HDF5 functions for ease, and external compatibility, 630 though these files are slightly harder to view in HDFView or similar. 631 :param output_path: 632 :return: 633 """ 634 # Open the output_path as an HDF5 file 635 with pd.HDFStore(output_path) as store: 636 # Store the dataframes in the HDF5 file 637 if self.info is not None: 638 store.put("info", self.info, index=False) 639 if self.metadata is not None: 640 store.put("metadata", self.metadata, index=False) 641 if self.features is not None: 642 store.put("features", self.features, index=False) 643 return os.path.exists(output_path) 644 645 @classmethod 646 def load_hdf5(cls, input_path: str) -> typing.Self: 647 """ 648 Load the events from an HDF5 file, including metadata and features. 649 :param input_path: 650 :return: 651 """ 652 # Open the input_path as an HDF5 file 653 with pd.HDFStore(input_path) as store: 654 # Load the dataframes from the HDF5 file 655 info = store.get("info") if "info" in store else None 656 metadata = store.get("metadata") if "metadata" in store else None 657 features = store.get("features") if "features" in store else None 658 return cls(info=info, metadata=metadata, features=features) 659 660 @classmethod 661 def load_ocular( 662 cls, 663 input_path: str, 664 event_type="cells", 665 cell_data_files=( 666 "rc-final1.rds", 667 "rc-final2.rds", 668 "rc-final3.rds", 669 "rc-final4.rds", 670 "ocular_interesting.rds", 671 ), 672 others_data_files=( 673 "others-final1.rds", 674 "others-final2.rds", 675 "others-final3.rds", 676 "others-final4.rds", 677 ), 678 atlas_data_files=( 679 "ocular_interesting.rds", 680 "ocular_not_interesting.rds", 681 ), 682 merge_event_data_with_stats=True, 683 filter_and_generate_morphs=True, 684 drop_common_events=True, 685 log=None, 686 ) -> typing.Self: 687 """ 688 689 :param input_path: 690 :param event_type: 691 :param cell_data_files: 692 :param others_data_files: 693 :param atlas_data_files: 694 :param merge_event_data_with_stats: 695 :param filter_and_generate_morphs: 696 :param drop_common_events: 697 :param log: 698 :return: 699 """ 700 # Check if the input path is a directory or a file 701 if os.path.isfile(input_path): 702 data_files = [os.path.basename(input_path)] 703 input_path = os.path.dirname(input_path) 704 if event_type == "cells": 705 data_files = cell_data_files 706 elif event_type == "others": 707 data_files = others_data_files 708 else: 709 raise ValueError("Invalid event type.") 710 711 # Load the data from the OCULAR files 712 file_data = {} 713 for file in data_files: 714 file_path = os.path.join(input_path, file) 715 if not os.path.isfile(file_path): 716 if log is not None: 717 log.warning(f"{file} not found for in {input_path}") 718 continue 719 file_data[file] = pyreadr.read_r(file_path) 720 # Get the DataFrame associated with None (pyreadr dict quirk) 721 file_data[file] = file_data[file][None] 722 if len(file_data[file]) == 0: 723 # File gets dropped from the dict 724 file_data.pop(file) 725 if log is not None: 726 log.warning(f"{file} has no cells") 727 continue 728 729 if log is not None: 730 log.debug(f"{file} has {len(file_data[file])} cells") 731 732 # Drop common cells if requested and in this file 733 if file in atlas_data_files and drop_common_events: 734 
common_cell_indices = ( 735 file_data[file]["catalogue_classification"] == "common_cell" 736 ) 737 if log is not None: 738 log.debug( 739 f"Dropping {int(pd.Series.sum(common_cell_indices))}" 740 f"common cells from {file}" 741 ) 742 file_data[file] = file_data[file][common_cell_indices == False] 743 744 if len(file_data[file]) == 0: 745 # File gets dropped from the dict 746 file_data.pop(file) 747 if log is not None: 748 log.warning(f"{file} has no cells after dropping common cells") 749 continue 750 751 # Extract frame_id and cell_id 752 # DAPI- events already have frame_id cell_id outside rowname 753 if event_type == "cells": 754 file_data[file]["rowname"] = file_data[file]["rowname"].astype("str") 755 # get frame_id cell_id from rownames column and split into two columns 756 split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True) 757 if len(split_res.columns) != 2: 758 log.warning( 759 f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}' 760 ) 761 # then assign it back to the dataframe 762 file_data[file][["frame_id", "cell_id"]] = split_res.astype("int") 763 # reset indexes since they can cause NaN values in concat 764 file_data[file] = file_data[file].reset_index(drop=True) 765 766 # Merge the data from all files 767 if len(file_data) == 0: 768 return EventArray() 769 elif len(file_data) == 1: 770 data = [file_data[file] for file in file_data.keys()][0] 771 else: 772 data = pd.concat(file_data.values()) 773 774 if log is not None: 775 log.debug(f"Gathered a total of {len(data)} events") 776 777 # Others is missing the "slide_id". Insert it right before "frame_id" column 778 if event_type == "others" and "slide_id" not in data.columns: 779 if os.path.basename(input_path) == "ocular": 780 slide_id = os.path.basename(os.path.dirname(input_path)) 781 else: 782 slide_id = "UNKNOWN" 783 data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id) 784 785 # Sort according to ascending cell_id to keep the original, which is in manual_df 786 data = data.sort_values(by=["cell_id"], ascending=True) 787 # Filter out duplicates by x & y 788 data = data.assign( 789 unique_id=data["slide_id"] 790 + "_" 791 + data["frame_id"].astype(str) 792 + "_" 793 + data["cellx"].astype(int).astype(str) 794 + "_" 795 + data["celly"].astype(int).astype(str) 796 ) 797 data = data.drop_duplicates(subset=["unique_id"], keep="first") 798 # Normal unique_id is with cell_id 799 data = data.assign( 800 unique_id=data["slide_id"] 801 + "_" 802 + data["frame_id"].astype(str) 803 + "_" 804 + data["cell_id"].astype(str) 805 ) 806 data = data.reset_index(drop=True) 807 # All columns up to "slide_id" are features; drop the "slide_id" 808 features = data.loc[:, :"slide_id"].iloc[:, :-1] 809 data = data.loc[:, "slide_id":] 810 # Grab the info columns 811 info = data[["slide_id", "frame_id", "cellx", "celly"]] 812 info.columns = ["slide_id", "tile", "x", "y"] 813 info = info.assign( 814 roi=0, # OCULAR only works on 1 ROI, as far as known 815 size=25, # Static, for later montaging 816 ) 817 info = info[["slide_id", "tile", "roi", "x", "y", "size"]] 818 # Metadata has duplicate columns for later convenience 819 metadata = data 820 return EventArray(info, metadata, features) 821 822 def save_ocular(self, output_path: str, event_type: str = "cells") -> bool: 823 """ 824 Save the events to an OCULAR file. Relies on the dataframe originating 825 from an OCULAR file (same columns; duplicate metadata/info). 
826 :param output_path: 827 :return: 828 """ 829 if event_type == "cells": 830 file_stub = "rc-final" 831 elif event_type == "others": 832 file_stub = "others-final" 833 else: 834 raise ValueError("Invalid event type. Must be cells or others.") 835 836 # Check for the "ocular_interesting" column 837 if event_type == "cells" and "ocular_interesting" in self.metadata.columns: 838 interesting = self.metadata["ocular_interesting"] 839 # Split the metadata into interesting and regular 840 # Interesting will only have dropped columns, with no internal changes 841 interesting = pd.concat( 842 [self.features[interesting], self.metadata[interesting]], axis=1 843 ).reset_index(drop=True) 844 # Data will get some columns changed; reset_index will copy it 845 data = ( 846 pd.concat( 847 [self.features[~interesting], self.metadata[~interesting]], axis=1 848 ) 849 .reset_index(drop=True) 850 .drop(columns=["ocular_interesting"]) 851 ) 852 853 # Drop particular columns for "interesting" 854 interesting = interesting.drop( 855 [ 856 "clust", 857 "hcpc", 858 "frame_id", 859 "cell_id", 860 "unique_id", 861 "ocular_interesting", 862 ], 863 axis=1, 864 errors="ignore", 865 ) 866 # Save both .csv and .rds 867 interesting.to_csv( 868 os.path.join(output_path, "ocular_interesting.csv"), index=False 869 ) 870 pyreadr.write_rds( 871 os.path.join(output_path, "ocular_interesting.rds"), interesting 872 ) 873 else: 874 # Get all data and reset_index (will copy it) 875 data = pd.concat([self.features, self.metadata], axis=1).reset_index( 876 drop=True 877 ) 878 879 # Split based on cluster number to conform to *-final[1-4].rds 880 n_clusters = max(data["clust"]) + 1 881 split_idx = [round(i * n_clusters / 4) for i in range(5)] 882 for i in range(4): 883 subset = (split_idx[i] <= data["clust"]) & ( 884 data["clust"] < split_idx[i + 1] 885 ) 886 subset = data[subset].reset_index(drop=True) 887 subset["hcpc"] = i + 1 888 pyreadr.write_rds( 889 os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset 890 ) 891 892 # Create new example cell strings 893 data["example_cell_id"] = ( 894 data["slide_id"] 895 + " " 896 + data["frame_id"].astype(str) 897 + " " 898 + data["cell_id"].astype(str) 899 + " " 900 + data["cellx"].astype(int).astype(str) 901 + " " 902 + data["celly"].astype(int).astype(str) 903 ) 904 # Find averagable data columns 905 if "cellcluster_id" in data.columns: 906 avg_cols = data.columns[: data.columns.get_loc("cellcluster_id")].tolist() 907 else: 908 avg_cols = data.columns[: data.columns.get_loc("slide_id")].tolist() 909 # Group by cluster and average 910 data = data.groupby("clust").agg( 911 **{col: (col, "mean") for col in avg_cols}, 912 count=("clust", "size"), # count rows in each cluster 913 example_cells=("example_cell_id", lambda x: ",".join(x)), 914 hcpc=("hcpc", lambda x: x.iloc[0]), 915 ) 916 data = data.reset_index() # Do NOT drop, index is "clust" 917 # Create new columns 918 metadata = pd.DataFrame( 919 { 920 "count": data["count"], 921 "example_cells": data["example_cells"], 922 "clust": data["clust"].astype(int), 923 "hcpc": data["hcpc"].astype(int), 924 "id": data["clust"].astype(int).astype(str), 925 "cccluster": "0", # Dummy value 926 "ccdistance": 0.0, # Dummy value 927 "rownum": list(range(len(data))), 928 "framegroup": 0, # Dummy value 929 } 930 ) 931 data = pd.concat([data[avg_cols], metadata], axis=1) 932 # Save the data 933 data.to_csv(os.path.join(output_path, f"{file_stub}.csv"), index=False) 934 pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data)
class EventArray
A class that holds a large number of events' data, making it easy to analyze and manipulate many events at once. It is a columnar counterpart to the Event class: rather than a list of objects, it stores the events' info, metadata, and features as three aligned DataFrames.
def __init__(self, info: pd.DataFrame = None, metadata: pd.DataFrame = None, features: pd.DataFrame = None)
Validates that info has exactly the six INFO_COLUMNS, and that metadata and features, when given, have the same number of rows as info.
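As a quick orientation, here is a minimal sketch of constructing an EventArray directly; the slide ID and coordinate values below are hypothetical.

import pandas as pd
from csi_images.csi_events import EventArray

# Two hypothetical events on one slide; all six info columns are required
info = pd.DataFrame(
    {
        "slide_id": ["SLIDE001", "SLIDE001"],
        "tile": [4, 7],
        "roi": [0, 0],
        "x": [120, 345],
        "y": [88, 201],
        "size": [12, 12],
    }
)
events = EventArray(info=info)
assert len(events) == 2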
def get_sort_order(self, by: str | list[str], ascending: bool | list[bool] = True)
Get the sort order for the EventArray by one or more columns in the info, metadata, or features DataFrames.
Parameters
- by: name of the column(s) to sort by.
- ascending: whether to sort in ascending order; may be a list with one entry per column in by.
Returns
the order of the indices to sort by.
def sort(self, by: str | list[str], ascending: bool | list[bool] = True) -> typing.Self
Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
Parameters
- by: name of the column(s) to sort by.
- ascending: whether to sort in ascending order; may be a list with one entry per column in by.
Returns
a new, sorted EventArray.
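For instance, assuming the EventArray's features include a dapi_intensity column (a hypothetical name), a descending sort looks like:

# Sort by a hypothetical feature column, brightest events first
by_intensity = events.sort("dapi_intensity", ascending=False)
# Multiple keys with per-key order work the same way
by_tile_then_intensity = events.sort(["tile", "dapi_intensity"], ascending=[True, False])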
def get(self, column_names: int | str | list[int] | list[str]) -> pd.DataFrame
Get a DataFrame with the specified columns from the EventArray, by value.
Parameters
- column_names: the names of the columns to get.
Returns
a DataFrame with the specified columns.
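Columns are looked up in info first, then metadata, then features, so mixed requests work in one call; dapi_intensity is again a hypothetical feature column.

# Returns copies, so mutating the result does not touch the EventArray
positions = events.get(["x", "y"])
mixed = events.get(["slide_id", "dapi_intensity"])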
def rows(self, rows) -> typing.Self
Get a subset of the EventArray rows based on a boolean or integer index, by value.
Parameters
- rows: the indices to get as a 1D boolean/integer list/array/series
Returns
a new EventArray with the subset of events.
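rows() pairs naturally with get() for filtering; a short sketch using the events array from the constructor example above:

# Boolean mask: keep only events larger than 10 pixels
big_events = events.rows(events.get("size")["size"] > 10)
# Integer indices work as well
first_two = events.rows([0, 1])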
def copy(self) -> typing.Self
Create a deep copy of the EventArray.
Returns
a deep copy of the EventArray.
def add_metadata(self, new_metadata: pd.DataFrame) -> None
Add metadata to the EventArray. Removes the need to check if metadata is None. Overwrites any existing metadata with the same column names as the new metadata.
Parameters
- new_metadata: the metadata to add.
def add_features(self, new_features: pd.DataFrame) -> None
Add features to the EventArray. Removes the need to check if features is None. Overwrites any existing features with the same column names as the new features.
Parameters
- new_features: the features to add.
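A sketch of attaching per-event columns to the two-event array from above; the column names are hypothetical, and row counts must match len(events):

events.add_features(pd.DataFrame({"classifier_score": [0.91, 0.13]}))
# A second call with the same column name overwrites rather than duplicates
events.add_metadata(pd.DataFrame({"reviewed": [False, False]}))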
@classmethod
def merge(cls, events: list[typing.Self]) -> typing.Self
Combine EventArrays in a list into a single EventArray.
Parameters
- events: the EventArrays to merge.
Returns
a new, combined EventArray.
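Because merging concatenates row-wise, EventArrays from different tiles or slides can be pooled; a sketch reusing the events array from above:

combined = EventArray.merge([events, events.copy()])
assert len(combined) == 2 * len(events)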
@classmethod
def from_events(cls, events: list[Event]) -> typing.Self
Create an EventArray from a list of Event objects.
Parameters
- events: the events to convert.
Returns
a new EventArray with the events' data.
def to_events(self, scans: list[Scan], ignore_missing_scans=True, ignore_metadata=False, ignore_features=False) -> list[Event]
Get the events in the EventArray as a list of events.
Parameters
- scans: the scans that the events belong to. Pass an empty list if you don't care about scan metadata.
- ignore_missing_scans: whether to create placeholder scans for events whose scan is not provided.
- ignore_metadata: whether to skip attaching metadata to the events.
- ignore_features: whether to skip attaching features to the events.
Returns
a list of Event objects.
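A sketch of a round trip between the two representations, reusing the events array from above; an empty scan list forces placeholder scans via ignore_missing_scans:

# Expand to per-event objects, then collapse back to columnar form
event_list = events.to_events([], ignore_missing_scans=True)
round_trip = EventArray.from_events(event_list)
assert round_trip.info.equals(events.info)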
def to_dataframe(self) -> pd.DataFrame
Convert all the data in the EventArray to a single DataFrame.
Returns
a DataFrame with all the data in the EventArray.
@classmethod
def from_dataframe(cls, df) -> typing.Self
Create an EventArray from a single DataFrame with prefixed column names, as produced by to_dataframe().
Returns
an EventArray containing the DataFrame's data.
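The prefixes make the flat form reversible; a sketch reusing the events array from above:

flat = events.to_dataframe()
# Columns: info_slide_id, info_tile, info_roi, info_x, info_y, info_size, ...
restored = EventArray.from_dataframe(flat)
assert restored == events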
def save_csv(self, output_path: str) -> bool
Save the events to a CSV file, including metadata and features.
Parameters
- output_path: the path of the CSV file to save.
Returns
True if the file exists after saving.
@classmethod
def load_csv(cls, input_path: str) -> typing.Self
Load the events from a CSV file, including metadata and features.
Parameters
- input_path: the path of the CSV file to load.
Returns
a new EventArray with the loaded events.
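A sketch of a CSV round trip; the file name is hypothetical. Simple dtypes (strings, integers, floats, booleans) should survive the trip intact:

events.save_csv("slide001_events.csv")
reloaded = EventArray.load_csv("slide001_events.csv")
assert reloaded == events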
def save_hdf5(self, output_path: str) -> bool
Save the events to an HDF5 file, including metadata and features. Uses the pandas-provided HDF5 functions for ease and external compatibility, though these files are slightly harder to view in HDFView or similar.
Parameters
- output_path: the path of the HDF5 file to save.
Returns
True if the file exists after saving.
@classmethod
def load_hdf5(cls, input_path: str) -> typing.Self
Load the events from an HDF5 file, including metadata and features.
Parameters
- input_path: the path of the HDF5 file to load.
Returns
a new EventArray with the loaded events.
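The HDF5 form keeps info, metadata, and features under separate store keys; a sketch with a hypothetical file name (pandas' HDF5 support requires the optional PyTables package):

events.save_hdf5("slide001_events.h5")
reloaded = EventArray.load_hdf5("slide001_events.h5")
assert reloaded == events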
@classmethod
def load_ocular(cls, input_path: str, event_type="cells", cell_data_files=("rc-final1.rds", "rc-final2.rds", "rc-final3.rds", "rc-final4.rds", "ocular_interesting.rds"), others_data_files=("others-final1.rds", "others-final2.rds", "others-final3.rds", "others-final4.rds"), atlas_data_files=("ocular_interesting.rds", "ocular_not_interesting.rds"), merge_event_data_with_stats=True, filter_and_generate_morphs=True, drop_common_events=True, log=None) -> typing.Self
Load events from OCULAR output files (.rds), including metadata and features.
Parameters
- input_path: path to an OCULAR output directory or a single .rds file.
- event_type: "cells" or "others"; selects which data files to load.
- cell_data_files: file names to load when event_type is "cells".
- others_data_files: file names to load when event_type is "others".
- atlas_data_files: file names that may contain catalogued common cells.
- merge_event_data_with_stats: whether to merge event data with statistics.
- filter_and_generate_morphs: whether to filter events and generate morphs.
- drop_common_events: whether to drop events classified as common cells.
- log: optional logger for progress and warning messages.
Returns
a new EventArray with the loaded events.
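A sketch of loading an OCULAR results directory; the path is hypothetical, and any standard logging logger can be passed as log. When the directory is named "ocular", the loader infers the slide ID for "others" events from the parent folder's name:

import logging

log = logging.getLogger("ocular_loading")
cells = EventArray.load_ocular("/path/to/slide/ocular", event_type="cells", log=log)
others = EventArray.load_ocular("/path/to/slide/ocular", event_type="others", log=log)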
def save_ocular(self, output_path: str, event_type: str = "cells") -> bool
Save the events to OCULAR files. Relies on the DataFrames originating from OCULAR files (same columns; duplicate metadata/info).
Parameters
- output_path: the directory to save the OCULAR files in.
- event_type: "cells" or "others"; selects the output file names.
Returns
True if the files exist after saving.
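A sketch of writing OCULAR-compatible outputs back out; it assumes the EventArray was loaded via load_ocular, so the expected columns (clust, hcpc, frame_id, cell_id, and so on) are present, and the output directory is hypothetical:

import os

out_dir = "/path/to/slide/ocular"
os.makedirs(out_dir, exist_ok=True)
# Writes rc-final1.rds through rc-final4.rds, plus rc-final.csv and rc-final.rds
cells.save_ocular(out_dir, event_type="cells")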