csi_images.csi_events

Contains the Event class, which represents a single event in a scan, and the EventArray class, which holds many events' data at once. The Event class optionally holds metadata and features. Lists of events with similar metadata or features can be combined into DataFrames for analysis.
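
For example, a list of Event objects can be flattened into DataFrames through the EventArray class. A minimal sketch; events stands in for an existing list of Event objects:

    from csi_images.csi_events import EventArray

    array = EventArray.from_events(events)  # events: list[Event]
    df = array.to_dataframe()  # one row per event; columns prefixed info_/metadata_/features_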

The Event class holds the position of the event in the frame, which can be converted to positions in the scanner or slide coordinate systems. See the csi_images.csi_scans documentation page for more information on the coordinate systems.
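
A minimal sketch of building an event and converting its position; Scan.make_placeholder is used here for brevity, and a scan loaded from real scan metadata is needed for meaningful coordinates:

    from csi_images.csi_scans import Scan
    from csi_images.csi_tiles import Tile
    from csi_images.csi_events import Event

    # Placeholder scan for illustration; load a real scan for true geometry
    scan = Scan.make_placeholder("EXAMPLE_SLIDE", 0, 0)
    tile = Tile(scan, 0, 0)
    event = Event(scan, tile, x=512, y=384)

    print(event)  # EXAMPLE_SLIDE-0-512-384
    # With a real scan, frame coordinates convert to physical positions:
    x_um, y_um = event.get_scan_position()   # scanner frame, micrometers
    x_um, y_um = event.get_slide_position()  # slide frame, micrometers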

  1"""
  2Contains the Event and EventArray classes. Event represents a single event in
  3a scan and optionally holds metadata and features; EventArray holds many
  4events' data as DataFrames for analysis.
  5
  6The Event class holds the position of the event in the frame, which can be
  7converted to positions in the scanner or slide coordinate systems. See the
  8csi_images.csi_scans documentation page for more information.
  9"""
 10
 11import os
 12import math
 13import typing
 14
 15import numpy as np
 16import pandas as pd
 17
 18import pyreadr
 19
 20from .csi_scans import Scan
 21from .csi_tiles import Tile
 22from .csi_frames import Frame
 23
 24
 25class Event:
 26    """
 27    A class that represents a single event in a scan, making it easy to evaluate
 28    singular events. Required metadata is exposed as attributes, and optional
 29    metadata and features are stored as DataFrames.
 30    """
 31
 32    SCAN_TO_SLIDE_TRANSFORM = {
 33        # Axioscan zero is in the top-right corner instead of top-left
 34        Scan.Type.AXIOSCAN7: np.array(
 35            [
 36                [1, 0, 75000],
 37                [0, 1, 0],
 38                [0, 0, 1],
 39            ]
 40        ),
 41        # BZScanner coordinates are a special kind of messed up:
 42        # - The slide is upside-down.
 43        # - The slide is oriented vertically, with the barcode at the bottom.
 44        # - Tiles are numbered from the top-right
 45        Scan.Type.BZSCANNER: np.array(
 46            [
 47                [0, -1, 75000],
 48                [-1, 0, 25000],
 49                [0, 0, 1],
 50            ]
 51        ),
 52    }
 53    """
 54    Homogeneous transformation matrices for converting between scanner and slide
 55    coordinates. The matrices are 3x3, with the final column representing the
 56    translation in micrometers (um). For more information, see 
 57    [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations).
 58    
 59    Transformations are nominal, and accuracy is not guaranteed; this is due to 
 60    imperfections in slides and alignment in the scanners. Units are in micrometers.
 61    """
 62
 63    def __init__(
 64        self,
 65        scan: Scan,
 66        tile: Tile,
 67        x: int,
 68        y: int,
 69        size: int = 12,  # End-to-end size in pixels
 70        metadata: pd.Series = None,
 71        features: pd.Series = None,
 72    ):
 73        self.scan = scan
 74        self.tile = tile
 75        self.x = x
 76        self.y = y
 77        self.size = size
 78        self.metadata = metadata
 79        self.features = features
 80
 81    def __repr__(self) -> str:
 82        return f"{self.scan.slide_id}-{self.tile.n}-{self.x}-{self.y}"
 83
 84    def __eq__(self, other) -> bool:
 85        return self.__repr__() == other.__repr__()
 86
 87    def __lt__(self, other):
 88        return self.__repr__() < other.__repr__()
 89
 90    def get_scan_position(self) -> tuple[float, float]:
 91        """
 92        Get the position of the event in the scanner's coordinate frame.
 93        :return: the scan position of the event in micrometers (um).
 94        """
 95        # Get overall pixel position
 96        pixel_x = self.x + (self.scan.tile_width_px * self.tile.x)
 97        pixel_y = self.y + (self.scan.tile_height_px * self.tile.y)
 98        # Convert to micrometers
 99        x_um = pixel_x * self.scan.pixel_size_um
100        y_um = pixel_y * self.scan.pixel_size_um
101        # Add the scan's origin in the scanner frame
102        x_um += self.scan.roi[self.tile.n_roi].origin_x_um
103        y_um += self.scan.roi[self.tile.n_roi].origin_y_um
104        return x_um, y_um
105
106    def get_slide_position(self) -> tuple[float, float]:
107        """
108        Get the slide position of the event in micrometers (um).
109        :return: the slide position of the event.
110        """
111        # Turn scan_position into a 3x1 vector
112        scan_position = self.get_scan_position()
113        scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]])
114
115        # Multiply by the appropriate homogeneous matrix
116        if self.scan.scanner_id.startswith(self.scan.Type.AXIOSCAN7.value):
117            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.AXIOSCAN7]
118        elif self.scan.scanner_id.startswith(self.scan.Type.BZSCANNER.value):
119            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.BZSCANNER]
120        else:
121            raise ValueError(f"Scanner type {self.scan.scanner_id} not supported.")
122        slide_position = np.matmul(transform, scan_position)
123        return float(slide_position[0][0]), float(slide_position[1][0])
124
125    def crop_images(
126        self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True
127    ) -> list[np.ndarray]:
128        """
129        Crop the event out of the provided frame images. Does not need to read
130        from file, so it is very quick when cropping multiple events from the
131        same tile.
132        Use this if you're interested in many events.
133        :param images: the frame images.
134        :param crop_size: the square size of the image crop to get for this event.
135        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
136        :return: crop_size x crop_size crops of the event in the provided frames. If
137        the event is too close to the edge, the crop is zero-padded and off-center.
138        """
139        # Convert a crop size in micrometers to pixels
140        if not in_pixels:
141            crop_size = round(crop_size / self.scan.pixel_size_um)
142        # Find the crop bounds
143        bounds = [
144            self.x - crop_size // 2,
145            self.y - crop_size // 2,
146            self.x + math.ceil(crop_size / 2),
147            self.y + math.ceil(crop_size / 2),
148        ]
149        # Determine how much the bounds violate the image size
150        displacements = [
151            max(0, -bounds[0]),
152            max(0, -bounds[1]),
153            max(0, bounds[2] - images[0].shape[1]),
154            max(0, bounds[3] - images[0].shape[0]),
155        ]
156        # Cap off the bounds
157        bounds = [
158            max(0, bounds[0]),
159            max(0, bounds[1]),
160            min(images[0].shape[1], bounds[2]),
161            min(images[0].shape[0], bounds[3]),
162        ]
163
164        # Crop the images
165        cropped_images = []
166        for image in images:
167            # Create a blank image of the right size
168            cropped_image = np.zeros((crop_size, crop_size), dtype=image.dtype)
169
170            # Insert the cropped image into the blank image, leaving a black buffer
171            # around the edges if the crop would go beyond the original image bounds
172            cropped_image[
173                displacements[1] : crop_size - displacements[3],
174                displacements[0] : crop_size - displacements[2],
175            ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]]
176            cropped_images.append(cropped_image)
177        return cropped_images
178
179    def extract_images(
180        self, crop_size: int = 100, in_pixels: bool = True
181    ) -> list[np.ndarray]:
182        """
183        Extract the images from the scan and tile, reading from the file. Called
184        "extract" because it must read and extract the images from file, which is slow.
185        Use this if you're interested in only a few events, as it is inefficient when
186        reading multiple events from the same tile.
187        :param crop_size: the square size of the image crop to get for this event.
188        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
189        :return: a list of cropped images from the scan in the order of the channels.
190        """
191        frames = Frame.get_frames(self.tile)
192        images = [frame.get_image() for frame in frames]
193        return self.crop_images(images, crop_size, in_pixels)
194
195    @classmethod
196    def extract_images_for_list(
197        cls,
198        events: list[typing.Self],
199        crop_size: int | list[int] = None,
200        in_pixels: bool = True,
201    ) -> list[list[np.ndarray]]:
202        """
203        Get the images for a list of events, ensuring that there is no wasteful reading
204        of the same tile multiple times. This function is more efficient than calling
205        extract_images for each event.
206        TODO: test this function
207        :param events: the events to extract images for.
208        :param crop_size: the square size of the image crop to get for this event.
209                          Defaults to four times the size of the event.
210        :param in_pixels: whether the crop size is in pixels or micrometers.
211                          Defaults to pixels, and is ignored if crop_size is None.
212        :return: a list of lists of cropped images for each event.
213        """
214        if len(events) == 0:
215            return []
216
217        # Populate a crop size if none provided
218        if crop_size is None:
219            crop_size = [4 * event.size for event in events]
220            in_pixels = True
221        # Propagate a constant crop size
222        elif isinstance(crop_size, int):
223            crop_size = [crop_size] * len(events)
224
225        # Determine a tile-grouped visiting order; the original list is not modified
226        order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
227
228        # Allocate the list to size
229        images = [None] * len(events)
230        last_tile = None
231        frame_images = None  # Holds large numpy arrays, so expensive to compare
232        # Iterate through in sorted order
233        for i in order:
234            if last_tile != events[i].tile:
235                # Gather the frame images, preserving them for the next event
236                frames = Frame.get_frames(events[i].tile)
237                frame_images = [frame.get_image() for frame in frames]
238
239                last_tile = events[i].tile
240            # Use the frame images to crop the event images
241            # Store at index i so results keep the original event order
242            images[i] = events[i].crop_images(frame_images, crop_size[i], in_pixels)
243        return images
244
245
246class EventArray:
247    """
248    A class that holds a large number of events' data, making it easy to analyze and
249    manipulate many events at once; a DataFrame-backed counterpart to the Event class.
250    """
251
252    INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y", "size"]
253
254    def __init__(
255        self,
256        info: pd.DataFrame = None,
257        metadata: pd.DataFrame = None,
258        features: pd.DataFrame = None,
259    ):
260        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y", "size"
261        if info is not None and (
262            not all(
263                col in info.columns
264                for col in ["slide_id", "tile", "roi", "x", "y", "size"]
265            )
266            or len(info.columns) != 6
267        ):
268            raise ValueError(
269                "EventArray.info must have columns 'slide_id', 'tile', 'roi', 'x', 'y', 'size'"
270            )
271        # All DataFrames must all have the same number of rows
272        if metadata is not None and (info is None or len(info) != len(metadata)):
273            raise ValueError(
274                "If EventArray.metadata is not None, it should match rows with .info"
275            )
276        if features is not None and (info is None or len(info) != len(features)):
277            raise ValueError(
278                "If EventArray.features is not None, it should match rows with .info"
279            )
280        self.info = info
281        self.metadata = metadata
282        self.features = features
283
284    def __len__(self) -> int:
285        # Convenience method to get the number of events
286        if self.info is None:
287            return 0
288        else:
289            return len(self.info)
290
291    def __eq__(self, other):
292        is_equal = True
293        # Parse all possibilities for info
294        if isinstance(self.info, pd.DataFrame):
295            if isinstance(other.info, pd.DataFrame):
296                is_equal = self.info.equals(other.info)
297                if not is_equal:
298                    return False
299            else:
300                return False
301        elif self.info is None:
302            if other.info is not None:
303                return False
304
305        # Parse all possibilities for metadata
306        if isinstance(self.metadata, pd.DataFrame):
307            if isinstance(other.metadata, pd.DataFrame):
308                is_equal = self.metadata.equals(other.metadata)
309                if not is_equal:
310                    return False
311            else:
312                return False
313        elif self.metadata is None:
314            if other.metadata is not None:
315                return False
316
317        # Parse all possibilities for features
318        if isinstance(self.features, pd.DataFrame):
319            if isinstance(other.features, pd.DataFrame):
320                is_equal = self.features.equals(other.features)
321                if not is_equal:
322                    return False
323            else:
324                return False
325        elif self.features is None:
326            if other.features is not None:
327                return False
328
329        return is_equal
330
331    def get_sort_order(self, by: str | list[str], ascending: bool | list[bool] = True):
332        """
333        Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.
334        :param by: name of the column(s) to sort by.
335        :param ascending: whether to sort in ascending order; may be a list, one per column in by.
336        :return: the order of the indices to sort by.
337        """
338        columns = self.get(by)
339        return columns.sort_values(by=by, ascending=ascending).index
340
341    def sort(
342        self, by: str | list[str], ascending: bool | list[bool] = True
343    ) -> typing.Self:
344        """
345        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
346        :param by: name of the column(s) to sort by.
347        :param ascending: whether to sort in ascending order; may be a list, one per column in by.
348        :return: a new, sorted EventArray.
349        """
350        order = self.get_sort_order(by, ascending)
351        info = self.info.loc[order].reset_index(drop=True)
352        if self.metadata is not None:
353            metadata = self.metadata.loc[order].reset_index(drop=True)
354        else:
355            metadata = None
356        if self.features is not None:
357            features = self.features.loc[order].reset_index(drop=True)
358        else:
359            features = None
360        return EventArray(info, metadata, features)
361
362    def get(self, column_names: int | str | list[int] | list[str]) -> pd.DataFrame:
363        """
364        Get a DataFrame with the specified columns from the EventArray, by value.
365        :param column_names: the names of the columns to get.
366        :return: a DataFrame with the specified columns.
367        """
368        if isinstance(column_names, int) or isinstance(column_names, str):
369            column_names = [column_names]
370        columns = []
371        for column_name in column_names:
372            if column_name in self.info.columns:
373                columns.append(self.info[column_name])
374            elif self.metadata is not None and column_name in self.metadata.columns:
375                columns.append(self.metadata[column_name])
376            elif self.features is not None and column_name in self.features.columns:
377                columns.append(self.features[column_name])
378            else:
379                raise ValueError(f"Column {column_name} not found in EventArray")
380        return pd.concat(columns, axis=1)
381
382    def rows(self, rows) -> typing.Self:
383        """
384        Get a subset of the EventArray rows based on a boolean or integer index, by value.
385        :param rows: the indices to get as a 1D boolean/integer list/array/series
386        :return: a new EventArray with the subset of events.
387        """
388        info = self.info.loc[rows].reset_index(drop=True)
389        if self.metadata is not None:
390            metadata = self.metadata.loc[rows].reset_index(drop=True)
391        else:
392            metadata = None
393        if self.features is not None:
394            features = self.features.loc[rows].reset_index(drop=True)
395        else:
396            features = None
397        return EventArray(info, metadata, features)
398
399    def copy(self) -> typing.Self:
400        """
401        Create a deep copy of the EventArray.
402        :return: a deep copy of the EventArray.
403        """
404        return EventArray(
405            info=self.info.copy(),
406            metadata=None if self.metadata is None else self.metadata.copy(),
407            features=None if self.features is None else self.features.copy(),
408        )
409
410    def add_metadata(self, new_metadata: pd.DataFrame) -> None:
411        """
412        Add metadata to the EventArray. Removes the need to check if metadata is None.
413        Overwrites any existing metadata with the same column names as the new metadata.
414        :param new_metadata: the metadata to add.
415        """
416        if len(self) != len(new_metadata):
417            raise ValueError("New metadata must match length of existing info")
418
419        if self.metadata is None:
420            self.metadata = new_metadata
421        else:
422            self.metadata[new_metadata.columns] = new_metadata
423
424    def add_features(self, new_features: pd.DataFrame) -> None:
425        """
426        Add features to the EventArray. Removes the need to check if features is None.
427        Overwrites any existing features with the same column names as the new features.
428        :param new_features: the features to add.
429        """
430        if len(self) != len(new_features):
431            raise ValueError("New features must match length of existing info")
432
433        if self.features is None:
434            self.features = new_features
435        else:
436            self.features[new_features.columns] = new_features
437
438    @classmethod
439    def merge(cls, events: list[typing.Self]) -> typing.Self:
440        """
441        Combine EventArrays in a list into a single EventArray.
442        :param events: the EventArrays to combine.
443        """
444        all_info = []
445        all_metadata = []
446        all_features = []
447        for event_array in events:
448            # Skip empty EventArrays
449            if event_array.info is not None:
450                all_info.append(event_array.info)
451            if event_array.metadata is not None:
452                all_metadata.append(event_array.metadata)
453            if event_array.features is not None:
454                all_features.append(event_array.features)
455        if len(all_info) == 0:
456            return EventArray()
457        else:
458            all_info = pd.concat(all_info, ignore_index=True)
459        if len(all_metadata) == 0:
460            all_metadata = None
461        else:
462            all_metadata = pd.concat(all_metadata, ignore_index=True)
463        if len(all_features) == 0:
464            all_features = None
465        else:
466            all_features = pd.concat(all_features, ignore_index=True)
467
468        return EventArray(all_info, all_metadata, all_features)
469
470    @classmethod
471    def from_events(cls, events: list[Event]) -> typing.Self:
472        """
473        Create an EventArray from a list of Event objects.
474        :param events: the events to convert into an EventArray.
475        """
476        # Return an empty array if we were passed nothing
477        if events is None or len(events) == 0:
478            return EventArray()
479        # Otherwise, grab the info
480        info = pd.DataFrame(
481            {
482                "slide_id": [event.scan.slide_id for event in events],
483                "tile": [event.tile.n for event in events],
484                "roi": [event.tile.n_roi for event in events],
485                "x": [event.x for event in events],
486                "y": [event.y for event in events],
487                "size": [event.size for event in events],
488            }
489        )
490        metadata_list = [event.metadata for event in events]
491        # Iterate through and ensure that all metadata is the same shape
492        for metadata in metadata_list:
493            if type(metadata) != type(metadata_list[0]):
494                raise ValueError("All metadata must be the same type.")
495            if metadata is not None and metadata.shape != metadata_list[0].shape:
496                raise ValueError("All metadata must be the same shape.")
497        if metadata_list[0] is None:
498            metadata = None
499        else:
500            metadata = pd.DataFrame(metadata_list)
501        features_list = [event.features for event in events]
502        # Iterate through and ensure that all features are the same shape
503        for features in features_list:
504            if type(features) != type(features_list[0]):
505                raise ValueError("All features must be the same type.")
506            if features is not None and features.shape != features_list[0].shape:
507                raise ValueError("All features must be the same shape.")
508        if features_list[0] is None:
509            features = None
510        else:
511            features = pd.DataFrame(features_list)
512        return EventArray(info=info, metadata=metadata, features=features)
513
514    def to_events(
515        self,
516        scans: list[Scan],
517        ignore_missing_scans=True,
518        ignore_metadata=False,
519        ignore_features=False,
520    ) -> list[Event]:
521        """
522        Get the events in the EventArray as a list of events.
523        :param scans: the scans that the events belong to. Pass an empty list if you
524                      don't care about scan metadata.
525        :param ignore_missing_scans: whether to use placeholder scans when missing; if False, raises.
526        :param ignore_metadata: whether to exclude metadata from the events.
527        :param ignore_features: whether to exclude features from the events.
528        :return: the events as a list of Event objects.
529        """
530        events = []
531        for i in range(len(self.info)):
532            # Determine the associated scan
533            scan = None
534            for s in scans:
535                if s.slide_id == self.info["slide_id"][i]:
536                    scan = s
537                    break
538            if scan is None:
539                if ignore_missing_scans:
540                    # Create a placeholder scan if the scan is missing
541                    scan = Scan.make_placeholder(
542                        self.info["slide_id"][i],
543                        self.info["tile"][i],
544                        self.info["roi"][i],
545                    )
546                else:
547                    raise ValueError(
548                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
549                    )
550            # Add to the list
551            events.append(
552                Event(
553                    scan,
554                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
555                    self.info["x"][i],
556                    self.info["y"][i],
557                    size=self.info["size"][i],
558                    metadata=None if ignore_metadata or self.metadata is None else self.metadata.loc[i],
559                    features=None if ignore_features or self.features is None else self.features.loc[i],
560                )
561            )
562        return events
563
564    def to_dataframe(self) -> pd.DataFrame:
565        """
566        Convert all the data in the EventArray to a single DataFrame.
567        :return: a DataFrame with all the data in the EventArray.
568        """
569        # Make a copy of the info DataFrame and prepend "info_" to the column names
570        output = self.info.copy()
571        output.columns = [f"info_{col}" for col in output.columns]
572        # Combine with the metadata and prepend "metadata_" to the column names
573        if self.metadata is not None:
574            metadata = self.metadata.copy()
575            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
576            output = pd.concat([output, metadata], axis=1)
577        # Combine with the features and prepend "features_" to the column names
578        if self.features is not None:
579            features = self.features.copy()
580            features.columns = [f"features_{col}" for col in features.columns]
581            output = pd.concat([output, features], axis=1)
582        return output
583
584    @classmethod
585    def from_dataframe(cls, df) -> typing.Self:
586        """
587        Create an EventArray from a DataFrame in the to_dataframe() format.
588        :return: an EventArray with info, metadata, and features split back out.
589        """
590        # Split the columns into info, metadata, and features and strip prefix
591        info = df[[col for col in df.columns if col.startswith("info_")]].copy()
592        info.columns = [col.replace("info_", "") for col in info.columns]
593        if info.size == 0:
594            info = None
595        metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy()
596        metadata.columns = [col.replace("metadata_", "") for col in metadata.columns]
597        if metadata.size == 0:
598            metadata = None
599        features = df[[col for col in df.columns if col.startswith("features_")]].copy()
600        features.columns = [col.replace("features_", "") for col in features.columns]
601        if features.size == 0:
602            features = None
603        return cls(info=info, metadata=metadata, features=features)
604
605    def save_csv(self, output_path: str) -> bool:
606        """
607        Save the events to a CSV file, including metadata and features.
608        :param output_path: the path of the CSV file to write.
609        :return: whether the file exists after writing.
610        """
611        self.to_dataframe().to_csv(output_path, index=False)
612        return os.path.exists(output_path)
613
614    @classmethod
615    def load_csv(cls, input_path: str) -> typing.Self:
616        """
617        Load the events from a CSV file, including metadata and features.
618        :param input_path: the path of the CSV file to read.
619        :return: an EventArray with the loaded events.
620        """
621        # Load the CSV file
622        df = pd.read_csv(input_path)
623        return cls.from_dataframe(df)
624
625    def save_hdf5(self, output_path: str) -> bool:
626        """
627        Save the events to an HDF5 file, including metadata and features.
628        Uses the pandas-provided HDF5 functions for ease, and external compatibility,
629        though these files are slightly harder to view in HDFView or similar.
630        :param output_path: the path of the HDF5 file to write.
631        :return: whether the file exists after writing.
632        """
633        # Open the output_path as an HDF5 file
634        with pd.HDFStore(output_path) as store:
635            # Store the dataframes in the HDF5 file
636            if self.info is not None:
637                store.put("info", self.info, index=False)
638            if self.metadata is not None:
639                store.put("metadata", self.metadata, index=False)
640            if self.features is not None:
641                store.put("features", self.features, index=False)
642        return os.path.exists(output_path)
643
644    @classmethod
645    def load_hdf5(cls, input_path: str) -> typing.Self:
646        """
647        Load the events from an HDF5 file, including metadata and features.
648        :param input_path: the path of the HDF5 file to read.
649        :return: an EventArray with the loaded events.
650        """
651        # Open the input_path as an HDF5 file
652        with pd.HDFStore(input_path) as store:
653            # Load the dataframes from the HDF5 file
654            info = store.get("info") if "info" in store else None
655            metadata = store.get("metadata") if "metadata" in store else None
656            features = store.get("features") if "features" in store else None
657        return cls(info=info, metadata=metadata, features=features)
658
659    @classmethod
660    def load_ocular(
661        cls,
662        input_path: str,
663        event_type="cells",
664        cell_data_files=(
665            "rc-final1.rds",
666            "rc-final2.rds",
667            "rc-final3.rds",
668            "rc-final4.rds",
669            "ocular_interesting.rds",
670        ),
671        others_data_files=(
672            "others-final1.rds",
673            "others-final2.rds",
674            "others-final3.rds",
675            "others-final4.rds",
676        ),
677        atlas_data_files=(
678            "ocular_interesting.rds",
679            "ocular_not_interesting.rds",
680        ),
681        merge_event_data_with_stats=True,
682        filter_and_generate_morphs=True,
683        drop_common_events=True,
684        log=None,
685    ) -> typing.Self:
686        """
687        Load events from OCULAR .rds output files, including metadata and features.
688        :param input_path: path to an OCULAR output directory or a single .rds file.
689        :param event_type: "cells" or "others" (DAPI- events).
690        :param cell_data_files: file names to load for the "cells" event type.
691        :param others_data_files: file names to load for the "others" event type.
692        :param atlas_data_files: file names holding atlas catalogue classifications.
693        :param merge_event_data_with_stats: currently unused; kept for compatibility.
694        :param filter_and_generate_morphs: currently unused; kept for compatibility.
695        :param drop_common_events: whether to drop events classified as common cells.
696        :param log: optional logger for warnings and debug messages.
697        :return: an EventArray with the loaded events.
698        """
699        # Check if the input path is a directory or a file
700        if os.path.isfile(input_path):
701            data_files = [os.path.basename(input_path)]
702            input_path = os.path.dirname(input_path)
703        elif event_type == "cells":
704            data_files = cell_data_files
705        elif event_type == "others":
706            data_files = others_data_files
707        else:
708            raise ValueError("Invalid event type.")
709
710        # Load the data from the OCULAR files
711        file_data = {}
712        for file in data_files:
713            file_path = os.path.join(input_path, file)
714            if not os.path.isfile(file_path):
715                if log is not None:
716                    log.warning(f"{file} not found in {input_path}")
717                continue
718            file_data[file] = pyreadr.read_r(file_path)
719            # Get the DataFrame associated with None (pyreadr dict quirk)
720            file_data[file] = file_data[file][None]
721            if len(file_data[file]) == 0:
722                # File gets dropped from the dict
723                file_data.pop(file)
724                if log is not None:
725                    log.warning(f"{file} has no cells")
726                continue
727
728            if log is not None:
729                log.debug(f"{file} has {len(file_data[file])} cells")
730
731            # Drop common cells if requested and in this file
732            if file in atlas_data_files and drop_common_events:
733                common_cell_indices = (
734                    file_data[file]["catalogue_classification"] == "common_cell"
735                )
736                if log is not None:
737                    log.debug(
738                        f"Dropping {int(common_cell_indices.sum())} "
739                        f"common cells from {file}"
740                    )
741                file_data[file] = file_data[file][~common_cell_indices]
742
743            if len(file_data[file]) == 0:
744                # File gets dropped from the dict
745                file_data.pop(file)
746                if log is not None:
747                    log.warning(f"{file} has no cells after dropping common cells")
748                continue
749
750            # Extract frame_id and cell_id
751            # DAPI- events already have frame_id cell_id outside rowname
752            if event_type == "cells":
753                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
754                # get frame_id cell_id from rownames column and split into two columns
755                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
756                if len(split_res.columns) != 2 and log is not None:
757                    log.warning(
758                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
759                    )
760                # then assign it back to the dataframe
761                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
762            # reset indexes since they can cause NaN values in concat
763            file_data[file] = file_data[file].reset_index(drop=True)
764
765        # Merge the data from all files
766        if len(file_data) == 0:
767            return EventArray()
768        elif len(file_data) == 1:
769            data = next(iter(file_data.values()))
770        else:
771            data = pd.concat(file_data.values())
772
773        if log is not None:
774            log.debug(f"Gathered a total of {len(data)} events")
775
776        # Others is missing the "slide_id". Insert it right before "frame_id" column
777        if event_type == "others" and "slide_id" not in data.columns:
778            if os.path.basename(input_path) == "ocular":
779                slide_id = os.path.basename(os.path.dirname(input_path))
780            else:
781                slide_id = "UNKNOWN"
782            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)
783
784        # Sort ascending by cell_id so the earliest entry is kept when de-duplicating
785        data = data.sort_values(by=["cell_id"], ascending=True)
786        # Filter out duplicates by x & y
787        data = data.assign(
788            unique_id=data["slide_id"]
789            + "_"
790            + data["frame_id"].astype(str)
791            + "_"
792            + data["cellx"].astype(int).astype(str)
793            + "_"
794            + data["celly"].astype(int).astype(str)
795        )
796        data = data.drop_duplicates(subset=["unique_id"], keep="first")
797        # Normal unique_id is with cell_id
798        data = data.assign(
799            unique_id=data["slide_id"]
800            + "_"
801            + data["frame_id"].astype(str)
802            + "_"
803            + data["cell_id"].astype(str)
804        )
805        data = data.reset_index(drop=True)
806        # All columns up to "slide_id" are features; drop the "slide_id"
807        features = data.loc[:, :"slide_id"].iloc[:, :-1]
808        data = data.loc[:, "slide_id":]
809        # Grab the info columns
810        info = data[["slide_id", "frame_id", "cellx", "celly"]]
811        info.columns = ["slide_id", "tile", "x", "y"]
812        info = info.assign(
813            roi=0,  # OCULAR only works on 1 ROI, as far as known
814            size=25,  # Static, for later montaging
815        )
816        info = info[["slide_id", "tile", "roi", "x", "y", "size"]]
817        # Metadata has duplicate columns for later convenience
818        metadata = data
819        return EventArray(info, metadata, features)
820
821    def save_ocular(self, output_path: str, event_type: str = "cells") -> bool:
822        """
823        Save the events to OCULAR .rds/.csv files. Relies on the data originating
824        from OCULAR files (same columns; duplicated metadata/info).
825        :param output_path: the directory to write the OCULAR files into.
826        :param event_type: "cells" or "others"; selects the output file names.
827        """
828        if event_type == "cells":
829            file_stub = "rc-final"
830        elif event_type == "others":
831            file_stub = "others-final"
832        else:
833            raise ValueError("Invalid event type. Must be cells or others.")
834
835        # Check for the "ocular_interesting" column
836        if event_type == "cells" and "ocular_interesting" in self.metadata.columns:
837            interesting = self.metadata["ocular_interesting"]
838            # Split the metadata into interesting and regular
839            # Interesting will only have dropped columns, with no internal changes
840            interesting = pd.concat(
841                [self.features[interesting], self.metadata[interesting]], axis=1
842            ).reset_index(drop=True)
843            # Data will get some columns changed; reset_index will copy it
844            data = (
845                pd.concat(
846                    [self.features[~interesting], self.metadata[~interesting]], axis=1
847                )
848                .reset_index(drop=True)
849                .drop(columns=["ocular_interesting"])
850            )
851
852            # Drop particular columns for "interesting"
853            interesting = interesting.drop(
854                [
855                    "clust",
856                    "hcpc",
857                    "frame_id",
858                    "cell_id",
859                    "unique_id",
860                    "ocular_interesting",
861                ],
862                axis=1,
863                errors="ignore",
864            )
865            # Save both .csv and .rds
866            interesting.to_csv(
867                os.path.join(output_path, "ocular_interesting.csv"), index=False
868            )
869            pyreadr.write_rds(
870                os.path.join(output_path, "ocular_interesting.rds"), interesting
871            )
872        else:
873            # Get all data and reset_index (will copy it)
874            data = pd.concat([self.features, self.metadata], axis=1).reset_index(
875                drop=True
876            )
877
878        # Split based on cluster number to conform to *-final[1-4].rds
879        n_clusters = max(data["clust"]) + 1
880        split_idx = [round(i * n_clusters / 4) for i in range(5)]
881        for i in range(4):
882            subset = (split_idx[i] <= data["clust"]) & (
883                data["clust"] < split_idx[i + 1]
884            )
885            subset = data[subset].reset_index(drop=True)
886            subset["hcpc"] = i + 1
887            pyreadr.write_rds(
888                os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
889            )
890
891        # Create new example cell strings
892        data["example_cell_id"] = (
893            data["slide_id"]
894            + " "
895            + data["frame_id"].astype(str)
896            + " "
897            + data["cell_id"].astype(str)
898            + " "
899            + data["cellx"].astype(int).astype(str)
900            + " "
901            + data["celly"].astype(int).astype(str)
902        )
903        # Find averagable data columns
904        if "cellcluster_id" in data.columns:
905            avg_cols = data.columns[: data.columns.get_loc("cellcluster_id")].tolist()
906        else:
907            avg_cols = data.columns[: data.columns.get_loc("slide_id")].tolist()
908        # Group by cluster and average
909        data = data.groupby("clust").agg(
910            **{col: (col, "mean") for col in avg_cols},
911            count=("clust", "size"),  # count rows in each cluster
912            example_cells=("example_cell_id", lambda x: ",".join(x)),
913            hcpc=("hcpc", lambda x: x.iloc[0]),
914        )
915        data = data.reset_index()  # Do NOT drop, index is "clust"
916        # Create new columns
917        metadata = pd.DataFrame(
918            {
919                "count": data["count"],
920                "example_cells": data["example_cells"],
921                "clust": data["clust"].astype(int),
922                "hcpc": data["hcpc"].astype(int),
923                "id": data["clust"].astype(int).astype(str),
924                "cccluster": "0",  # Dummy value
925                "ccdistance": 0.0,  # Dummy value
926                "rownum": list(range(len(data))),
927                "framegroup": 0,  # Dummy value
928            }
929        )
930        data = pd.concat([data[avg_cols], metadata], axis=1)
931        # Save the data
932        data.to_csv(os.path.join(output_path, f"{file_stub}.csv"), index=False)
933        pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data)
934        return os.path.exists(os.path.join(output_path, f"{file_stub}.rds"))
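
To close, a minimal end-to-end sketch of the EventArray workflow; the paths and directory layout are illustrative, not part of the module:

    from csi_images.csi_events import EventArray

    # Load OCULAR results, combine, sort, and persist them
    cells = EventArray.load_ocular("/path/to/slide/ocular", event_type="cells")
    others = EventArray.load_ocular("/path/to/slide/ocular", event_type="others")
    combined = EventArray.merge([cells, others])
    combined = combined.sort(by=["slide_id", "tile", "y", "x"])
    combined.save_csv("/path/to/output/events.csv")
    combined.save_hdf5("/path/to/output/events.h5")

    # Reload later from either format
    reloaded = EventArray.load_csv("/path/to/output/events.csv")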
class Event:
 26class Event:
 27    """
 28    A class that represents a single event in a scan, making it easy to evaluate
 29    singular events. Required metadata is exposed as attributes, and optional
 30    metadata and features are stored as DataFrames.
 31    """
 32
 33    SCAN_TO_SLIDE_TRANSFORM = {
 34        # Axioscan zero is in the top-right corner instead of top-left
 35        Scan.Type.AXIOSCAN7: np.array(
 36            [
 37                [1, 0, 75000],
 38                [0, 1, 0],
 39                [0, 0, 1],
 40            ]
 41        ),
 42        # BZScanner coordinates are a special kind of messed up:
 43        # - The slide is upside-down.
 44        # - The slide is oriented vertically, with the barcode at the bottom.
 45        # - Tiles are numbered from the top-right
 46        Scan.Type.BZSCANNER: np.array(
 47            [
 48                [0, -1, 75000],
 49                [-1, 0, 25000],
 50                [0, 0, 1],
 51            ]
 52        ),
 53    }
 54    """
 55    Homogeneous transformation matrices for converting between scanner and slide
 56    coordinates. The matrices are 3x3, with the final column representing the
 57    translation in micrometers (um). For more information, see 
 58    [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations).
 59    
 60    Transformations are nominal, and accuracy is not guaranteed; this is due to 
 61    imperfections in slides and alignment in the scanners. Units are in micrometers.
 62    """
 63
 64    def __init__(
 65        self,
 66        scan: Scan,
 67        tile: Tile,
 68        x: int,
 69        y: int,
 70        size: int = 12,  # End-to-end size in pixels
 71        metadata: pd.Series = None,
 72        features: pd.Series = None,
 73    ):
 74        self.scan = scan
 75        self.tile = tile
 76        self.x = x
 77        self.y = y
 78        self.size = size
 79        self.metadata = metadata
 80        self.features = features
 81
 82    def __repr__(self) -> str:
 83        return f"{self.scan.slide_id}-{self.tile.n}-{self.x}-{self.y}"
 84
 85    def __eq__(self, other) -> bool:
 86        return self.__repr__() == other.__repr__()
 87
 88    def __lt__(self, other):
 89        return self.__repr__() < other.__repr__()
 90
 91    def get_scan_position(self) -> tuple[float, float]:
 92        """
 93        Get the position of the event in the scanner's coordinate frame.
 94        :return: the scan position of the event in micrometers (um).
 95        """
 96        # Get overall pixel position
 97        pixel_x = self.x + (self.scan.tile_width_px * self.tile.x)
 98        pixel_y = self.y + (self.scan.tile_height_px * self.tile.y)
 99        # Convert to micrometers
100        x_um = pixel_x * self.scan.pixel_size_um
101        y_um = pixel_y * self.scan.pixel_size_um
102        # Add the scan's origin in the scanner frame
103        x_um += self.scan.roi[self.tile.n_roi].origin_x_um
104        y_um += self.scan.roi[self.tile.n_roi].origin_y_um
105        return x_um, y_um
106
107    def get_slide_position(self) -> tuple[float, float]:
108        """
109        Get the slide position of the event in micrometers (um).
110        :return: the slide position of the event.
111        """
112        # Turn scan_position into a 3x1 vector
113        scan_position = self.get_scan_position()
114        scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]])
115
116        # Multiply by the appropriate homogeneous matrix
117        if self.scan.scanner_id.startswith(self.scan.Type.AXIOSCAN7.value):
118            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.AXIOSCAN7]
119        elif self.scan.scanner_id.startswith(self.scan.Type.BZSCANNER.value):
120            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.BZSCANNER]
121        else:
122            raise ValueError(f"Scanner type {self.scan.scanner_id} not supported.")
123        slide_position = np.matmul(transform, scan_position)
124        return float(slide_position[0][0]), float(slide_position[1][0])
125
126    def crop_images(
127        self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True
128    ) -> list[np.ndarray]:
129        """
130        Get the event crops from the frame images. Called "get" because it does not
131        need to extract anything; it is very quick for extracting multiple events from
132        the same tile.
133        Use this if you're interested in many events.
134        :param images: the frame images.
135        :param crop_size: the square size of the image crop to get for this event.
136        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
137        :return: image_size x image_size crops of the event in the provided frames. If
138        the event is too close to the edge, the crop will be smaller and not centered.
139        """
140        # Convert a crop size in micrometers to pixels
141        if not in_pixels:
142            crop_size = round(crop_size / self.scan.pixel_size_um)
143        # Find the crop bounds
144        bounds = [
145            self.x - crop_size // 2,
146            self.y - crop_size // 2,
147            self.x + math.ceil(crop_size / 2),
148            self.y + math.ceil(crop_size / 2),
149        ]
150        # Determine how much the bounds violate the image size
151        displacements = [
152            max(0, -bounds[0]),
153            max(0, -bounds[1]),
154            max(0, bounds[2] - images[0].shape[1]),
155            max(0, bounds[3] - images[0].shape[0]),
156        ]
157        # Cap off the bounds
158        bounds = [
159            max(0, bounds[0]),
160            max(0, bounds[1]),
161            min(images[0].shape[1], bounds[2]),
162            min(images[0].shape[0], bounds[3]),
163        ]
164
165        # Crop the images
166        cropped_images = []
167        for image in images:
168            # Create a blank image of the right size
169            cropped_image = np.zeros((crop_size, crop_size), dtype=image.dtype)
170
171            # Insert the cropped image into the blank image, leaving a black buffer
172            # around the edges if the crop would go beyond the original image bounds
173            cropped_image[
174                displacements[1] : crop_size - displacements[3],
175                displacements[0] : crop_size - displacements[2],
176            ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]]
177            cropped_images.append(cropped_image)
178        return cropped_images
179
180    def extract_images(
181        self, crop_size: int = 100, in_pixels: bool = True
182    ) -> list[np.ndarray]:
183        """
184        Extract the images from the scan and tile, reading from the file. Called
185        "extract" because it must read and extract the images from file, which is slow.
186        Use this if you're interested in only a few events, as it is inefficient when
187        reading multiple events from the same tile.
188        :param crop_size: the square size of the image crop to get for this event.
189        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
190        :return: a list of cropped images from the scan in the order of the channels.
191        """
192        frames = Frame.get_frames(self.tile)
193        images = [frame.get_image() for frame in frames]
194        return self.crop_images(images, crop_size, in_pixels)
195
196    @classmethod
197    def extract_images_for_list(
198        cls,
199        events: list[typing.Self],
200        crop_size: int | list[int] = None,
201        in_pixels: bool = True,
202    ) -> list[list[np.ndarray]]:
203        """
204        Get the images for a list of events, ensuring that there is no wasteful reading
205        of the same tile multiple times. This function is more efficient than calling
206        extract_event_images for each event.
207        TODO: test this function
208        :param events: the events to extract images for.
209        :param crop_size: the square size of the image crop to get for this event.
210                          Defaults to four times the size of the event.
211        :param in_pixels: whether the crop size is in pixels or micrometers.
212                          Defaults to pixels, and is ignored if crop_size is None.
213        :return: a list of lists of cropped images for each event.
214        """
215        if len(events) == 0:
216            return []
217
218        # Populate a crop size if none provided
219        if crop_size is None:
220            crop_size = [4 * event.size for event in events]
221            in_pixels = True
222        # Propagate a constant crop size
223        elif isinstance(crop_size, int):
224            crop_size = [crop_size] * len(events)
225
226        # Sort the events by tile; use a shallow copy to avoid modifying the original
227        order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
228
229        # Allocate the list to size
230        images = [None] * len(events)
231        last_tile = None
232        frame_images = None  # Holds large numpy arrays, so expensive to compare
233        # Iterate through in sorted order
234        for i in order:
235            if last_tile != events[i].tile:
236                # Gather the frame images, preserving them for the next event
237                frames = Frame.get_frames(events[i].tile)
238                frame_images = [frame.get_image() for frame in frames]
239
240                last_tile = events[i].tile
241            # Use the frame images to crop the event images
242            # Preserve the original order using order[i]
243            images[i] = events[i].crop_images(frame_images, crop_size[i], in_pixels)
244        return images

A class that represents a single event in a scan, making it easy to evaluate singular events. Required metadata is exposed as attributes, and optional metadata and features are stored as DataFrames.

Event( scan: csi_images.csi_scans.Scan, tile: csi_images.csi_tiles.Tile, x: int, y: int, size: int = 12, metadata: pandas.core.series.Series = None, features: pandas.core.series.Series = None)
64    def __init__(
65        self,
66        scan: Scan,
67        tile: Tile,
68        x: int,
69        y: int,
70        size: int = 12,  # End-to-end size in pixels
71        metadata: pd.Series = None,
72        features: pd.Series = None,
73    ):
74        self.scan = scan
75        self.tile = tile
76        self.x = x
77        self.y = y
78        self.size = size
79        self.metadata = metadata
80        self.features = features
SCAN_TO_SLIDE_TRANSFORM = {<Type.AXIOSCAN7: 'axioscan7'>: array([[ 1, 0, 75000], [ 0, 1, 0], [ 0, 0, 1]]), <Type.BZSCANNER: 'bzscanner'>: array([[ 0, -1, 75000], [ -1, 0, 25000], [ 0, 0, 1]])}

Homogeneous transformation matrices for converting between scanner and slide coordinates. The matrices are 3x3, with the final column representing the translation in micrometers (um). For more information, see affine transformations.

Transformations are nominal, and accuracy is not guaranteed; this is due to imperfections in slides and alignment in the scanners. Units are in micrometers.

scan
tile
x
y
size
metadata
features
def get_scan_position(self) -> tuple[float, float]:
 91    def get_scan_position(self) -> tuple[float, float]:
 92        """
 93        Get the position of the event in the scanner's coordinate frame.
 94        :return: the scan position of the event in micrometers (um).
 95        """
 96        # Get overall pixel position
 97        pixel_x = self.x + (self.scan.tile_width_px * self.tile.x)
 98        pixel_y = self.y + (self.scan.tile_height_px * self.tile.y)
 99        # Convert to micrometers
100        x_um = pixel_x * self.scan.pixel_size_um
101        y_um = pixel_y * self.scan.pixel_size_um
102        # Add the scan's origin in the scanner frame
103        x_um += self.scan.roi[self.tile.n_roi].origin_x_um
104        y_um += self.scan.roi[self.tile.n_roi].origin_y_um
105        return x_um, y_um

Get the position of the event in the scanner's coordinate frame.

Returns

the scan position of the event in micrometers (um).

def get_slide_position(self) -> tuple[float, float]:
107    def get_slide_position(self) -> tuple[float, float]:
108        """
109        Get the slide position of the event in micrometers (um).
110        :return: the slide position of the event.
111        """
112        # Turn scan_position into a 3x1 vector
113        scan_position = self.get_scan_position()
114        scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]])
115
116        # Multiply by the appropriate homogeneous matrix
117        if self.scan.scanner_id.startswith(self.scan.Type.AXIOSCAN7.value):
118            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.AXIOSCAN7]
119        elif self.scan.scanner_id.startswith(self.scan.Type.BZSCANNER.value):
120            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.BZSCANNER]
121        else:
122            raise ValueError(f"Scanner type {self.scan.scanner_id} not supported.")
123        slide_position = np.matmul(transform, scan_position)
124        return float(slide_position[0][0]), float(slide_position[1][0])

Get the slide position of the event in micrometers (um).

Returns

the slide position of the event.

def crop_images( self, images: list[numpy.ndarray], crop_size: int = 100, in_pixels: bool = True) -> list[numpy.ndarray]:
126    def crop_images(
127        self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True
128    ) -> list[np.ndarray]:
129        """
130        Get the event crops from the frame images. Called "get" because it does not
131        need to extract anything; it is very quick for extracting multiple events from
132        the same tile.
133        Use this if you're interested in many events.
134        :param images: the frame images.
135        :param crop_size: the square size of the image crop to get for this event.
136        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
137        :return: image_size x image_size crops of the event in the provided frames. If
138        the event is too close to the edge, the crop will be smaller and not centered.
139        """
140        # Convert a crop size in micrometers to pixels
141        if not in_pixels:
142            crop_size = round(crop_size / self.scan.pixel_size_um)
143        # Find the crop bounds
144        bounds = [
145            self.x - crop_size // 2,
146            self.y - crop_size // 2,
147            self.x + math.ceil(crop_size / 2),
148            self.y + math.ceil(crop_size / 2),
149        ]
150        # Determine how much the bounds violate the image size
151        displacements = [
152            max(0, -bounds[0]),
153            max(0, -bounds[1]),
154            max(0, bounds[2] - images[0].shape[1]),
155            max(0, bounds[3] - images[0].shape[0]),
156        ]
157        # Cap off the bounds
158        bounds = [
159            max(0, bounds[0]),
160            max(0, bounds[1]),
161            min(images[0].shape[1], bounds[2]),
162            min(images[0].shape[0], bounds[3]),
163        ]
164
165        # Crop the images
166        cropped_images = []
167        for image in images:
168            # Create a blank image of the right size
169            cropped_image = np.zeros((crop_size, crop_size), dtype=image.dtype)
170
171            # Insert the cropped image into the blank image, leaving a black buffer
172            # around the edges if the crop would go beyond the original image bounds
173            cropped_image[
174                displacements[1] : crop_size - displacements[3],
175                displacements[0] : crop_size - displacements[2],
176            ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]]
177            cropped_images.append(cropped_image)
178        return cropped_images

Crop this event out of pre-loaded frame images. Unlike extract_images, it does not read from file, so it is very quick for extracting multiple events from the same tile. Use this if you're interested in many events.

Parameters
  • images: the frame images.
  • crop_size: the square size of the image crop to get for this event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns

image_size x image_size crops of the event in the provided frames. If the event is too close to the edge, the crop will be smaller and not centered.
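
A minimal sketch of the intended batch pattern, assuming event_a and event_b are Events on the same tile (hypothetical names):

    from csi_images.csi_frames import Frame

    # Read each channel's frame image once, then crop many events from them
    frames = Frame.get_frames(event_a.tile)
    images = [frame.get_image() for frame in frames]
    crops_a = event_a.crop_images(images, crop_size=50)                   # 50 px square
    crops_b = event_b.crop_images(images, crop_size=25, in_pixels=False)  # 25 um square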

def extract_images( self, crop_size: int = 100, in_pixels: bool = True) -> list[numpy.ndarray]:
180    def extract_images(
181        self, crop_size: int = 100, in_pixels: bool = True
182    ) -> list[np.ndarray]:
183        """
184        Extract the images from the scan and tile, reading from the file. Called
185        "extract" because it must read and extract the images from file, which is slow.
186        Use this if you're interested in only a few events, as it is inefficient when
187        reading multiple events from the same tile.
188        :param crop_size: the square size of the image crop to get for this event.
189        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
190        :return: a list of cropped images from the scan in the order of the channels.
191        """
192        frames = Frame.get_frames(self.tile)
193        images = [frame.get_image() for frame in frames]
194        return self.crop_images(images, crop_size, in_pixels)

Extract the images from the scan and tile, reading from the file. Called "extract" because it must read and extract the images from file, which is slow. Use this if you're interested in only a few events, as it is inefficient when reading multiple events from the same tile.

Parameters
  • crop_size: the square size of the image crop to get for this event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns

a list of cropped images from the scan in the order of the channels.
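
For a one-off event, this convenience wrapper reads the tile's frames from file itself, as in this sketch:

    crops = event.extract_images(crop_size=100)
    # crops is a list of numpy arrays, one per channel, in channel order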

@classmethod
def extract_images_for_list( cls, events: list[typing.Self], crop_size: int | list[int] = None, in_pixels: bool = True) -> list[list[numpy.ndarray]]:
196    @classmethod
197    def extract_images_for_list(
198        cls,
199        events: list[typing.Self],
200        crop_size: int | list[int] = None,
201        in_pixels: bool = True,
202    ) -> list[list[np.ndarray]]:
203        """
204        Get the images for a list of events, ensuring that there is no wasteful reading
205        of the same tile multiple times. This function is more efficient than calling
206        extract_images for each event.
207        TODO: test this function
208        :param events: the events to extract images for.
209        :param crop_size: the square size of the image crop to get for this event.
210                          Defaults to four times the size of the event.
211        :param in_pixels: whether the crop size is in pixels or micrometers.
212                          Defaults to pixels, and is ignored if crop_size is None.
213        :return: a list of lists of cropped images for each event.
214        """
215        if len(events) == 0:
216            return []
217
218        # Populate a crop size if none provided
219        if crop_size is None:
220            crop_size = [4 * event.size for event in events]
221            in_pixels = True
222        # Propagate a constant crop size
223        elif isinstance(crop_size, int):
224            crop_size = [crop_size] * len(events)
225
226        # Sort event indices by tile (via repr); the original list is untouched
227        order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
228
229        # Allocate the list to size
230        images = [None] * len(events)
231        last_tile = None
232        frame_images = None  # Holds large numpy arrays, so expensive to compare
233        # Iterate through in sorted order
234        for i in order:
235            if last_tile != events[i].tile:
236                # Gather the frame images, preserving them for the next event
237                frames = Frame.get_frames(events[i].tile)
238                frame_images = [frame.get_image() for frame in frames]
239
240                last_tile = events[i].tile
241            # Use the frame images to crop the event images
242            # Assigning into images[i] preserves the original event order
243            images[i] = events[i].crop_images(frame_images, crop_size[i], in_pixels)
244        return images

Get the images for a list of events, ensuring that there is no wasteful reading of the same tile multiple times. This function is more efficient than calling extract_images for each event. TODO: test this function

Parameters
  • events: the events to extract images for.
  • crop_size: the square size of the image crop to get for this event. Defaults to four times the size of the event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels, and is ignored if crop_size is None.
Returns

a list of lists of cropped images for each event.
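
A usage sketch, assuming events is a list of Event objects that may span several tiles:

    # Events are processed in tile-sorted order, so each tile is read only once;
    # the returned list still matches the original order of events
    crops_per_event = Event.extract_images_for_list(events, crop_size=100)
    # crops_per_event[i] is the list of per-channel crops for events[i]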

class EventArray:
247class EventArray:
248    """
249    A class that holds a large number of events' data, making it easy to analyze and
250    manipulate many events at once. A more separated version of the Event class.
251    """
252
253    INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y", "size"]
254
255    def __init__(
256        self,
257        info: pd.DataFrame = None,
258        metadata: pd.DataFrame = None,
259        features: pd.DataFrame = None,
260    ):
261        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y", "size"
262        if info is not None and (
263            not all(
264                col in info.columns
265                for col in ["slide_id", "tile", "roi", "x", "y", "size"]
266            )
267            or len(info.columns) != 6
268        ):
269            raise ValueError(
270                "EventArray.info must have columns 'slide_id', 'tile', 'roi', 'x', 'y', 'size'"
271            )
272        # All DataFrames must all have the same number of rows
273        if metadata is not None and (info is None or len(info) != len(metadata)):
274            raise ValueError(
275                "If EventArray.metadata is not None, it should match rows with .info"
276            )
277        if features is not None and (info is None or len(info) != len(features)):
278            raise ValueError(
279                "If EventArray.features is not None, it should match rows with .info"
280            )
281        self.info = info
282        self.metadata = metadata
283        self.features = features
284
285    def __len__(self) -> int:
286        # Convenience method to get the number of events
287        if self.info is None:
288            return 0
289        else:
290            return len(self.info)
291
292    def __eq__(self, other):
293        is_equal = True
294        # Parse all possibilities for info
295        if isinstance(self.info, pd.DataFrame):
296            if isinstance(other.info, pd.DataFrame):
297                is_equal = self.info.equals(other.info)
298                if not is_equal:
299                    return False
300            else:
301                return False
302        elif self.info is None:
303            if other.info is not None:
304                return False
305
306        # Parse all possibilities for metadata
307        if isinstance(self.metadata, pd.DataFrame):
308            if isinstance(other.metadata, pd.DataFrame):
309                is_equal = self.metadata.equals(other.metadata)
310                if not is_equal:
311                    return False
312            else:
313                return False
314        elif self.metadata is None:
315            if other.metadata is not None:
316                return False
317
318        # Parse all possibilities for features
319        if isinstance(self.features, pd.DataFrame):
320            if isinstance(other.features, pd.DataFrame):
321                is_equal = self.features.equals(other.features)
322                if not is_equal:
323                    return False
324            else:
325                return False
326        elif self.features is None:
327            if other.features is not None:
328                return False
329
330        return is_equal
331
332    def get_sort_order(self, by: str | list[str], ascending: bool | list[bool] = True):
333        """
334        Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.
335        :param by: name of the column(s) to sort by.
336        :param ascending: whether to sort in ascending order; can be a list to match by
337        :return: the order of the indices to sort by.
338        """
339        columns = self.get(by)
340        return columns.sort_values(by=by, ascending=ascending).index
341
342    def sort(
343        self, by: str | list[str], ascending: bool | list[bool] = True
344    ) -> typing.Self:
345        """
346        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
347        :param by: name of the column(s) to sort by.
348        :param ascending: whether to sort in ascending order; can be a list to match by
349        :return: a new, sorted EventArray.
350        """
351        order = self.get_sort_order(by, ascending)
352        info = self.info.loc[order].reset_index(drop=True)
353        if self.metadata is not None:
354            metadata = self.metadata.loc[order].reset_index(drop=True)
355        else:
356            metadata = None
357        if self.features is not None:
358            features = self.features.loc[order].reset_index(drop=True)
359        else:
360            features = None
361        return EventArray(info, metadata, features)
362
363    def get(self, column_names: int | str | list[int] | list[str]) -> pd.DataFrame:
364        """
365        Get a DataFrame with the specified columns from the EventArray, by value.
366        :param column_names: the names of the columns to get.
367        :return: a DataFrame with the specified columns.
368        """
369        if isinstance(column_names, int) or isinstance(column_names, str):
370            column_names = [column_names]
371        columns = []
372        for column_name in column_names:
373            if column_name in self.info.columns:
374                columns.append(self.info[column_name])
375            elif self.metadata is not None and column_name in self.metadata.columns:
376                columns.append(self.metadata[column_name])
377            elif self.features is not None and column_name in self.features.columns:
378                columns.append(self.features[column_name])
379            else:
380                raise ValueError(f"Column {column_name} not found in EventArray")
381        return pd.concat(columns, axis=1)
382
383    def rows(self, rows) -> typing.Self:
384        """
385        Get a subset of the EventArray rows based on a boolean or integer index, by value.
386        :param rows: the indices to get as a 1D boolean/integer list/array/series
387        :return: a new EventArray with the subset of events.
388        """
389        info = self.info.loc[rows].reset_index(drop=True)
390        if self.metadata is not None:
391            metadata = self.metadata.loc[rows].reset_index(drop=True)
392        else:
393            metadata = None
394        if self.features is not None:
395            features = self.features.loc[rows].reset_index(drop=True)
396        else:
397            features = None
398        return EventArray(info, metadata, features)
399
400    def copy(self) -> typing.Self:
401        """
402        Create a deep copy of the EventArray.
403        :return: a deep copy of the EventArray.
404        """
405        return EventArray(
406            info=self.info.copy(),
407            metadata=None if self.metadata is None else self.metadata.copy(),
408            features=None if self.features is None else self.features.copy(),
409        )
410
411    def add_metadata(self, new_metadata: pd.DataFrame) -> None:
412        """
413        Add metadata to the EventArray. Removes the need to check if metadata is None.
414        Overwrites any existing metadata with the same column names as the new metadata.
415        :param new_metadata: the metadata to add.
416        """
417        if len(self) != len(new_metadata):
418            raise ValueError("New metadata must match length of existing info")
419
420        if self.metadata is None:
421            self.metadata = new_metadata
422        else:
423            self.metadata[new_metadata.columns] = new_metadata
424
425    def add_features(self, new_features: pd.DataFrame) -> None:
426        """
427        Add features to the EventArray. Removes the need to check if features is None.
428        Overwrites any existing features with the same column names as the new features.
429        :param new_features: the features to add.
430        """
431        if len(self) != len(new_features):
432            raise ValueError("New features must match length of existing info")
433
434        if self.features is None:
435            self.features = new_features
436        else:
437            self.features[new_features.columns] = new_features
438
439    @classmethod
440    def merge(cls, events: list[typing.Self]) -> typing.Self:
441        """
442        Combine EventArrays in a list into a single EventArray.
443        :param events: the EventArrays to combine into one.
444        """
445        all_info = []
446        all_metadata = []
447        all_features = []
448        for event_array in events:
449            # Skip empty EventArrays
450            if event_array.info is not None:
451                all_info.append(event_array.info)
452            if event_array.metadata is not None:
453                all_metadata.append(event_array.metadata)
454            if event_array.features is not None:
455                all_features.append(event_array.features)
456        if len(all_info) == 0:
457            return EventArray()
458        else:
459            all_info = pd.concat(all_info, ignore_index=True)
460        if len(all_metadata) == 0:
461            all_metadata = None
462        else:
463            all_metadata = pd.concat(all_metadata, ignore_index=True)
464        if len(all_features) == 0:
465            all_features = None
466        else:
467            all_features = pd.concat(all_features, ignore_index=True)
468
469        return EventArray(all_info, all_metadata, all_features)
470
471    @classmethod
472    def from_events(cls, events: list[Event]) -> typing.Self:
473        """
474        Create an EventArray from a list of Event objects.
475        :param events: the events to convert.
476        """
477        # Return an empty array if we were passed nothing
478        if events is None or len(events) == 0:
479            return EventArray()
480        # Otherwise, grab the info
481        info = pd.DataFrame(
482            {
483                "slide_id": [event.scan.slide_id for event in events],
484                "tile": [event.tile.n for event in events],
485                "roi": [event.tile.n_roi for event in events],
486                "x": [event.x for event in events],
487                "y": [event.y for event in events],
488                "size": [event.size for event in events],
489            }
490        )
491        metadata_list = [event.metadata for event in events]
492        # Iterate through and ensure that all metadata is the same shape
493        for metadata in metadata_list:
494            if type(metadata) != type(metadata_list[0]):
495                raise ValueError("All metadata must be the same type.")
496            if metadata is not None and metadata.shape != metadata_list[0].shape:
497                raise ValueError("All metadata must be the same shape.")
498        if metadata_list[0] is None:
499            metadata = None
500        else:
501            metadata = pd.DataFrame(metadata_list)
502        features_list = [event.features for event in events]
503        # Iterate through and ensure that all features are the same shape
504        for features in features_list:
505            if type(features) != type(features_list[0]):
506                raise ValueError("All features must be the same type.")
507            if features is not None and features.shape != features_list[0].shape:
508                raise ValueError("All features must be the same shape.")
509        if features_list[0] is None:
510            features = None
511        else:
512            features = pd.DataFrame(features_list)
513        return EventArray(info=info, metadata=metadata, features=features)
514
515    def to_events(
516        self,
517        scans: list[Scan],
518        ignore_missing_scans=True,
519        ignore_metadata=False,
520        ignore_features=False,
521    ) -> list[Event]:
522        """
523        Get the events in the EventArray as a list of events.
524        :param scans: the scans that the events belong to. Pass an empty list if you
525                      don't care about scan metadata.
526        :param ignore_missing_scans: whether to create blank scans for events without scans.
527        :param ignore_metadata: whether to ignore metadata or not
528        :param ignore_features: whether to ignore features or not
529        :return: a list of Event objects.
530        """
531        events = []
532        for i in range(len(self.info)):
533            # Determine the associated scan
534            scan = None
535            for s in scans:
536                if s.slide_id == self.info["slide_id"][i]:
537                    scan = s
538                    break
539            if scan is None:
540                if ignore_missing_scans:
541                    # Create a placeholder scan if the scan is missing
542                    scan = Scan.make_placeholder(
543                        self.info["slide_id"][i],
544                        self.info["tile"][i],
545                        self.info["roi"][i],
546                    )
547                else:
548                    raise ValueError(
549                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
550                    )
551            # Add to the list
552            events.append(
553                Event(
554                    scan,
555                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
556                    self.info["x"][i],
557                    self.info["y"][i],
558                    size=self.info["size"][i],
559                    metadata=None if ignore_metadata or self.metadata is None else self.metadata.loc[i],
560                    features=None if ignore_features or self.features is None else self.features.loc[i],
561                )
562            )
563        return events
564
565    def to_dataframe(self) -> pd.DataFrame:
566        """
567        Convert all the data in the EventArray to a single DataFrame.
568        :return: a DataFrame with all the data in the EventArray.
569        """
570        # Make a copy of the info DataFrame and prepend "info_" to the column names
571        output = self.info.copy()
572        output.columns = [f"info_{col}" for col in output.columns]
573        # Combine with the metadata and prepend "metadata_" to the column names
574        if self.metadata is not None:
575            metadata = self.metadata.copy()
576            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
577            output = pd.concat([output, metadata], axis=1)
578        # Combine with the features and prepend "features_" to the column names
579        if self.features is not None:
580            features = self.features.copy()
581            features.columns = [f"features_{col}" for col in features.columns]
582            output = pd.concat([output, features], axis=1)
583        return output
584
585    @classmethod
586    def from_dataframe(cls, df) -> typing.Self:
587        """
588        From a single, special DataFrame, create an EventArray.
589        :return: an EventArray built from the DataFrame's prefixed columns.
590        """
591        # Split the columns into info, metadata, and features and strip prefix
592        info = df[[col for col in df.columns if col.startswith("info_")]].copy()
593        info.columns = [col.replace("info_", "") for col in info.columns]
594        if info.size == 0:
595            info = None
596        metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy()
597        metadata.columns = [col.replace("metadata_", "") for col in metadata.columns]
598        if metadata.size == 0:
599            metadata = None
600        features = df[[col for col in df.columns if col.startswith("features_")]].copy()
601        features.columns = [col.replace("features_", "") for col in features.columns]
602        if features.size == 0:
603            features = None
604        return cls(info=info, metadata=metadata, features=features)
605
606    def save_csv(self, output_path: str) -> bool:
607        """
608        Save the events to a CSV file, including metadata and features.
609        :param output_path: path to write the CSV file to.
610        :return: True if the file exists after writing.
611        """
612        self.to_dataframe().to_csv(output_path, index=False)
613        return os.path.exists(output_path)
614
615    @classmethod
616    def load_csv(cls, input_path: str) -> typing.Self:
617        """
618        Load the events from a CSV file, including metadata and features.
619        :param input_path: path to the CSV file to read.
620        :return: an EventArray loaded from the file.
621        """
622        # Load the CSV file
623        df = pd.read_csv(input_path)
624        return cls.from_dataframe(df)
625
626    def save_hdf5(self, output_path: str) -> bool:
627        """
628        Save the events to an HDF5 file, including metadata and features.
629        Uses the pandas-provided HDF5 functions for ease, and external compatibility,
630        though these files are slightly harder to view in HDFView or similar.
631        :param output_path: path to write the HDF5 file to.
632        :return: True if the file exists after writing.
633        """
634        # Open the output_path as an HDF5 file
635        with pd.HDFStore(output_path) as store:
636            # Store the dataframes in the HDF5 file
637            if self.info is not None:
638                store.put("info", self.info, index=False)
639            if self.metadata is not None:
640                store.put("metadata", self.metadata, index=False)
641            if self.features is not None:
642                store.put("features", self.features, index=False)
643        return os.path.exists(output_path)
644
645    @classmethod
646    def load_hdf5(cls, input_path: str) -> typing.Self:
647        """
648        Load the events from an HDF5 file, including metadata and features.
649        :param input_path: path to the HDF5 file to read.
650        :return: an EventArray loaded from the file.
651        """
652        # Open the input_path as an HDF5 file
653        with pd.HDFStore(input_path) as store:
654            # Load the dataframes from the HDF5 file
655            info = store.get("info") if "info" in store else None
656            metadata = store.get("metadata") if "metadata" in store else None
657            features = store.get("features") if "features" in store else None
658        return cls(info=info, metadata=metadata, features=features)
659
660    @classmethod
661    def load_ocular(
662        cls,
663        input_path: str,
664        event_type="cells",
665        cell_data_files=(
666            "rc-final1.rds",
667            "rc-final2.rds",
668            "rc-final3.rds",
669            "rc-final4.rds",
670            "ocular_interesting.rds",
671        ),
672        others_data_files=(
673            "others-final1.rds",
674            "others-final2.rds",
675            "others-final3.rds",
676            "others-final4.rds",
677        ),
678        atlas_data_files=(
679            "ocular_interesting.rds",
680            "ocular_not_interesting.rds",
681        ),
682        merge_event_data_with_stats=True,
683        filter_and_generate_morphs=True,
684        drop_common_events=True,
685        log=None,
686    ) -> typing.Self:
687        """
688        Load events from OCULAR output files (.rds) into an EventArray.
689        :param input_path: path to the OCULAR output directory or a single .rds file.
690        :param event_type: "cells" or "others"; selects which data files to load.
691        :param cell_data_files: file names to load when event_type is "cells".
692        :param others_data_files: file names to load when event_type is "others".
693        :param atlas_data_files: file names that may contain common (atlas) events.
694        :param merge_event_data_with_stats: unused in the current implementation.
695        :param filter_and_generate_morphs: unused in the current implementation.
696        :param drop_common_events: whether to drop events classified as common cells.
697        :param log: optional logger for progress and warning messages.
698        :return: an EventArray with the loaded events.
699        """
700        # Check if the input path is a directory or a file
701        if os.path.isfile(input_path):
702            data_files = [os.path.basename(input_path)]
703            input_path = os.path.dirname(input_path)
704        elif event_type == "cells":
705            data_files = cell_data_files
706        elif event_type == "others":
707            data_files = others_data_files
708        else:
709            raise ValueError("Invalid event type.")
710
711        # Load the data from the OCULAR files
712        file_data = {}
713        for file in data_files:
714            file_path = os.path.join(input_path, file)
715            if not os.path.isfile(file_path):
716                if log is not None:
717                    log.warning(f"{file} not found in {input_path}")
718                continue
719            file_data[file] = pyreadr.read_r(file_path)
720            # Get the DataFrame associated with None (pyreadr dict quirk)
721            file_data[file] = file_data[file][None]
722            if len(file_data[file]) == 0:
723                # File gets dropped from the dict
724                file_data.pop(file)
725                if log is not None:
726                    log.warning(f"{file} has no cells")
727                continue
728
729            if log is not None:
730                log.debug(f"{file} has {len(file_data[file])} cells")
731
732            # Drop common cells if requested and in this file
733            if file in atlas_data_files and drop_common_events:
734                common_cell_indices = (
735                    file_data[file]["catalogue_classification"] == "common_cell"
736                )
737                if log is not None:
738                    log.debug(
739                        f"Dropping {int(pd.Series.sum(common_cell_indices))}"
740                        f"common cells from {file}"
741                    )
742                file_data[file] = file_data[file][~common_cell_indices]
743
744            if len(file_data[file]) == 0:
745                # File gets dropped from the dict
746                file_data.pop(file)
747                if log is not None:
748                    log.warning(f"{file} has no cells after dropping common cells")
749                continue
750
751            # Extract frame_id and cell_id
752            # DAPI- events already have frame_id cell_id outside rowname
753            if event_type == "cells":
754                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
755                # get frame_id cell_id from rownames column and split into two columns
756                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
757                if len(split_res.columns) != 2 and log is not None:
758                    log.warning(
759                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
760                    )
761                # then assign it back to the dataframe
762                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
763            # reset indexes since they can cause NaN values in concat
764            file_data[file] = file_data[file].reset_index(drop=True)
765
766        # Merge the data from all files
767        if len(file_data) == 0:
768            return EventArray()
769        elif len(file_data) == 1:
770            data = [file_data[file] for file in file_data.keys()][0]
771        else:
772            data = pd.concat(file_data.values())
773
774        if log is not None:
775            log.debug(f"Gathered a total of {len(data)} events")
776
777        # Others is missing the "slide_id". Insert it right before "frame_id" column
778        if event_type == "others" and "slide_id" not in data.columns:
779            if os.path.basename(input_path) == "ocular":
780                slide_id = os.path.basename(os.path.dirname(input_path))
781            else:
782                slide_id = "UNKNOWN"
783            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)
784
785        # Sort according to ascending cell_id to keep the original, which is in manual_df
786        data = data.sort_values(by=["cell_id"], ascending=True)
787        # Filter out duplicates by x & y
788        data = data.assign(
789            unique_id=data["slide_id"]
790            + "_"
791            + data["frame_id"].astype(str)
792            + "_"
793            + data["cellx"].astype(int).astype(str)
794            + "_"
795            + data["celly"].astype(int).astype(str)
796        )
797        data = data.drop_duplicates(subset=["unique_id"], keep="first")
798        # Normal unique_id is with cell_id
799        data = data.assign(
800            unique_id=data["slide_id"]
801            + "_"
802            + data["frame_id"].astype(str)
803            + "_"
804            + data["cell_id"].astype(str)
805        )
806        data = data.reset_index(drop=True)
807        # All columns up to "slide_id" are features; drop the "slide_id"
808        features = data.loc[:, :"slide_id"].iloc[:, :-1]
809        data = data.loc[:, "slide_id":]
810        # Grab the info columns
811        info = data[["slide_id", "frame_id", "cellx", "celly"]]
812        info.columns = ["slide_id", "tile", "x", "y"]
813        info = info.assign(
814            roi=0,  # OCULAR only works on 1 ROI, as far as known
815            size=25,  # Static, for later montaging
816        )
817        info = info[["slide_id", "tile", "roi", "x", "y", "size"]]
818        # Metadata has duplicate columns for later convenience
819        metadata = data
820        return EventArray(info, metadata, features)
821
822    def save_ocular(self, output_path: str, event_type: str = "cells") -> bool:
823        """
824        Save the events to an OCULAR file. Relies on the dataframe originating
825        from an OCULAR file (same columns; duplicate metadata/info).
826        :param output_path: directory to write the OCULAR files to.
827        :return: True if the output files exist after writing.
828        """
829        if event_type == "cells":
830            file_stub = "rc-final"
831        elif event_type == "others":
832            file_stub = "others-final"
833        else:
834            raise ValueError("Invalid event type. Must be cells or others.")
835
836        # Check for the "ocular_interesting" column
837        if event_type == "cells" and "ocular_interesting" in self.metadata.columns:
838            interesting = self.metadata["ocular_interesting"]
839            # Split the metadata into interesting and regular
840            # Interesting will only have dropped columns, with no internal changes
841            interesting = pd.concat(
842                [self.features[interesting], self.metadata[interesting]], axis=1
843            ).reset_index(drop=True)
844            # Data will get some columns changed; reset_index will copy it
845            data = (
846                pd.concat(
847                    [self.features[~interesting], self.metadata[~interesting]], axis=1
848                )
849                .reset_index(drop=True)
850                .drop(columns=["ocular_interesting"])
851            )
852
853            # Drop particular columns for "interesting"
854            interesting = interesting.drop(
855                [
856                    "clust",
857                    "hcpc",
858                    "frame_id",
859                    "cell_id",
860                    "unique_id",
861                    "ocular_interesting",
862                ],
863                axis=1,
864                errors="ignore",
865            )
866            # Save both .csv and .rds
867            interesting.to_csv(
868                os.path.join(output_path, "ocular_interesting.csv"), index=False
869            )
870            pyreadr.write_rds(
871                os.path.join(output_path, "ocular_interesting.rds"), interesting
872            )
873        else:
874            # Get all data and reset_index (will copy it)
875            data = pd.concat([self.features, self.metadata], axis=1).reset_index(
876                drop=True
877            )
878
879        # Split based on cluster number to conform to *-final[1-4].rds
880        n_clusters = max(data["clust"]) + 1
881        split_idx = [round(i * n_clusters / 4) for i in range(5)]
882        for i in range(4):
883            subset = (split_idx[i] <= data["clust"]) & (
884                data["clust"] < split_idx[i + 1]
885            )
886            subset = data[subset].reset_index(drop=True)
887            subset["hcpc"] = i + 1
888            pyreadr.write_rds(
889                os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
890            )
891
892        # Create new example cell strings
893        data["example_cell_id"] = (
894            data["slide_id"]
895            + " "
896            + data["frame_id"].astype(str)
897            + " "
898            + data["cell_id"].astype(str)
899            + " "
900            + data["cellx"].astype(int).astype(str)
901            + " "
902            + data["celly"].astype(int).astype(str)
903        )
904        # Find averagable data columns
905        if "cellcluster_id" in data.columns:
906            avg_cols = data.columns[: data.columns.get_loc("cellcluster_id")].tolist()
907        else:
908            avg_cols = data.columns[: data.columns.get_loc("slide_id")].tolist()
909        # Group by cluster and average
910        data = data.groupby("clust").agg(
911            **{col: (col, "mean") for col in avg_cols},
912            count=("clust", "size"),  # count rows in each cluster
913            example_cells=("example_cell_id", lambda x: ",".join(x)),
914            hcpc=("hcpc", lambda x: x.iloc[0]),
915        )
916        data = data.reset_index()  # Do NOT drop, index is "clust"
917        # Create new columns
918        metadata = pd.DataFrame(
919            {
920                "count": data["count"],
921                "example_cells": data["example_cells"],
922                "clust": data["clust"].astype(int),
923                "hcpc": data["hcpc"].astype(int),
924                "id": data["clust"].astype(int).astype(str),
925                "cccluster": "0",  # Dummy value
926                "ccdistance": 0.0,  # Dummy value
927                "rownum": list(range(len(data))),
928                "framegroup": 0,  # Dummy value
929            }
930        )
931        data = pd.concat([data[avg_cols], metadata], axis=1)
932        # Save the data
933        data.to_csv(os.path.join(output_path, f"{file_stub}.csv"), index=False)
934        pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data)
935        return os.path.exists(os.path.join(output_path, f"{file_stub}.rds"))

A class that holds a large number of events' data, making it easy to analyze and manipulate many events at once. A more separated version of the Event class.

EventArray( info: pandas.core.frame.DataFrame = None, metadata: pandas.core.frame.DataFrame = None, features: pandas.core.frame.DataFrame = None)
255    def __init__(
256        self,
257        info: pd.DataFrame = None,
258        metadata: pd.DataFrame = None,
259        features: pd.DataFrame = None,
260    ):
261        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y", "size"
262        if info is not None and (
263            not all(
264                col in info.columns
265                for col in ["slide_id", "tile", "roi", "x", "y", "size"]
266            )
267            or len(info.columns) != 6
268        ):
269            raise ValueError(
270                "EventArray.info must have columns 'slide_id', 'tile', 'roi', 'x', 'y', 'size'"
271            )
272        # All DataFrames must all have the same number of rows
273        if metadata is not None and (info is None or len(info) != len(metadata)):
274            raise ValueError(
275                "If EventArray.metadata is not None, it should match rows with .info"
276            )
277        if features is not None and (info is None or len(info) != len(features)):
278            raise ValueError(
279                "If EventArray.features is not None, it should match rows with .info"
280            )
281        self.info = info
282        self.metadata = metadata
283        self.features = features
INFO_COLUMNS = ['slide_id', 'tile', 'roi', 'x', 'y', 'size']
info
metadata
features
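
A construction sketch with made-up values; info must have exactly the six INFO_COLUMNS, and metadata/features, when given, must have the same number of rows:

    import pandas as pd
    from csi_images.csi_events import EventArray

    info = pd.DataFrame(
        {
            "slide_id": ["SLIDE1", "SLIDE1"],
            "tile": [0, 3],
            "roi": [0, 0],
            "x": [120, 85],
            "y": [450, 910],
            "size": [12, 12],
        }
    )
    events = EventArray(info=info)  # metadata and features may be added later
    assert len(events) == 2
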
def get_sort_order(self, by: str | list[str], ascending: bool | list[bool] = True):
332    def get_sort_order(self, by: str | list[str], ascending: bool | list[bool] = True):
333        """
334        Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.
335        :param by: name of the column(s) to sort by.
336        :param ascending: whether to sort in ascending order; can be a list to match by
337        :return: the order of the indices to sort by.
338        """
339        columns = self.get(by)
340        return columns.sort_values(by=by, ascending=ascending).index

Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.

Parameters
  • by: name of the column(s) to sort by.
  • ascending: whether to sort in ascending order; can be a list to match by
Returns

the order of the indices to sort by.

def sort(self, by: str | list[str], ascending: bool | list[bool] = True) -> Self:
342    def sort(
343        self, by: str | list[str], ascending: bool | list[bool] = True
344    ) -> typing.Self:
345        """
346        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
347        :param by: name of the column(s) to sort by.
348        :param ascending: whether to sort in ascending order; can be a list to match by
349        :return: a new, sorted EventArray.
350        """
351        order = self.get_sort_order(by, ascending)
352        info = self.info.loc[order].reset_index(drop=True)
353        if self.metadata is not None:
354            metadata = self.metadata.loc[order].reset_index(drop=True)
355        else:
356            metadata = None
357        if self.features is not None:
358            features = self.features.loc[order].reset_index(drop=True)
359        else:
360            features = None
361        return EventArray(info, metadata, features)

Sort the EventArray by column(s) in the info, metadata, or features DataFrames.

Parameters
  • by: name of the column(s) to sort by.
  • ascending: whether to sort in ascending order; can be a list to match by
Returns

a new, sorted EventArray.
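
For example, sorting by tile and then by descending y using the info columns:

    ordered = events.sort(by=["tile", "y"], ascending=[True, False])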

def get( self, column_names: int | str | list[int] | list[str]) -> pandas.core.frame.DataFrame:
363    def get(self, column_names: int | str | list[int] | list[str]) -> pd.DataFrame:
364        """
365        Get a DataFrame with the specified columns from the EventArray, by value.
366        :param column_names: the names of the columns to get.
367        :return: a DataFrame with the specified columns.
368        """
369        if isinstance(column_names, int) or isinstance(column_names, str):
370            column_names = [column_names]
371        columns = []
372        for column_name in column_names:
373            if column_name in self.info.columns:
374                columns.append(self.info[column_name])
375            elif self.metadata is not None and column_name in self.metadata.columns:
376                columns.append(self.metadata[column_name])
377            elif self.features is not None and column_name in self.features.columns:
378                columns.append(self.features[column_name])
379            else:
380                raise ValueError(f"Column {column_name} not found in EventArray")
381        return pd.concat(columns, axis=1)

Get a DataFrame with the specified columns from the EventArray, by value.

Parameters
  • column_names: the names of the columns to get.
Returns

a DataFrame with the specified columns.

def rows(self, rows) -> Self:
383    def rows(self, rows) -> typing.Self:
384        """
385        Get a subset of the EventArray rows based on a boolean or integer index, by value.
386        :param rows: the indices to get as a 1D boolean/integer list/array/series
387        :return: a new EventArray with the subset of events.
388        """
389        info = self.info.loc[rows].reset_index(drop=True)
390        if self.metadata is not None:
391            metadata = self.metadata.loc[rows].reset_index(drop=True)
392        else:
393            metadata = None
394        if self.features is not None:
395            features = self.features.loc[rows].reset_index(drop=True)
396        else:
397            features = None
398        return EventArray(info, metadata, features)

Get a subset of the EventArray rows based on a boolean or integer index, by value.

Parameters
  • rows: the indices to get as a 1D boolean/integer list/array/series
Returns

a new EventArray with the subset of events.
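
A filtering sketch using a boolean mask built from an info column:

    mask = events.get("size")["size"] > 10  # pandas boolean Series
    large_events = events.rows(mask)        # new EventArray with indices reset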

def copy(self) -> Self:
400    def copy(self) -> typing.Self:
401        """
402        Create a deep copy of the EventArray.
403        :return: a deep copy of the EventArray.
404        """
405        return EventArray(
406            info=self.info.copy(),
407            metadata=None if self.metadata is None else self.metadata.copy(),
408            features=None if self.features is None else self.features.copy(),
409        )

Create a deep copy of the EventArray.

Returns

a deep copy of the EventArray.

def add_metadata(self, new_metadata: pandas.core.frame.DataFrame) -> None:
411    def add_metadata(self, new_metadata: pd.DataFrame) -> None:
412        """
413        Add metadata to the EventArray. Removes the need to check if metadata is None.
414        Overwrites any existing metadata with the same column names as the new metadata.
415        :param new_metadata: the metadata to add.
416        """
417        if len(self) != len(new_metadata):
418            raise ValueError("New metadata must match length of existing info")
419
420        if self.metadata is None:
421            self.metadata = new_metadata
422        else:
423            self.metadata[new_metadata.columns] = new_metadata

Add metadata to the EventArray. Removes the need to check if metadata is None. Overwrites any existing metadata with the same column names as the new metadata.

Parameters
  • new_metadata: the metadata to add.
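
A short sketch with a hypothetical column name:

    import pandas as pd

    # Must match len(events); same-named existing columns are overwritten
    events.add_metadata(pd.DataFrame({"classifier_score": [0.9, 0.2]}))

add_features behaves the same way for the features DataFrame.
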
def add_features(self, new_features: pandas.core.frame.DataFrame) -> None:
425    def add_features(self, new_features: pd.DataFrame) -> None:
426        """
427        Add features to the EventArray. Removes the need to check if features is None.
428        Overwrites any existing features with the same column names as the new features.
429        :param new_features: the features to add.
430        """
431        if len(self) != len(new_features):
432            raise ValueError("New features must match length of existing info")
433
434        if self.features is None:
435            self.features = new_features
436        else:
437            self.features[new_features.columns] = new_features

Add features to the EventArray. Removes the need to check if features is None. Overwrites any existing features with the same column names as the new features.

Parameters
  • new_features: the features to add.
@classmethod
def merge(cls, events: list[typing.Self]) -> Self:
439    @classmethod
440    def merge(cls, events: list[typing.Self]) -> typing.Self:
441        """
442        Combine EventArrays in a list into a single EventArray.
443        :param events: the EventArrays to combine into one.
444        """
445        all_info = []
446        all_metadata = []
447        all_features = []
448        for event_array in events:
449            # Skip empty EventArrays
450            if event_array.info is not None:
451                all_info.append(event_array.info)
452            if event_array.metadata is not None:
453                all_metadata.append(event_array.metadata)
454            if event_array.features is not None:
455                all_features.append(event_array.features)
456        if len(all_info) == 0:
457            return EventArray()
458        else:
459            all_info = pd.concat(all_info, ignore_index=True)
460        if len(all_metadata) == 0:
461            all_metadata = None
462        else:
463            all_metadata = pd.concat(all_metadata, ignore_index=True)
464        if len(all_features) == 0:
465            all_features = None
466        else:
467            all_features = pd.concat(all_features, ignore_index=True)
468
469        return EventArray(all_info, all_metadata, all_features)

Combine EventArrays in a list into a single EventArray.

Parameters
  • events: the EventArrays to combine into one.
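
A merging sketch, assuming events_a and events_b are EventArrays with matching metadata/features columns (hypothetical names):

    combined = EventArray.merge([events_a, events_b])
    # Rows are concatenated in list order with a fresh 0..n-1 index
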
@classmethod
def from_events(cls, events: list[Event]) -> Self:
471    @classmethod
472    def from_events(cls, events: list[Event]) -> typing.Self:
473        """
474        Create an EventArray from a list of Event objects.
475        :param events: the events to convert.
476        """
477        # Return an empty array if we were passed nothing
478        if events is None or len(events) == 0:
479            return EventArray()
480        # Otherwise, grab the info
481        info = pd.DataFrame(
482            {
483                "slide_id": [event.scan.slide_id for event in events],
484                "tile": [event.tile.n for event in events],
485                "roi": [event.tile.n_roi for event in events],
486                "x": [event.x for event in events],
487                "y": [event.y for event in events],
488                "size": [event.size for event in events],
489            }
490        )
491        metadata_list = [event.metadata for event in events]
492        # Iterate through and ensure that all metadata is the same shape
493        for metadata in metadata_list:
494            if type(metadata) != type(metadata_list[0]):
495                raise ValueError("All metadata must be the same type.")
496            if metadata is not None and metadata.shape != metadata_list[0].shape:
497                raise ValueError("All metadata must be the same shape.")
498        if metadata_list[0] is None:
499            metadata = None
500        else:
501            metadata = pd.DataFrame(metadata_list)
502        features_list = [event.features for event in events]
503        # Iterate through and ensure that all features are the same shape
504        for features in features_list:
505            if type(features) != type(features_list[0]):
506                raise ValueError("All features must be the same type.")
507            if features is not None and features.shape != features_list[0].shape:
508                raise ValueError("All features must be the same shape.")
509        if features_list[0] is None:
510            features = None
511        else:
512            features = pd.DataFrame(features_list)
513        return EventArray(info=info, metadata=metadata, features=features)

Create an EventArray from a list of Event objects.

Parameters
  • events: the events to convert.
def to_events( self, scans: list[csi_images.csi_scans.Scan], ignore_missing_scans=True, ignore_metadata=False, ignore_features=False) -> list[Event]:
515    def to_events(
516        self,
517        scans: list[Scan],
518        ignore_missing_scans=True,
519        ignore_metadata=False,
520        ignore_features=False,
521    ) -> list[Event]:
522        """
523        Get the events in the EventArray as a list of events.
524        :param scans: the scans that the events belong to. Pass an empty list if you
525                      don't care about scan metadata.
526        :param ignore_missing_scans: whether to create blank scans for events without scans.
527        :param ignore_metadata: whether to ignore metadata or not
528        :param ignore_features: whether to ignore features or not
529        :return: a list of Event objects.
530        """
531        events = []
532        for i in range(len(self.info)):
533            # Determine the associated scan
534            scan = None
535            for s in scans:
536                if s.slide_id == self.info["slide_id"][i]:
537                    scan = s
538                    break
539            if scan is None:
540                if ignore_missing_scans:
541                    # Create a placeholder scan if the scan is missing
542                    scan = Scan.make_placeholder(
543                        self.info["slide_id"][i],
544                        self.info["tile"][i],
545                        self.info["roi"][i],
546                    )
547                else:
548                    raise ValueError(
549                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
550                    )
551            # Add to the list
552            events.append(
553                Event(
554                    scan,
555                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
556                    self.info["x"][i],
557                    self.info["y"][i],
558                    size=self.info["size"][i],
559                    metadata=None if ignore_metadata or self.metadata is None else self.metadata.loc[i],
560                    features=None if ignore_features or self.features is None else self.features.loc[i],
561                )
562            )
563        return events

Get the events in the EventArray as a list of events.

Parameters
  • scans: the scans that the events belong to. Pass an empty list if you don't care about scan metadata.
  • ignore_missing_scans: whether to create blank scans for events without scans.
  • ignore_metadata: whether to ignore metadata or not
  • ignore_features: whether to ignore features or not
Returns

a list of Event objects.

def to_dataframe(self) -> pandas.core.frame.DataFrame:
565    def to_dataframe(self) -> pd.DataFrame:
566        """
567        Convert all the data in the EventArray to a single DataFrame.
568        :return: a DataFrame with all the data in the EventArray.
569        """
570        # Make a copy of the info DataFrame and prepend "info_" to the column names
571        output = self.info.copy()
572        output.columns = [f"info_{col}" for col in output.columns]
573        # Combine with the metadata and prepend "metadata_" to the column names
574        if self.metadata is not None:
575            metadata = self.metadata.copy()
576            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
577            output = pd.concat([output, metadata], axis=1)
578        # Combine with the features and prepend "features_" to the column names
579        if self.features is not None:
580            features = self.features.copy()
581            features.columns = [f"features_{col}" for col in features.columns]
582            output = pd.concat([output, features], axis=1)
583        return output

Convert all the data in the EventArray to a single DataFrame.

Returns

a DataFrame with all the data in the EventArray.

@classmethod
def from_dataframe(cls, df) -> Self:
585    @classmethod
586    def from_dataframe(cls, df) -> typing.Self:
587        """
588        From a single, special DataFrame, create an EventArray.
589        :return: an EventArray built from the DataFrame's prefixed columns.
590        """
591        # Split the columns into info, metadata, and features and strip prefix
592        info = df[[col for col in df.columns if col.startswith("info_")]].copy()
593        info.columns = [col.replace("info_", "") for col in info.columns]
594        if info.size == 0:
595            info = None
596        metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy()
597        metadata.columns = [col.replace("metadata_", "") for col in metadata.columns]
598        if metadata.size == 0:
599            metadata = None
600        features = df[[col for col in df.columns if col.startswith("features_")]].copy()
601        features.columns = [col.replace("features_", "") for col in features.columns]
602        if features.size == 0:
603            features = None
604        return cls(info=info, metadata=metadata, features=features)

From a single, special DataFrame, create an EventArray.

Returns

an EventArray built from the DataFrame's prefixed columns.
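
Together with to_dataframe, this enables a simple round trip:

    df = events.to_dataframe()  # columns like "info_x", "metadata_...", "features_..."
    restored = EventArray.from_dataframe(df)
    assert restored == events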

def save_csv(self, output_path: str) -> bool:
606    def save_csv(self, output_path: str) -> bool:
607        """
608        Save the events to a CSV file, including metadata and features.
609        :param output_path: path to write the CSV file to.
610        :return: True if the file exists after writing.
611        """
612        self.to_dataframe().to_csv(output_path, index=False)
613        return os.path.exists(output_path)

Save the events to a CSV file, including metadata and features.

Parameters
  • output_path: path to write the CSV file to.
Returns

True if the file exists after writing.
@classmethod
def load_csv(cls, input_path: str) -> Self:
615    @classmethod
616    def load_csv(cls, input_path: str) -> typing.Self:
617        """
618        Load the events from a CSV file, including metadata and features.
619        :param input_path: path to the CSV file to read.
620        :return: an EventArray loaded from the file.
621        """
622        # Load the CSV file
623        df = pd.read_csv(input_path)
624        return cls.from_dataframe(df)

Load the events from a CSV file, including metadata and features.

Parameters
  • input_path: path to the CSV file to read.
Returns

an EventArray loaded from the file.
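
A save/load sketch (note that CSV stores everything as text, so dtypes may not survive the round trip exactly):

    events.save_csv("events.csv")  # returns True if the file now exists
    restored = EventArray.load_csv("events.csv")
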
def save_hdf5(self, output_path: str) -> bool:
626    def save_hdf5(self, output_path: str) -> bool:
627        """
628        Save the events to an HDF5 file, including metadata and features.
629        Uses the pandas-provided HDF5 functions for ease, and external compatibility,
630        though these files are slightly harder to view in HDFView or similar.
631        :param output_path: path to write the HDF5 file to.
632        :return: True if the file exists after writing.
633        """
634        # Open the output_path as an HDF5 file
635        with pd.HDFStore(output_path) as store:
636            # Store the dataframes in the HDF5 file
637            if self.info is not None:
638                store.put("info", self.info, index=False)
639            if self.metadata is not None:
640                store.put("metadata", self.metadata, index=False)
641            if self.features is not None:
642                store.put("features", self.features, index=False)
643        return os.path.exists(output_path)

Save the events to an HDF5 file, including metadata and features. Uses the pandas-provided HDF5 functions for ease, and external compatibility, though these files are slightly harder to view in HDFView or similar.

Parameters
  • output_path: path to write the HDF5 file to.
Returns

True if the file exists after writing.
@classmethod
def load_hdf5(cls, input_path: str) -> Self:
645    @classmethod
646    def load_hdf5(cls, input_path: str) -> typing.Self:
647        """
648        Load the events from an HDF5 file, including metadata and features.
649        :param input_path: path to the HDF5 file to read.
650        :return: an EventArray loaded from the file.
651        """
652        # Open the input_path as an HDF5 file
653        with pd.HDFStore(input_path) as store:
654            # Load the dataframes from the HDF5 file
655            info = store.get("info") if "info" in store else None
656            metadata = store.get("metadata") if "metadata" in store else None
657            features = store.get("features") if "features" in store else None
658        return cls(info=info, metadata=metadata, features=features)

Load the events from an HDF5 file, including metadata and features.

Parameters
  • input_path: path to the HDF5 file to read.
Returns

an EventArray loaded from the file.
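
A save/load sketch (pandas' HDF5 support requires the optional PyTables dependency):

    events.save_hdf5("events.h5")  # stores "info", "metadata", "features" keys
    restored = EventArray.load_hdf5("events.h5")
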
@classmethod
def load_ocular( cls, input_path: str, event_type='cells', cell_data_files=('rc-final1.rds', 'rc-final2.rds', 'rc-final3.rds', 'rc-final4.rds', 'ocular_interesting.rds'), others_data_files=('others-final1.rds', 'others-final2.rds', 'others-final3.rds', 'others-final4.rds'), atlas_data_files=('ocular_interesting.rds', 'ocular_not_interesting.rds'), merge_event_data_with_stats=True, filter_and_generate_morphs=True, drop_common_events=True, log=None) -> Self:
660    @classmethod
661    def load_ocular(
662        cls,
663        input_path: str,
664        event_type="cells",
665        cell_data_files=(
666            "rc-final1.rds",
667            "rc-final2.rds",
668            "rc-final3.rds",
669            "rc-final4.rds",
670            "ocular_interesting.rds",
671        ),
672        others_data_files=(
673            "others-final1.rds",
674            "others-final2.rds",
675            "others-final3.rds",
676            "others-final4.rds",
677        ),
678        atlas_data_files=(
679            "ocular_interesting.rds",
680            "ocular_not_interesting.rds",
681        ),
682        merge_event_data_with_stats=True,
683        filter_and_generate_morphs=True,
684        drop_common_events=True,
685        log=None,
686    ) -> typing.Self:
687        """
688        Load events from OCULAR output files (.rds) into an EventArray.
689        :param input_path: path to an OCULAR output directory or a single .rds file.
690        :param event_type: "cells" or "others"; selects which data files to load.
691        :param cell_data_files: file names to load when event_type is "cells".
692        :param others_data_files: file names to load when event_type is "others".
693        :param atlas_data_files: file names subject to common-cell filtering.
694        :param merge_event_data_with_stats:
695        :param filter_and_generate_morphs:
696        :param drop_common_events: whether to drop events classified as common cells.
697        :param log: optional logger for progress and warnings.
698        :return: an EventArray populated from the OCULAR files.
699        """
700        # Check if the input path is a directory or a file
701        if os.path.isfile(input_path):
702            data_files = [os.path.basename(input_path)]
703            input_path = os.path.dirname(input_path)
704        elif event_type == "cells":
705            data_files = cell_data_files
706        elif event_type == "others":
707            data_files = others_data_files
708        else:
709            raise ValueError("Invalid event type. Must be cells or others.")
710
711        # Load the data from the OCULAR files
712        file_data = {}
713        for file in data_files:
714            file_path = os.path.join(input_path, file)
715            if not os.path.isfile(file_path):
716                if log is not None:
717                    log.warning(f"{file} not found in {input_path}")
718                continue
719            file_data[file] = pyreadr.read_r(file_path)
720            # Get the DataFrame associated with None (pyreadr dict quirk)
721            file_data[file] = file_data[file][None]
722            if len(file_data[file]) == 0:
723                # File gets dropped from the dict
724                file_data.pop(file)
725                if log is not None:
726                    log.warning(f"{file} has no cells")
727                continue
728
729            if log is not None:
730                log.debug(f"{file} has {len(file_data[file])} cells")
731
732            # Drop common cells if requested and in this file
733            if file in atlas_data_files and drop_common_events:
734                common_cell_indices = (
735                    file_data[file]["catalogue_classification"] == "common_cell"
736                )
737                if log is not None:
738                    log.debug(
739                        f"Dropping {int(common_cell_indices.sum())} "
740                        f"common cells from {file}"
741                    )
742                file_data[file] = file_data[file][~common_cell_indices]
743
744            if len(file_data[file]) == 0:
745                # File gets dropped from the dict
746                file_data.pop(file)
747                if log is not None:
748                    log.warning(f"{file} has no cells after dropping common cells")
749                continue
750
751            # Extract frame_id and cell_id
752            # DAPI- events already have frame_id cell_id outside rowname
753            if event_type == "cells":
754                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
755                # get frame_id cell_id from rownames column and split into two columns
756                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
757                if len(split_res.columns) != 2 and log is not None:
758                    log.warning(
759                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
760                    )
761                # then assign it back to the dataframe
762                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
763            # reset indexes since they can cause NaN values in concat
764            file_data[file] = file_data[file].reset_index(drop=True)
765
766        # Merge the data from all files
767        if len(file_data) == 0:
768            return EventArray()
769        elif len(file_data) == 1:
770            data = next(iter(file_data.values()))
771        else:
772            data = pd.concat(file_data.values())
773
774        if log is not None:
775            log.debug(f"Gathered a total of {len(data)} events")
776
777        # Others is missing the "slide_id". Insert it right before "frame_id" column
778        if event_type == "others" and "slide_id" not in data.columns:
779            if os.path.basename(input_path) == "ocular":
780                slide_id = os.path.basename(os.path.dirname(input_path))
781            else:
782                slide_id = "UNKNOWN"
783            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)
784
785        # Sort according to ascending cell_id to keep the original, which is in manual_df
786        data = data.sort_values(by=["cell_id"], ascending=True)
787        # Filter out duplicates by x & y
788        data = data.assign(
789            unique_id=data["slide_id"]
790            + "_"
791            + data["frame_id"].astype(str)
792            + "_"
793            + data["cellx"].astype(int).astype(str)
794            + "_"
795            + data["celly"].astype(int).astype(str)
796        )
797        data = data.drop_duplicates(subset=["unique_id"], keep="first")
798        # Normal unique_id is with cell_id
799        data = data.assign(
800            unique_id=data["slide_id"]
801            + "_"
802            + data["frame_id"].astype(str)
803            + "_"
804            + data["cell_id"].astype(str)
805        )
806        data = data.reset_index(drop=True)
807        # All columns up to "slide_id" are features; drop the "slide_id"
808        features = data.loc[:, :"slide_id"].iloc[:, :-1]
809        data = data.loc[:, "slide_id":]
810        # Grab the info columns
811        info = data[["slide_id", "frame_id", "cellx", "celly"]]
812        info.columns = ["slide_id", "tile", "x", "y"]
813        info = info.assign(
814            roi=0,  # OCULAR only works on 1 ROI, as far as we know
815            size=25,  # Static, for later montaging
816        )
817        info = info[["slide_id", "tile", "roi", "x", "y", "size"]]
818        # Metadata has duplicate columns for later convenience
819        metadata = data
820        return EventArray(info, metadata, features)
Load events from OCULAR output files (.rds) into an EventArray.

Parameters
  • input_path: path to an OCULAR output directory or a single .rds file.
  • event_type: "cells" or "others"; selects which data files to load.
  • cell_data_files: file names to load when event_type is "cells".
  • others_data_files: file names to load when event_type is "others".
  • atlas_data_files: file names subject to common-cell filtering.
  • merge_event_data_with_stats:
  • filter_and_generate_morphs:
  • drop_common_events: whether to drop events classified as common cells.
  • log: optional logger for progress and warnings.
Returns
An EventArray populated from the OCULAR files.
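A hedged usage sketch follows. It assumes an OCULAR results directory laid out as <slide_id>/ocular/*.rds (an assumption based on the slide_id fallback in the code above); the paths are placeholders.

    import logging

    log = logging.getLogger("ocular")
    # Load the standard cell files (rc-final1..4.rds, ocular_interesting.rds),
    # dropping events classified as common cells in the atlas files
    cells = EventArray.load_ocular("/path/to/slide_id/ocular", log=log)
    # DAPI- ("others") events live in others-final1..4.rds
    others = EventArray.load_ocular(
        "/path/to/slide_id/ocular", event_type="others", log=log
    )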
def save_ocular(self, output_path: str, event_type: str = 'cells') -> bool:
822    def save_ocular(self, output_path: str, event_type: str = "cells") -> bool:
823        """
824        Save the events to OCULAR files. Relies on the dataframe originating
825        from an OCULAR file (same columns; duplicate metadata/info).
826        :param output_path: directory in which to write the OCULAR files.
827        :param event_type: "cells" or "others"; selects the output file names.
828        """
829        if event_type == "cells":
830            file_stub = "rc-final"
831        elif event_type == "others":
832            file_stub = "others-final"
833        else:
834            raise ValueError("Invalid event type. Must be cells or others.")
835
836        # Check for the "ocular_interesting" column
837        if event_type == "cells" and "ocular_interesting" in self.metadata.columns:
838            interesting = self.metadata["ocular_interesting"]
839            # Split the metadata into interesting and regular
840            # Interesting will only have dropped columns, with no internal changes
841            interesting = pd.concat(
842                [self.features[interesting], self.metadata[interesting]], axis=1
843            ).reset_index(drop=True)
844            # Data will get some columns changed; reset_index will copy it
845            data = (
846                pd.concat(
847                    [self.features[~interesting], self.metadata[~interesting]], axis=1
848                )
849                .reset_index(drop=True)
850                .drop(columns=["ocular_interesting"])
851            )
852
853            # Drop particular columns for "interesting"
854            interesting = interesting.drop(
855                [
856                    "clust",
857                    "hcpc",
858                    "frame_id",
859                    "cell_id",
860                    "unique_id",
861                    "ocular_interesting",
862                ],
863                axis=1,
864                errors="ignore",
865            )
866            # Save both .csv and .rds
867            interesting.to_csv(
868                os.path.join(output_path, "ocular_interesting.csv"), index=False
869            )
870            pyreadr.write_rds(
871                os.path.join(output_path, "ocular_interesting.rds"), interesting
872            )
873        else:
874            # Get all data and reset_index (will copy it)
875            data = pd.concat([self.features, self.metadata], axis=1).reset_index(
876                drop=True
877            )
878
879        # Split based on cluster number to conform to *-final[1-4].rds
880        n_clusters = max(data["clust"]) + 1
881        split_idx = [round(i * n_clusters / 4) for i in range(5)]
882        for i in range(4):
883            subset = (split_idx[i] <= data["clust"]) & (
884                data["clust"] < split_idx[i + 1]
885            )
886            subset = data[subset].reset_index(drop=True)
887            subset["hcpc"] = i + 1
888            pyreadr.write_rds(
889                os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
890            )
891
892        # Create new example cell strings
893        data["example_cell_id"] = (
894            data["slide_id"]
895            + " "
896            + data["frame_id"].astype(str)
897            + " "
898            + data["cell_id"].astype(str)
899            + " "
900            + data["cellx"].astype(int).astype(str)
901            + " "
902            + data["celly"].astype(int).astype(str)
903        )
904        # Find averagable data columns
905        if "cellcluster_id" in data.columns:
906            avg_cols = data.columns[: data.columns.get_loc("cellcluster_id")].tolist()
907        else:
908            avg_cols = data.columns[: data.columns.get_loc("slide_id")].tolist()
909        # Group by cluster and average
910        data = data.groupby("clust").agg(
911            **{col: (col, "mean") for col in avg_cols},
912            count=("clust", "size"),  # count rows in each cluster
913            example_cells=("example_cell_id", lambda x: ",".join(x)),
914            hcpc=("hcpc", lambda x: x.iloc[0]),
915        )
916        data = data.reset_index()  # Do NOT drop, index is "clust"
917        # Create new columns
918        metadata = pd.DataFrame(
919            {
920                "count": data["count"],
921                "example_cells": data["example_cells"],
922                "clust": data["clust"].astype(int),
923                "hcpc": data["hcpc"].astype(int),
924                "id": data["clust"].astype(int).astype(str),
925                "cccluster": "0",  # Dummy value
926                "ccdistance": 0.0,  # Dummy value
927                "rownum": list(range(len(data))),
928                "framegroup": 0,  # Dummy value
929            }
930        )
931        data = pd.concat([data[avg_cols], metadata], axis=1)
932        # Save the data
933        data.to_csv(os.path.join(output_path, f"{file_stub}.csv"), index=False)
934        pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data)
935        return os.path.exists(os.path.join(output_path, f"{file_stub}.rds"))

Save the events to OCULAR files. Relies on the dataframe originating from an OCULAR file (same columns; duplicate metadata/info).

Parameters
  • output_path: directory in which to write the OCULAR files.
  • event_type: "cells" or "others"; selects the output file names.
Returns
True if the summary .rds file now exists in output_path.
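To close the loop, a sketch of re-exporting loaded events; it assumes output_path already exists, since save_ocular joins file names onto it without creating the directory, and the directory and paths below are placeholders.

    import os

    out_dir = "ocular_out"  # hypothetical output directory
    os.makedirs(out_dir, exist_ok=True)
    events = EventArray.load_ocular("/path/to/slide_id/ocular")
    # Writes rc-final1..4.rds, rc-final.csv/.rds, and (if the
    # "ocular_interesting" column is present) ocular_interesting.csv/.rds
    events.save_ocular(out_dir, event_type="cells")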