csi_images.ocular_files

  1import os
  2
  3import numpy as np
  4import pandas as pd
  5
  6from csi_images.csi_events import EventArray
  7
# File names looked up inside an OCULAR report folder by the readers below.
# Frame-level metadata table; read by get_frame_info().
FRAME_INFO_FILE = "frameinfo.csv"
# Frame-level morphometric tables (means file, then deviations file);
# read and merged by get_frame_statistics().
FRAME_MORPHOMETRICS_FILES = ["framestat-means.csv", "framestat-dev.csv"]
# Slide-level morphometric table; read by get_slide_statistics().
SLIDE_MORPHOMETRICS_FILE = "slidestat-calc.csv"
 11
 12
 13def get_cells(report_path: str) -> EventArray:
 14    """
 15    Convenience function to read the cells (post-clustering) from OCULAR.
 16    :param report_path:
 17    :return:
 18    """
 19    return EventArray.load_ocular(report_path, event_type="cells")
 20
 21
 22def get_others(report_path: str) -> EventArray:
 23    """
 24    Convenience function to read the DAPI- events (post-clustering from OCULAR.
 25    :param report_path:
 26    :return:
 27    """
 28    return EventArray.load_ocular(report_path, event_type="others")
 29
 30
 31def save_cells(report_path: str, events: EventArray):
 32    """
 33    Convenience function to save the cells (post-clustering) from OCULAR.
 34    :param report_path:
 35    :param events:
 36    :return:
 37    """
 38    return events.save_ocular(report_path, event_type="cells")
 39
 40
 41def save_others(report_path: str, events: EventArray):
 42    """
 43    Convenience function to save the DAPI- events (post-clustering) from OCULAR.
 44    :param report_path:
 45    :param events:
 46    :return:
 47    """
 48    return events.save_ocular(report_path, event_type="others")
 49
 50
 51def get_frame_info(report_path: str) -> pd.DataFrame:
 52    """
 53    Reads frameinfo.csv with high-level frame metadata.
 54    :param report_path: path to the OCULAR report folder.
 55    :return: DataFrame with frame info
 56    """
 57    file_path = os.path.join(report_path, FRAME_INFO_FILE)
 58    if not os.path.isfile(file_path):
 59        raise FileNotFoundError(f"{file_path} not found")
 60    # Read, dropping the repetitive first column if it exists
 61    data = pd.read_csv(file_path).drop(columns=["Unnamed: 0"], errors="ignore")
 62    return data
 63
 64
 65def get_frame_statistics(report_path: str) -> pd.DataFrame:
 66    """
 67    Reads framestat-means.csv and framestat-dev.csv and merges them.
 68    :param report_path: path to the OCULAR report folder.
 69    :return:
 70    """
 71    # Check for existence of all files
 72    file_paths = []
 73    for file in FRAME_MORPHOMETRICS_FILES:
 74        file_path = os.path.join(report_path, file)
 75        if not os.path.isfile(file_path):
 76            raise FileNotFoundError(f"{file_path} not found")
 77        file_paths.append(file_path)
 78
 79    data = []
 80    # Read in the data from each of the files
 81    for file in file_paths:
 82        file_data = pd.read_csv(file)
 83        # Rename unnamed column to frame_id
 84        file_data = file_data.rename(columns={"Unnamed: 0": "frame_id"})
 85        # Add an appropriate prefix to the column names
 86        if "means" in file:
 87            prefix_name = "frame_mean_"
 88        elif "dev" in file:
 89            prefix_name = "frame_sdev_"
 90        else:
 91            # Unexpected file name; no prefix
 92            prefix_name = ""
 93        file_data = file_data.add_prefix(prefix_name)
 94        # Strip the prefix from the frame_id column
 95        file_data = file_data.rename(columns={prefix_name + "frame_id": "frame_id"})
 96        data.append(file_data)
 97    data = pd.merge(data[0], data[1], on="frame_id")
 98    return data
 99
100
def get_slide_statistics(report_path: str) -> pd.DataFrame:
    """
    Gets slide-level morphometric statistics from slidestat-calc.csv.

    The first column of the file is skipped. Row 0 is taken as the means and
    row 1 as the standard deviations; both are placed side by side in a single
    row, prefixed with "slide_mean_" and "slide_sdev_" respectively.
    :param report_path: path to the OCULAR report folder.
    :return: one-row DataFrame holding the prefixed slide statistics.
    :raises FileNotFoundError: if slidestat-calc.csv is missing.
    :raises IndexError: if the file has fewer than two data rows.
    """
    file_path = os.path.join(report_path, SLIDE_MORPHOMETRICS_FILE)
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"{file_path} not found")

    data = pd.read_csv(file_path)

    # Select each row as a one-row frame (list indexer) rather than as a
    # Series: a row Series taken across differently-typed columns is upcast
    # to object dtype, which would strip the numeric dtype from every column.
    # Row 0 is mean
    mean = data.iloc[[0], 1:].reset_index(drop=True)
    mean = mean.add_prefix("slide_mean_")

    # Row 1 is standard deviations
    sdev = data.iloc[[1], 1:].reset_index(drop=True)
    sdev = sdev.add_prefix("slide_sdev_")

    return pd.concat([mean, sdev], axis=1)
123
124
def merge_statistics(
    events: EventArray,
    frame_stats: pd.DataFrame,
    slide_stats: pd.DataFrame,
) -> EventArray:
    """
    Merges frame-level and slide-level morphometric statistics into the EventArray.

    The slide statistics are repeated once per frame and placed next to the
    frame statistics; the combined table is then left-joined onto the event
    features using each event's frame_id from events.metadata.
    :param events: EventArray object; it is copied, not modified.
    :param frame_stats: frame-level morphometric statistics with a frame_id column
    :param slide_stats: slide-level morphometric statistics
    :return: a new EventArray object with the merged data
    :raises ValueError: if an event's frame_id is absent from frame_stats.
    """
    # concat(axis=1) aligns on index labels, not on position. Renumber the
    # frame rows so a filtered or re-indexed frame_stats still lines up with
    # the freshly numbered slide rows instead of being padded with NaN.
    frame_stats = frame_stats.reset_index(drop=True)
    n_frames = len(frame_stats)
    if n_frames:
        repeated_slide_stats = pd.concat(
            [slide_stats] * n_frames, axis=0, ignore_index=True
        )
    else:
        # pd.concat refuses an empty list; keep the columns, no rows
        repeated_slide_stats = slide_stats.iloc[0:0]
    all_stats = pd.concat([frame_stats, repeated_slide_stats], axis=1)

    # Do not modify the original events
    events = events.copy()
    # Check that all event frame_ids are in the stats
    if not set(events.metadata["frame_id"]).issubset(set(all_stats["frame_id"])):
        raise ValueError("Not all frame_ids are present in the morphometric statistics")
    # Merge together using the frame_id, then drop the frame_id column from features
    events.add_features(pd.DataFrame({"frame_id": events.metadata["frame_id"]}))
    # Must be a left join to keep features in the same order as the events
    events.features = pd.merge(events.features, all_stats, on=["frame_id"], how="left")
    events.features = events.features.drop(columns=["frame_id"])
    return events
152
153
def filter_and_generate_statistics(
    events: EventArray,
    morphs_to_drop: list[str] = None,
    morphs_to_keep: list[str] = None,
) -> EventArray:
    """
    Filters the feature columns of the events and appends the computed
    "cell_sdom_frame_level_" features.

    Recognized entries of morphs_to_drop: "haralick" (columns containing
    ".h."), "theta", "blurred" (columns containing "B" + channel name),
    "multi_channel", "slide" and "frame".
    Recognized entries of morphs_to_keep: "mean_sd_q05" and "mean_q05".
    The computed features are always generated and always kept, whatever the
    filters are.
    :param events: EventArray object; it is copied, not modified.
    :param morphs_to_drop: names of feature groups to remove; None means none.
    :param morphs_to_keep: names of feature groups to restrict to; None means
        no restriction.
    :return: a new EventArray whose features hold the kept columns (original
        order preserved) followed by the computed features.
    """
    # Do not modify the original events
    events = events.copy()

    channel_names = ("dapi", "tritc", "fitc", "cy5")

    if morphs_to_drop is None:
        morphs_to_drop = []
    if morphs_to_keep is None:
        morphs_to_keep = []

    # Start from every column, then narrow down according to the requests
    columns_to_keep = list(events.features.columns)
    if "haralick" in morphs_to_drop:
        columns_to_keep = [col for col in columns_to_keep if ".h." not in col]
    # remove theta
    if "theta" in morphs_to_drop:
        columns_to_keep = [col for col in columns_to_keep if "theta" not in col]
    # remove blurred and then extracted
    if "blurred" in morphs_to_drop:
        columns_to_keep = [
            col
            for col in columns_to_keep
            if not any(f"B{name}" in col for name in channel_names)
        ]
    # Remove columns that refer to more than one channel.
    # NOTE(review): this used to filter once per channel in sequence, which
    # only kept columns naming ALL four channels (i.e. practically nothing).
    # Columns naming at most one channel are kept now - confirm that this is
    # the intended meaning of "multi_channel".
    if "multi_channel" in morphs_to_drop:
        columns_to_keep = [
            col
            for col in columns_to_keep
            if sum(f".{name}." in col for name in channel_names) <= 1
        ]
    # keep only mean, sd, and median
    if "mean_sd_q05" in morphs_to_keep:
        columns_to_keep = [
            col
            for col in columns_to_keep
            if (".mean" in col) or (".sd" in col) or (".q05" in col)
        ]
    # keep only mean and median
    if "mean_q05" in morphs_to_keep:
        columns_to_keep = [
            col for col in columns_to_keep if (".mean" in col) or (".q05" in col)
        ]
    # remove slide level info
    if "slide" in morphs_to_drop:
        columns_to_keep = [col for col in columns_to_keep if "slide" not in col]
    # remove frame level info
    if "frame" in morphs_to_drop:
        columns_to_keep = [col for col in columns_to_keep if "frame" not in col]

    # Cell features for which a frame-level SDOM is computed
    stat_suffixes = ("mean", "sd", "mad", "q001", "q005", "q05", "q095", "q099")
    cell_features_for_sdom_frame_level = [
        f"cellf.tritc.b.{suffix}" for suffix in stat_suffixes
    ]
    cell_features_for_sdom_frame_level.append("tritc_cy5_ratio")
    cell_features_for_sdom_frame_level += [
        f"cellf.fitc.b.{suffix}" for suffix in stat_suffixes
    ]
    cell_features_for_sdom_frame_level += [
        f"cellf.cy5.b.{suffix}" for suffix in stat_suffixes
    ]
    cell_features_for_sdom_frame_level += [
        f"nucleusf.dapi.b.{suffix}" for suffix in stat_suffixes
    ]

    # Calculate SDOMs
    sdom_prefix = "cell_sdom_frame_level_"
    computed_features = [
        sdom_prefix + item for item in cell_features_for_sdom_frame_level
    ]
    events.features = _generate_computed_features(events.features, computed_features)

    # Drop duplicates while keeping a deterministic order (a set would shuffle
    # the columns); this also avoids selecting a computed feature twice when
    # it already existed in the input features.
    columns_to_keep = list(dict.fromkeys(columns_to_keep + computed_features))
    events.features = events.features[columns_to_keep]

    return events
277
278
# features_to_compute lists the names of the columns to generate; a name of
# the form "cell_sdom_frame_level_" + <cell feature> selects the SDOM of that
# cell feature.
def _generate_computed_features(df: pd.DataFrame, features_to_compute: list[str]):
    """
    Adds one SDOM column to the dataframe per requested feature.
    :param df: DataFrame with the features; new columns are written into it.
    :param features_to_compute: names of the columns to compute. Only names
        containing "cell_sdom_frame_level_" are handled; others are skipped.
    :return: the same DataFrame object, with the computed columns added.
    """
    # TODO: figure out if there were supposed to be more than 1 type...
    # Type 1: cell_sdom_frame_level_ features
    feature_prefix = "cell_sdom_frame_level_"

    for name in features_to_compute:
        if feature_prefix not in name:
            continue
        df[name] = _calculate_sdom(df, name, sdom_string_prefix=feature_prefix)

    return df
298
299
300def _calculate_sdom(
301    df: pd.DataFrame,
302    sdom_frame_feature: str,
303    sdom_string_prefix: str = "cell_sdom_frame_level_",
304    fill_na=0,
305) -> pd.DataFrame:
306    # split string
307    split_string = sdom_frame_feature.split(sdom_string_prefix)
308    # get cell feature
309    cell_feature = split_string[1]
310    # get corresponding frame feature
311    frame_feature = "frame_mean_" + cell_feature
312    # get corresponding frame sdev feature
313    frame_feature_sd = "frame_sdev_" + cell_feature
314
315    df[sdom_frame_feature] = df[cell_feature] - df[frame_feature]
316    # if a frame has only 1 cell then this will divide by zero
317    # that is why i use the divide function of pandas
318    df[sdom_frame_feature] = df[sdom_frame_feature].divide(df[frame_feature_sd])
319
320    # replace infinity by zero
321    df[sdom_frame_feature].replace(np.inf, 0)
322
323    if fill_na is not None and df[sdom_frame_feature].isna().any():
324        n_of_nas = df[sdom_frame_feature][df[sdom_frame_feature].isna()].index
325        print(
326            f"Replacing nan with zero for {sdom_frame_feature} , number of entries: {len(n_of_nas)}"
327        )
328        df[sdom_frame_feature].fillna(fill_na, inplace=True)
329
330    # TODO: clean this up, it either needs to replace or return (probably return)
331    return df[sdom_frame_feature]
FRAME_INFO_FILE = 'frameinfo.csv'
FRAME_MORPHOMETRICS_FILES = ['framestat-means.csv', 'framestat-dev.csv']
SLIDE_MORPHOMETRICS_FILE = 'slidestat-calc.csv'
def get_cells(report_path: str) -> csi_images.csi_events.EventArray:
14def get_cells(report_path: str) -> EventArray:
15    """
16    Convenience function to read the cells (post-clustering) from OCULAR.
17    :param report_path:
18    :return:
19    """
20    return EventArray.load_ocular(report_path, event_type="cells")

Convenience function to read the cells (post-clustering) from OCULAR.

Parameters
  • report_path:
Returns
def get_others(report_path: str) -> csi_images.csi_events.EventArray:
23def get_others(report_path: str) -> EventArray:
24    """
25    Convenience function to read the DAPI- events (post-clustering from OCULAR.
26    :param report_path:
27    :return:
28    """
29    return EventArray.load_ocular(report_path, event_type="others")

Convenience function to read the DAPI- events (post-clustering) from OCULAR.

Parameters
  • report_path:
Returns
def save_cells(report_path: str, events: csi_images.csi_events.EventArray):
32def save_cells(report_path: str, events: EventArray):
33    """
34    Convenience function to save the cells (post-clustering) from OCULAR.
35    :param report_path:
36    :param events:
37    :return:
38    """
39    return events.save_ocular(report_path, event_type="cells")

Convenience function to save the cells (post-clustering) from OCULAR.

Parameters
  • report_path:
  • events:
Returns
def save_others(report_path: str, events: csi_images.csi_events.EventArray):
42def save_others(report_path: str, events: EventArray):
43    """
44    Convenience function to save the DAPI- events (post-clustering) from OCULAR.
45    :param report_path:
46    :param events:
47    :return:
48    """
49    return events.save_ocular(report_path, event_type="others")

Convenience function to save the DAPI- events (post-clustering) from OCULAR.

Parameters
  • report_path:
  • events:
Returns
def get_frame_info(report_path: str) -> pandas.core.frame.DataFrame:
52def get_frame_info(report_path: str) -> pd.DataFrame:
53    """
54    Reads frameinfo.csv with high-level frame metadata.
55    :param report_path: path to the OCULAR report folder.
56    :return: DataFrame with frame info
57    """
58    file_path = os.path.join(report_path, FRAME_INFO_FILE)
59    if not os.path.isfile(file_path):
60        raise FileNotFoundError(f"{file_path} not found")
61    # Read, dropping the repetitive first column if it exists
62    data = pd.read_csv(file_path).drop(columns=["Unnamed: 0"], errors="ignore")
63    return data

Reads frameinfo.csv with high-level frame metadata.

Parameters
  • report_path: path to the OCULAR report folder.
Returns

DataFrame with frame info

def get_frame_statistics(report_path: str) -> pandas.core.frame.DataFrame:
66def get_frame_statistics(report_path: str) -> pd.DataFrame:
67    """
68    Reads framestat-means.csv and framestat-dev.csv and merges them.
69    :param report_path: path to the OCULAR report folder.
70    :return:
71    """
72    # Check for existence of all files
73    file_paths = []
74    for file in FRAME_MORPHOMETRICS_FILES:
75        file_path = os.path.join(report_path, file)
76        if not os.path.isfile(file_path):
77            raise FileNotFoundError(f"{file_path} not found")
78        file_paths.append(file_path)
79
80    data = []
81    # Read in the data from each of the files
82    for file in file_paths:
83        file_data = pd.read_csv(file)
84        # Rename unnamed column to frame_id
85        file_data = file_data.rename(columns={"Unnamed: 0": "frame_id"})
86        # Add an appropriate prefix to the column names
87        if "means" in file:
88            prefix_name = "frame_mean_"
89        elif "dev" in file:
90            prefix_name = "frame_sdev_"
91        else:
92            # Unexpected file name; no prefix
93            prefix_name = ""
94        file_data = file_data.add_prefix(prefix_name)
95        # Strip the prefix from the frame_id column
96        file_data = file_data.rename(columns={prefix_name + "frame_id": "frame_id"})
97        data.append(file_data)
98    data = pd.merge(data[0], data[1], on="frame_id")
99    return data

Reads framestat-means.csv and framestat-dev.csv and merges them.

Parameters
  • report_path: path to the OCULAR report folder.
Returns
def get_slide_statistics(report_path: str) -> pandas.core.frame.DataFrame:
102def get_slide_statistics(report_path: str) -> pd.DataFrame:
103    """
104    Gets slide-level morphometric statistics from slidestat-calc.csv.
105    :param report_path: path to the OCULAR report folder.
106    :return:
107    """
108    file_path = os.path.join(report_path, SLIDE_MORPHOMETRICS_FILE)
109    if not os.path.isfile(file_path):
110        raise FileNotFoundError(f"{file_path} not found")
111
112    data = pd.read_csv(file_path)
113
114    # Row 0 is mean; convert to dataframe and transpose
115    mean = data.iloc[0, 1:].to_frame().transpose().reset_index(drop=True)
116    mean = mean.add_prefix("slide_mean_")
117
118    # Row 1 is standard deviations; convert to dataframe and transpose
119    sdev = data.iloc[1, 1:].to_frame().transpose().reset_index(drop=True)
120    sdev = sdev.add_prefix("slide_sdev_")
121
122    data = pd.concat([mean, sdev], axis=1)
123    return data

Gets slide-level morphometric statistics from slidestat-calc.csv.

Parameters
  • report_path: path to the OCULAR report folder.
Returns
def merge_statistics( events: csi_images.csi_events.EventArray, frame_stats: pandas.core.frame.DataFrame, slide_stats: pandas.core.frame.DataFrame) -> csi_images.csi_events.EventArray:
126def merge_statistics(
127    events: EventArray,
128    frame_stats: pd.DataFrame,
129    slide_stats: pd.DataFrame,
130) -> EventArray:
131    """
132    Merges frame-level and slide-level morphometric statistics into the EventArray.
133    :param events: EventArray object
134    :param frame_stats: frame-level morphometric statistics
135    :param slide_stats: slide-level morphometric statistics
136    :return: a new EventArray object with the merged data
137    """
138    # Create a combined slide and frame stats dataframe (1 + 2*761 columns)
139    slide_stats = pd.concat([slide_stats] * len(frame_stats), axis=0, ignore_index=True)
140    all_stats = pd.concat([frame_stats, slide_stats], axis=1)
141
142    # Do not modify the original events
143    events = events.copy()
144    # Check that all event frame_ids are in the stats
145    if not set(events.metadata["frame_id"]).issubset(set(all_stats["frame_id"])):
146        raise ValueError("Not all frame_ids are present in the morphometric statistics")
147    # Merge together using the frame_id, then drop the frame_id column from features
148    events.add_features(pd.DataFrame({"frame_id": events.metadata["frame_id"]}))
149    # Must be a left join to keep features in the same order! Finding that took me 2h
150    events.features = pd.merge(events.features, all_stats, on=["frame_id"], how="left")
151    events.features = events.features.drop(columns=["frame_id"])
152    return events

Merges frame-level and slide-level morphometric statistics into the EventArray.

Parameters
  • events: EventArray object
  • frame_stats: frame-level morphometric statistics
  • slide_stats: slide-level morphometric statistics
Returns

a new EventArray object with the merged data

def filter_and_generate_statistics( events: csi_images.csi_events.EventArray, morphs_to_drop: list[str] = None, morphs_to_keep: list[str] = None) -> csi_images.csi_events.EventArray:
155def filter_and_generate_statistics(
156    events: EventArray,
157    morphs_to_drop: list[str] = None,
158    morphs_to_keep: list[str] = None,
159) -> EventArray:
160    """
161
162    :param events:
163    :param morphs_to_drop:
164    :param morphs_to_keep:
165    :return:
166    """
167    # Do not modify the original events
168    events = events.copy()
169    # columns to keep are the columns kept after pull
170    # initialize that variable to all columns for starters
171
172    channels = {
173        "D": "dapi",
174        "CK": "tritc",
175        "V": "fitc",
176        "CD": "cy5",
177    }
178
179    if morphs_to_drop is None:
180        morphs_to_drop = []
181    if morphs_to_keep is None:
182        morphs_to_keep = []
183    # 'multi_channel' problematic
184
185    # Identify columns that should be kept, considering the morphs_to_drop
186    columns_to_keep = list(events.features.columns)
187    if "haralick" in morphs_to_drop:
188        columns_to_keep = [col for col in columns_to_keep if ".h." not in col]
189    # remove theta
190    if "theta" in morphs_to_drop:
191        columns_to_keep = [col for col in columns_to_keep if "theta" not in col]
192    # remove blurred and then extracted
193    if "blurred" in morphs_to_drop:
194        for channel in channels.keys():
195            columns_to_keep = [
196                col for col in columns_to_keep if f"B{channels[channel]}" not in col
197            ]
198    # Remove everything that is not only for one channel
199    if "multi_channel" in morphs_to_drop:
200        for channel in channels.keys():
201            columns_to_keep = [
202                col for col in columns_to_keep if f".{channels[channel]}." in col
203            ]
204    # keep only mean, sd, and median
205    if "mean_sd_q05" in morphs_to_keep:
206        columns_to_keep = [
207            col
208            for col in columns_to_keep
209            if (".mean" in col) or (".sd" in col) or (".q05" in col)
210        ]
211    # keep only mean and median
212    if "mean_q05" in morphs_to_keep:
213        columns_to_keep = [
214            col for col in columns_to_keep if (".mean" in col) or (".q05" in col)
215        ]
216    # remove slide level info
217    if "slide" in morphs_to_drop:
218        columns_to_keep = [col for col in columns_to_keep if "slide" not in col]
219    # remove frame level info
220    if "frame" in morphs_to_drop:
221        columns_to_keep = [col for col in columns_to_keep if "frame" not in col]
222    # drop duplicates
223    columns_to_keep = list(set(columns_to_keep))
224
225    cell_features_for_sdom_frame_level = [
226        "cellf.tritc.b.mean",
227        "cellf.tritc.b.sd",
228        "cellf.tritc.b.mad",
229        "cellf.tritc.b.q001",
230        "cellf.tritc.b.q005",
231        "cellf.tritc.b.q05",
232        "cellf.tritc.b.q095",
233        "cellf.tritc.b.q099",
234        "tritc_cy5_ratio",
235    ]
236    cell_features_for_sdom_frame_level += [
237        "cellf.fitc.b.mean",
238        "cellf.fitc.b.sd",
239        "cellf.fitc.b.mad",
240        "cellf.fitc.b.q001",
241        "cellf.fitc.b.q005",
242        "cellf.fitc.b.q05",
243        "cellf.fitc.b.q095",
244        "cellf.fitc.b.q099",
245    ]
246    cell_features_for_sdom_frame_level += [
247        "cellf.cy5.b.mean",
248        "cellf.cy5.b.sd",
249        "cellf.cy5.b.mad",
250        "cellf.cy5.b.q001",
251        "cellf.cy5.b.q005",
252        "cellf.cy5.b.q05",
253        "cellf.cy5.b.q095",
254        "cellf.cy5.b.q099",
255    ]
256    cell_features_for_sdom_frame_level += [
257        "nucleusf.dapi.b.mean",
258        "nucleusf.dapi.b.sd",
259        "nucleusf.dapi.b.mad",
260        "nucleusf.dapi.b.q001",
261        "nucleusf.dapi.b.q005",
262        "nucleusf.dapi.b.q05",
263        "nucleusf.dapi.b.q095",
264        "nucleusf.dapi.b.q099",
265    ]
266    # Calculate SDOMs
267    computed_features = []
268    sdom_prefix = "cell_sdom_frame_level_"
269    computed_features += [
270        sdom_prefix + item for item in cell_features_for_sdom_frame_level
271    ]
272    events.features = _generate_computed_features(events.features, computed_features)
273
274    columns_to_keep += computed_features
275    events.features = events.features[columns_to_keep]
276
277    return events
Parameters
  • events:
  • morphs_to_drop:
  • morphs_to_keep:
Returns