csi_images.ocular_files
"""
Helpers for reading, merging and filtering the CSV outputs found in an OCULAR
report folder (frame info, frame-level and slide-level morphometric statistics).
"""

import os

import numpy as np
import pandas as pd

from csi_images.csi_events import EventArray

FRAME_INFO_FILE = "frameinfo.csv"
FRAME_MORPHOMETRICS_FILES = ["framestat-means.csv", "framestat-dev.csv"]
SLIDE_MORPHOMETRICS_FILE = "slidestat-calc.csv"


def get_cells(report_path: str) -> EventArray:
    """
    Convenience function to read the cells (post-clustering) from OCULAR.
    :param report_path: path to the OCULAR report folder.
    :return: the EventArray returned by EventArray.load_ocular for "cells".
    """
    return EventArray.load_ocular(report_path, event_type="cells")


def get_others(report_path: str) -> EventArray:
    """
    Convenience function to read the DAPI- events (post-clustering) from OCULAR.
    :param report_path: path to the OCULAR report folder.
    :return: the EventArray returned by EventArray.load_ocular for "others".
    """
    return EventArray.load_ocular(report_path, event_type="others")


def save_cells(report_path: str, events: EventArray):
    """
    Convenience function to save the cells (post-clustering) in OCULAR format.
    :param report_path: path to the OCULAR report folder.
    :param events: EventArray to save.
    :return: whatever EventArray.save_ocular returns.
    """
    return events.save_ocular(report_path, event_type="cells")


def save_others(report_path: str, events: EventArray):
    """
    Convenience function to save the DAPI- events (post-clustering) in OCULAR format.
    :param report_path: path to the OCULAR report folder.
    :param events: EventArray to save.
    :return: whatever EventArray.save_ocular returns.
    """
    return events.save_ocular(report_path, event_type="others")


def get_frame_info(report_path: str) -> pd.DataFrame:
    """
    Reads frameinfo.csv with high-level frame metadata.
    :param report_path: path to the OCULAR report folder.
    :return: DataFrame with frame info
    :raises FileNotFoundError: if frameinfo.csv is not in report_path.
    """
    file_path = os.path.join(report_path, FRAME_INFO_FILE)
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"{file_path} not found")
    # Read, dropping the repetitive first column if it exists
    data = pd.read_csv(file_path).drop(columns=["Unnamed: 0"], errors="ignore")
    return data


def get_frame_statistics(report_path: str) -> pd.DataFrame:
    """
    Reads framestat-means.csv and framestat-dev.csv and merges them on frame_id.
    Columns from the means file are prefixed with "frame_mean_" and columns from
    the dev file with "frame_sdev_"; the "frame_id" column keeps its name.
    :param report_path: path to the OCULAR report folder.
    :return: DataFrame with one "frame_id" column plus the prefixed statistics.
    :raises FileNotFoundError: if either statistics file is missing.
    """
    # Check for existence of all files before reading any of them
    file_paths = []
    for file_name in FRAME_MORPHOMETRICS_FILES:
        file_path = os.path.join(report_path, file_name)
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"{file_path} not found")
        file_paths.append(file_path)

    data = []
    # Read in the data from each of the files
    for file_path in file_paths:
        file_data = pd.read_csv(file_path)
        # Rename unnamed column to frame_id
        file_data = file_data.rename(columns={"Unnamed: 0": "frame_id"})
        # Choose the prefix from the file name only; matching against the full
        # path would mislabel files when a directory name contains "means"/"dev"
        file_name = os.path.basename(file_path)
        if "means" in file_name:
            prefix_name = "frame_mean_"
        elif "dev" in file_name:
            prefix_name = "frame_sdev_"
        else:
            # Unexpected file name; no prefix
            prefix_name = ""
        file_data = file_data.add_prefix(prefix_name)
        # Strip the prefix from the frame_id column
        file_data = file_data.rename(columns={prefix_name + "frame_id": "frame_id"})
        data.append(file_data)
    # pd.merge defaults to an inner join on frame_id
    data = pd.merge(data[0], data[1], on="frame_id")
    return data


def get_slide_statistics(report_path: str) -> pd.DataFrame:
    """
    Gets slide-level morphometric statistics from slidestat-calc.csv.
    :param report_path: path to the OCULAR report folder.
    :return: single-row DataFrame with "slide_mean_*" and "slide_sdev_*" columns.
    :raises FileNotFoundError: if slidestat-calc.csv is not in report_path.
    """
    file_path = os.path.join(report_path, SLIDE_MORPHOMETRICS_FILE)
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"{file_path} not found")

    data = pd.read_csv(file_path)

    # Row 0 is mean; skip the first column, convert to dataframe and transpose
    mean = data.iloc[0, 1:].to_frame().transpose().reset_index(drop=True)
    mean = mean.add_prefix("slide_mean_")

    # Row 1 is standard deviations; same treatment
    sdev = data.iloc[1, 1:].to_frame().transpose().reset_index(drop=True)
    sdev = sdev.add_prefix("slide_sdev_")

    data = pd.concat([mean, sdev], axis=1)
    return data


def merge_statistics(
    events: EventArray,
    frame_stats: pd.DataFrame,
    slide_stats: pd.DataFrame,
) -> EventArray:
    """
    Merges frame-level and slide-level morphometric statistics into the EventArray.
    :param events: EventArray object
    :param frame_stats: frame-level morphometric statistics
    :param slide_stats: slide-level morphometric statistics
    :return: a new EventArray object with the merged data
    :raises ValueError: if an event's frame_id is missing from frame_stats.
    """
    # Repeat the slide stats once per frame so they can sit next to the frame stats
    # NOTE(review): the column-wise concat aligns on the index, so frame_stats is
    # assumed to carry a default 0..n-1 index - confirm against callers.
    slide_stats = pd.concat([slide_stats] * len(frame_stats), axis=0, ignore_index=True)
    all_stats = pd.concat([frame_stats, slide_stats], axis=1)

    # Do not modify the original events
    events = events.copy()
    # Check that all event frame_ids are in the stats
    if not set(events.metadata["frame_id"]).issubset(set(all_stats["frame_id"])):
        raise ValueError("Not all frame_ids are present in the morphometric statistics")
    # Merge together using the frame_id, then drop the frame_id column from features
    events.add_features(pd.DataFrame({"frame_id": events.metadata["frame_id"]}))
    # Must be a left join to keep features in the same order as the events
    events.features = pd.merge(events.features, all_stats, on=["frame_id"], how="left")
    events.features = events.features.drop(columns=["frame_id"])
    return events


def filter_and_generate_statistics(
    events: EventArray,
    morphs_to_drop: list[str] = None,
    morphs_to_keep: list[str] = None,
) -> EventArray:
    """
    Filters feature columns by name and appends frame-level SDOM features.

    Filters are substring matches on the column names of events.features.
    Recognized morphs_to_drop entries: "haralick" (".h."), "theta", "blurred"
    ("B<channel>"), "multi_channel", "slide", "frame".
    Recognized morphs_to_keep entries: "mean_sd_q05", "mean_q05".
    The SDOM columns are named "cell_sdom_frame_level_<feature>" and are always
    added, regardless of the filters.

    :param events: EventArray whose features already include the frame-level
        statistics columns used for the SDOM computation.
    :param morphs_to_drop: names of column groups to remove; None means none.
    :param morphs_to_keep: names of column groups to restrict to; None means none.
    :return: a new EventArray with the filtered and computed feature columns.
    """
    # Do not modify the original events
    events = events.copy()

    channel_names = ("dapi", "tritc", "fitc", "cy5")
    morphs_to_drop = [] if morphs_to_drop is None else morphs_to_drop
    morphs_to_keep = [] if morphs_to_keep is None else morphs_to_keep

    # Start from every column and narrow the selection down filter by filter
    columns_to_keep = list(events.features.columns)
    if "haralick" in morphs_to_drop:
        columns_to_keep = [col for col in columns_to_keep if ".h." not in col]
    if "theta" in morphs_to_drop:
        columns_to_keep = [col for col in columns_to_keep if "theta" not in col]
    # Remove features extracted from blurred images
    if "blurred" in morphs_to_drop:
        columns_to_keep = [
            col
            for col in columns_to_keep
            if not any(f"B{name}" in col for name in channel_names)
        ]
    # NOTE(review): as in the original implementation, this keeps only columns
    # that mention *every* channel, which looks like the opposite of the stated
    # intent ("remove everything that is not only for one channel"). Behavior is
    # left unchanged until the intended semantics are confirmed.
    if "multi_channel" in morphs_to_drop:
        columns_to_keep = [
            col
            for col in columns_to_keep
            if all(f".{name}." in col for name in channel_names)
        ]
    # Keep only mean, sd, and median
    if "mean_sd_q05" in morphs_to_keep:
        columns_to_keep = [
            col
            for col in columns_to_keep
            if (".mean" in col) or (".sd" in col) or (".q05" in col)
        ]
    # Keep only mean and median
    if "mean_q05" in morphs_to_keep:
        columns_to_keep = [
            col for col in columns_to_keep if (".mean" in col) or (".q05" in col)
        ]
    # Remove slide-level info
    if "slide" in morphs_to_drop:
        columns_to_keep = [col for col in columns_to_keep if "slide" not in col]
    # Remove frame-level info
    if "frame" in morphs_to_drop:
        columns_to_keep = [col for col in columns_to_keep if "frame" not in col]

    # Cell-level features for which a frame-level SDOM is computed
    stat_suffixes = ("mean", "sd", "mad", "q001", "q005", "q05", "q095", "q099")
    cell_features_for_sdom_frame_level = [
        f"cellf.tritc.b.{suffix}" for suffix in stat_suffixes
    ]
    cell_features_for_sdom_frame_level.append("tritc_cy5_ratio")
    for region_and_channel in ("cellf.fitc", "cellf.cy5", "nucleusf.dapi"):
        cell_features_for_sdom_frame_level += [
            f"{region_and_channel}.b.{suffix}" for suffix in stat_suffixes
        ]

    # Calculate SDOMs
    sdom_prefix = "cell_sdom_frame_level_"
    computed_features = [
        sdom_prefix + item for item in cell_features_for_sdom_frame_level
    ]
    events.features = _generate_computed_features(events.features, computed_features)

    # Drop duplicates while preserving order: set() would give a column order that
    # changes between runs, and a duplicated name would select a column twice
    columns_to_keep = list(dict.fromkeys(columns_to_keep + computed_features))
    events.features = events.features[columns_to_keep]

    return events


# features_to_compute is a list containing the names of the features you want,
# e.g. "cell_sdom_frame_level_" + cell feature will generate the matching SDOM
def _generate_computed_features(df: pd.DataFrame, features_to_compute: list[str]):
    """
    Calculates SDOMs for the given features, adding them as columns in the dataframe.
    :param df: DataFrame with the features; it is modified in place.
    :param features_to_compute: names of the SDOM columns to compute; entries
        that do not contain "cell_sdom_frame_level_" are ignored.
    :return: the same DataFrame, with the computed columns added.
    """
    # TODO: figure out if there were supposed to be more than 1 type...
    # Type 1: cell_sdom_frame_level_ features:
    feature_prefix = "cell_sdom_frame_level_"
    # identify these computed features
    sdom_frame_features = [s for s in features_to_compute if feature_prefix in s]

    for feature in sdom_frame_features:
        df[feature] = _calculate_sdom(df, feature, sdom_string_prefix=feature_prefix)

    return df


def _calculate_sdom(
    df: pd.DataFrame,
    sdom_frame_feature: str,
    sdom_string_prefix: str = "cell_sdom_frame_level_",
    fill_na=0,
) -> pd.Series:
    """
    Computes (cell feature - frame mean) / frame sdev for one feature.
    :param df: DataFrame holding the cell feature and its "frame_mean_" and
        "frame_sdev_" counterparts; it is not modified.
    :param sdom_frame_feature: name of the SDOM column, i.e. prefix + cell feature.
    :param sdom_string_prefix: prefix to strip to recover the cell feature name.
    :param fill_na: value substituted for NaN results; None leaves NaNs in place.
    :return: Series named sdom_frame_feature with the computed values.
    """
    # Recover the cell feature name, then derive the matching frame-level columns
    cell_feature = sdom_frame_feature.split(sdom_string_prefix)[1]
    frame_feature = "frame_mean_" + cell_feature
    frame_feature_sd = "frame_sdev_" + cell_feature

    # Series.divide yields inf/nan instead of raising when the sdev is zero,
    # which happens e.g. when a frame has only one cell
    sdom = (df[cell_feature] - df[frame_feature]).divide(df[frame_feature_sd])

    # Replace infinities (either sign) by zero; the result must be re-assigned
    # because Series.replace returns a new Series
    sdom = sdom.replace([np.inf, -np.inf], 0)

    if fill_na is not None:
        n_of_nas = int(sdom.isna().sum())
        if n_of_nas:
            print(
                f"Replacing nan with zero for {sdom_frame_feature} , number of entries: {n_of_nas}"
            )
            sdom = sdom.fillna(fill_na)

    return sdom.rename(sdom_frame_feature)
FRAME_INFO_FILE =
'frameinfo.csv'
FRAME_MORPHOMETRICS_FILES =
['framestat-means.csv', 'framestat-dev.csv']
SLIDE_MORPHOMETRICS_FILE =
'slidestat-calc.csv'
def get_cells(report_path: str) -> EventArray:
    """
    Load the post-clustering "cells" events of an OCULAR report.
    :param report_path: forwarded unchanged to EventArray.load_ocular.
    :return: the EventArray produced by EventArray.load_ocular.
    """
    cells = EventArray.load_ocular(report_path, event_type="cells")
    return cells
Convenience function to read the cells (post-clustering) from OCULAR.
Parameters
- report_path:
Returns
def get_others(report_path: str) -> EventArray:
    """
    Load the post-clustering DAPI- ("others") events of an OCULAR report.
    :param report_path: forwarded unchanged to EventArray.load_ocular.
    :return: the EventArray produced by EventArray.load_ocular.
    """
    others = EventArray.load_ocular(report_path, event_type="others")
    return others
Convenience function to read the DAPI- events (post-clustering) from OCULAR.
Parameters
- report_path:
Returns
def save_cells(report_path: str, events: EventArray):
    """
    Write the post-clustering "cells" events via EventArray.save_ocular.
    :param report_path: forwarded unchanged to events.save_ocular.
    :param events: the EventArray to be saved.
    :return: whatever events.save_ocular returns.
    """
    outcome = events.save_ocular(report_path, event_type="cells")
    return outcome
Convenience function to save the cells (post-clustering) from OCULAR.
Parameters
- report_path:
- events:
Returns
def save_others(report_path: str, events: EventArray):
    """
    Write the post-clustering DAPI- ("others") events via EventArray.save_ocular.
    :param report_path: forwarded unchanged to events.save_ocular.
    :param events: the EventArray to be saved.
    :return: whatever events.save_ocular returns.
    """
    outcome = events.save_ocular(report_path, event_type="others")
    return outcome
Convenience function to save the DAPI- events (post-clustering) from OCULAR.
Parameters
- report_path:
- events:
Returns
def
get_frame_info(report_path: str) -> pandas.core.frame.DataFrame:
def get_frame_info(report_path: str) -> pd.DataFrame:
    """
    Load the high-level frame metadata stored in frameinfo.csv.
    :param report_path: path to the OCULAR report folder.
    :return: DataFrame with frame info, without the "Unnamed: 0" column.
    :raises FileNotFoundError: if frameinfo.csv is not in report_path.
    """
    csv_path = os.path.join(report_path, FRAME_INFO_FILE)
    if os.path.isfile(csv_path):
        frame_info = pd.read_csv(csv_path)
        # The unnamed first column is redundant; tolerate its absence
        return frame_info.drop(columns=["Unnamed: 0"], errors="ignore")
    raise FileNotFoundError(f"{csv_path} not found")
Reads frameinfo.csv with high-level frame metadata.
Parameters
- report_path: path to the OCULAR report folder.
Returns
DataFrame with frame info
def
get_frame_statistics(report_path: str) -> pandas.core.frame.DataFrame:
def get_frame_statistics(report_path: str) -> pd.DataFrame:
    """
    Reads framestat-means.csv and framestat-dev.csv and merges them on frame_id.
    Columns from the means file are prefixed with "frame_mean_" and columns from
    the dev file with "frame_sdev_"; the "frame_id" column keeps its name.
    :param report_path: path to the OCULAR report folder.
    :return: DataFrame with one "frame_id" column plus the prefixed statistics.
    :raises FileNotFoundError: if either statistics file is missing.
    """
    # Check for existence of all files before reading any of them
    file_paths = []
    for file_name in FRAME_MORPHOMETRICS_FILES:
        file_path = os.path.join(report_path, file_name)
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"{file_path} not found")
        file_paths.append(file_path)

    data = []
    # Read in the data from each of the files
    for file_path in file_paths:
        file_data = pd.read_csv(file_path)
        # Rename unnamed column to frame_id
        file_data = file_data.rename(columns={"Unnamed: 0": "frame_id"})
        # Choose the prefix from the file name only; matching against the full
        # path would mislabel files when a directory name contains "means"/"dev"
        file_name = os.path.basename(file_path)
        if "means" in file_name:
            prefix_name = "frame_mean_"
        elif "dev" in file_name:
            prefix_name = "frame_sdev_"
        else:
            # Unexpected file name; no prefix
            prefix_name = ""
        file_data = file_data.add_prefix(prefix_name)
        # Strip the prefix from the frame_id column
        file_data = file_data.rename(columns={prefix_name + "frame_id": "frame_id"})
        data.append(file_data)
    # pd.merge defaults to an inner join on frame_id
    data = pd.merge(data[0], data[1], on="frame_id")
    return data
Reads framestat-means.csv and framestat-dev.csv and merges them.
Parameters
- report_path: path to the OCULAR report folder.
Returns
def
get_slide_statistics(report_path: str) -> pandas.core.frame.DataFrame:
def get_slide_statistics(report_path: str) -> pd.DataFrame:
    """
    Load the slide-level morphometric statistics from slidestat-calc.csv.
    :param report_path: path to the OCULAR report folder.
    :return: single-row DataFrame with "slide_mean_*" and "slide_sdev_*" columns.
    :raises FileNotFoundError: if slidestat-calc.csv is not in report_path.
    """
    csv_path = os.path.join(report_path, SLIDE_MORPHOMETRICS_FILE)
    if not os.path.isfile(csv_path):
        raise FileNotFoundError(f"{csv_path} not found")

    raw = pd.read_csv(csv_path)

    # Row 0 holds the means and row 1 the standard deviations; the first column
    # is skipped and each row becomes a one-row frame with prefixed names
    pieces = []
    for row_number, prefix in ((0, "slide_mean_"), (1, "slide_sdev_")):
        one_row = raw.iloc[row_number, 1:].to_frame().transpose()
        one_row = one_row.reset_index(drop=True)
        pieces.append(one_row.add_prefix(prefix))

    return pd.concat(pieces, axis=1)
Gets slide-level morphometric statistics from slidestat-calc.csv.
Parameters
- report_path: path to the OCULAR report folder.
Returns
def
merge_statistics( events: csi_images.csi_events.EventArray, frame_stats: pandas.core.frame.DataFrame, slide_stats: pandas.core.frame.DataFrame) -> csi_images.csi_events.EventArray:
def merge_statistics(
    events: EventArray,
    frame_stats: pd.DataFrame,
    slide_stats: pd.DataFrame,
) -> EventArray:
    """
    Attach frame-level and slide-level morphometric statistics to each event.
    :param events: EventArray object; it is copied, not modified.
    :param frame_stats: frame-level morphometric statistics with a frame_id column
    :param slide_stats: slide-level morphometric statistics
    :return: a new EventArray object with the merged data
    :raises ValueError: if an event's frame_id is missing from frame_stats.
    """
    # Repeat the slide stats once per frame and place them beside the frame stats
    n_frames = len(frame_stats)
    repeated_slide_stats = pd.concat(
        [slide_stats] * n_frames, axis=0, ignore_index=True
    )
    combined_stats = pd.concat([frame_stats, repeated_slide_stats], axis=1)

    # Work on a copy so the caller's events stay untouched
    merged = events.copy()

    # Every frame_id referenced by an event must have statistics available
    event_frame_ids = set(merged.metadata["frame_id"])
    known_frame_ids = set(combined_stats["frame_id"])
    if not event_frame_ids.issubset(known_frame_ids):
        raise ValueError("Not all frame_ids are present in the morphometric statistics")

    # Temporarily expose frame_id as a feature so it can serve as the join key
    frame_id_column = pd.DataFrame({"frame_id": merged.metadata["frame_id"]})
    merged.add_features(frame_id_column)
    # A left join is required to keep the features in the same order as the events
    joined = pd.merge(merged.features, combined_stats, on=["frame_id"], how="left")
    merged.features = joined
    merged.features = merged.features.drop(columns=["frame_id"])
    return merged
Merges frame-level and slide-level morphometric statistics into the EventArray.
Parameters
- events: EventArray object
- frame_stats: frame-level morphometric statistics
- slide_stats: slide-level morphometric statistics
Returns
a new EventArray object with the merged data
def
filter_and_generate_statistics( events: csi_images.csi_events.EventArray, morphs_to_drop: list[str] = None, morphs_to_keep: list[str] = None) -> csi_images.csi_events.EventArray:
def filter_and_generate_statistics(
    events: EventArray,
    morphs_to_drop: list[str] = None,
    morphs_to_keep: list[str] = None,
) -> EventArray:
    """
    Filters feature columns by name and appends frame-level SDOM features.

    Filters are substring matches on the column names of events.features.
    Recognized morphs_to_drop entries: "haralick" (".h."), "theta", "blurred"
    ("B<channel>"), "multi_channel", "slide", "frame".
    Recognized morphs_to_keep entries: "mean_sd_q05", "mean_q05".
    The SDOM columns are named "cell_sdom_frame_level_<feature>" and are always
    added, regardless of the filters.

    :param events: EventArray whose features already include the frame-level
        statistics columns used for the SDOM computation.
    :param morphs_to_drop: names of column groups to remove; None means none.
    :param morphs_to_keep: names of column groups to restrict to; None means none.
    :return: a new EventArray with the filtered and computed feature columns.
    """
    # Do not modify the original events
    events = events.copy()

    channel_names = ("dapi", "tritc", "fitc", "cy5")
    morphs_to_drop = [] if morphs_to_drop is None else morphs_to_drop
    morphs_to_keep = [] if morphs_to_keep is None else morphs_to_keep

    # Start from every column and narrow the selection down filter by filter
    columns_to_keep = list(events.features.columns)
    if "haralick" in morphs_to_drop:
        columns_to_keep = [col for col in columns_to_keep if ".h." not in col]
    if "theta" in morphs_to_drop:
        columns_to_keep = [col for col in columns_to_keep if "theta" not in col]
    # Remove features extracted from blurred images
    if "blurred" in morphs_to_drop:
        columns_to_keep = [
            col
            for col in columns_to_keep
            if not any(f"B{name}" in col for name in channel_names)
        ]
    # NOTE(review): as in the original implementation, this keeps only columns
    # that mention *every* channel, which looks like the opposite of the stated
    # intent ("remove everything that is not only for one channel"). Behavior is
    # left unchanged until the intended semantics are confirmed.
    if "multi_channel" in morphs_to_drop:
        columns_to_keep = [
            col
            for col in columns_to_keep
            if all(f".{name}." in col for name in channel_names)
        ]
    # Keep only mean, sd, and median
    if "mean_sd_q05" in morphs_to_keep:
        columns_to_keep = [
            col
            for col in columns_to_keep
            if (".mean" in col) or (".sd" in col) or (".q05" in col)
        ]
    # Keep only mean and median
    if "mean_q05" in morphs_to_keep:
        columns_to_keep = [
            col for col in columns_to_keep if (".mean" in col) or (".q05" in col)
        ]
    # Remove slide-level info
    if "slide" in morphs_to_drop:
        columns_to_keep = [col for col in columns_to_keep if "slide" not in col]
    # Remove frame-level info
    if "frame" in morphs_to_drop:
        columns_to_keep = [col for col in columns_to_keep if "frame" not in col]

    # Cell-level features for which a frame-level SDOM is computed
    stat_suffixes = ("mean", "sd", "mad", "q001", "q005", "q05", "q095", "q099")
    cell_features_for_sdom_frame_level = [
        f"cellf.tritc.b.{suffix}" for suffix in stat_suffixes
    ]
    cell_features_for_sdom_frame_level.append("tritc_cy5_ratio")
    for region_and_channel in ("cellf.fitc", "cellf.cy5", "nucleusf.dapi"):
        cell_features_for_sdom_frame_level += [
            f"{region_and_channel}.b.{suffix}" for suffix in stat_suffixes
        ]

    # Calculate SDOMs
    sdom_prefix = "cell_sdom_frame_level_"
    computed_features = [
        sdom_prefix + item for item in cell_features_for_sdom_frame_level
    ]
    events.features = _generate_computed_features(events.features, computed_features)

    # Drop duplicates while preserving order: set() would give a column order that
    # changes between runs, and a duplicated name would select a column twice
    columns_to_keep = list(dict.fromkeys(columns_to_keep + computed_features))
    events.features = events.features[columns_to_keep]

    return events
Parameters
- events:
- morphs_to_drop:
- morphs_to_keep: