hcitools.process

hcitools.process — Functions for loading and processing data.

  1"""
  2hcitools : process
  3--------   -------
  4Functions for loading and processing data.
  5"""
  6
  7# Imports
  8from sklearn.preprocessing import StandardScaler, MinMaxScaler
  9from sklearn.feature_selection import VarianceThreshold
 10from sklearn.decomposition import PCA
 11from sklearn.manifold import TSNE
 12from umap import UMAP
 13
 14# from dash.html import P
 15
 16import pandas as pd
 17import numpy as np
 18import pathlib
 19import base64
 20import io
 21import re
 22
 23# Paths to Data Files
 24BASE_PATH = pathlib.Path(__file__).parent.resolve()
 25ASSET_PATH = BASE_PATH.joinpath('assets/').resolve()
 26
 27# Define feature groups
 28groups = {
 29    'Nucleus': {'rgx': r'Nucleus|Number of Objects|HOECHST 33342',
 30                'col': '#008AC9'},
 31    'TMRM': {'rgx': r'Alexa 594',
 32             'col': '#FFC000'},
 33    'ROS': {'rgx': r'Alexa 488',
 34            'col': '#70AD47'},
 35    'Mito Mass': {'rgx': r'Alexa 647|spots|MCTracker Deep Red',
 36                  'col': '#D75156'},
 37}
 38
 39
 40## ------------------------------ DEFINITIONS ------------------------------- ##
 41
def drop_high_corr(data: pd.DataFrame, thresh: float=0.95):
    """
    Remove features with high Pearson correlations (> thresh) from a dataframe
    """

    # Compute correlations
    corr = data.corr('pearson').abs()
    corr = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

    # Create a dictionary to keep track of features that exhibit high
    # correlations
    #   keys = feature to keep
    #   values = features to drop
    dropped = dict()
    for col in corr.columns:
        I = corr[col] > thresh
        if any(I):
            dropped[col] = corr.columns[I].tolist()

    # List of features to drop
    drop = [x for sub in dropped.values() for x in sub if x not in dropped]

    return data.drop(drop, axis=1), drop


def drop_low_variance(data: pd.DataFrame, thresh: float=0.0,
                      na_replacement: int=-999):
    """
    Remove all low-variance features from a dataframe.
    """

    df = data.copy()
    selector = VarianceThreshold(thresh)
    selector.fit(df.fillna(na_replacement))

    return df.loc[:, selector.get_support(indices=False)]


def printif(cond: bool, *args, **kwargs):
    # Print if cond is true
    if cond: print(*args, **kwargs)


def intersperse(lst, item):
    # Insert item between each entry in lst
    result = [item] * (len(lst) * 2 - 1)
    result[0::2] = lst
    return result


# def clean_data(data, meta_cols, dropna=False, drop_low_var=False,
#                cor_thr=None, intens_norm=False, log=False,
#                num_objs='non-border cells - number of objects'):
#     """
#     Clean a dataset and perform necessary preprocessing

#     Parameters
#     ----------
#     data : pd.DataFrame
#         Dataframe
#     meta_cols : list
#         List of metadata columns
#     dropna : bool
#         Drop NA-only columns and any rows with NAs
#     drop_low_var : bool
#         Drop zero-variance features
#     cor_thr : float
#         Correlation threshold. Remove features with correlations > cor_thr
#     intens_norm : bool
#         Should intensity-based features be normalized?
#     log : bool
#         Return log of preprocessing steps

#     Returns
#     -------
#     Data, Metadata and LOG
#     """

#     # Create a 'Compound Concentration' column
#     data['cmpd_conc'] = (data['compound'].astype(str) +
#                          ' (' + data['conc'].astype(str) + ')')
#     meta_cols.append('cmpd_conc')

#     # Sort data so that metadata columns are first
#     OTHERCOLS = [x for x in data.columns if x not in meta_cols]
#     data = data[meta_cols + OTHERCOLS].set_index(meta_cols)

#     # Initialize log
#     if log:
#         LOG = [P(f"Original data shape: {data.shape}")]
#     else:
#         LOG = None

#     # Keep track of any dropped features
#     original_features = set(data.columns).difference(meta_cols)
#     dropped = dict()

#     if dropna:
#         data = (data.dropna(axis=1, how='all')  # NA-only columns
#                     .dropna(axis=0, how='any')) # Rows with NAs
#         dropped['missing'] = list( original_features.difference(data.columns) )
#         if log:
#             LOG += [P(f"After dropping NAs, data shape: {data.shape}")]

#     if drop_low_var:
#         data = drop_low_variance(data, thresh=0.0)
#         dropped['low var'] = list( original_features.difference(data.columns) )
#         if log:
#             LOG += [P(f"After removing zero-variance features, data shape: {data.shape}")]

#     # Normalize intensity-based features by number of objects
#     # This assumes that the `number of objects` is in the column `num_objs`
#     if intens_norm:
#         intens_features = [x for x in data.columns if re.search('Intensity', x)]
#         data[intens_features] = (data[intens_features]
#             .div(data[num_objs], axis=0))

#         if log:
#             LOG += [P(f"Intensity-based features were normalized by '{num_objs}'")]

#     if cor_thr is not None:
#         data, dropped['high corr'] = drop_high_corr(data, thresh=cor_thr)
#         if log:
#             LOG += [P(f"After removing highly correlated features, data shape: {data.shape}")]

#     # Keep track of which columns are metadata
#     data = data.reset_index()
#     data.attrs['meta_cols'] = meta_cols
#     data.attrs['features'] = list( set(data.columns).difference(meta_cols) )
#     data.attrs['compounds'] = data['compound'].unique().tolist()

#     return data, dropped, LOG


def normalize_df(df, method='MinMax'):
    """
    Normalize a data frame
    """

    if method == 'MinMax':
        X = MinMaxScaler().fit_transform(df.values)
    elif method == 'z':
        X = StandardScaler().fit_transform(df.values)
    else:
        raise ValueError("method must be one of 'MinMax' or 'z'")

    return pd.DataFrame(data=X, columns=df.columns, index=df.index)


def dim_reduction(data, method=['PCA', 'tSNE', 'UMAP'],
                  pca_kws=None, tsne_kws=None, umap_kws=None) -> pd.DataFrame:
    """
    Perform dimensionality reduction on data and return the projections

    Parameters
    ----------
    data : pd.DataFrame
        Data set. Should have any metadata columns in its index only.
    method : str | list
        What dimensionality reduction technique to use.
        Must be one of, or a combination of: 'PCA', 'tSNE', 'UMAP'
    """

    data.columns = [x.lower() for x in data.columns]

    if isinstance(method, str):
        method = [method]
    method = [x.lower() for x in method]
    if np.all([x not in ['pca', 'tsne', 'umap'] for x in method]):
        raise ValueError("method must be one of 'UMAP', 'tSNE', or 'PCA'")

    if pca_kws is None:
        pca_kws = dict(n_components=5, random_state=69)
    else:
        pca_kws['random_state'] = 69

    if tsne_kws is None:
        tsne_kws = dict(n_components=3, perplexity=30.0, learning_rate='auto',
                        init='random', random_state=69)
    else:
        tsne_kws['random_state'] = 69

    if umap_kws is None:
        umap_kws = dict(n_components=3, init='random', n_neighbors=20,
                        min_dist=0.2, random_state=69)
    else:
        umap_kws['random_state'] = 69

    # Initialize transformers
    transformers = dict()
    for m in method:
        if m == 'pca':
            transformers[m] = PCA(**pca_kws).fit(data)
        elif m == 'tsne':
            transformers[m] = TSNE(**tsne_kws)
        elif m == 'umap':
            transformers[m] = UMAP(**umap_kws)
        else:
            raise ValueError("You're not supposed to be here.")

    # Compute projections
    proj = []
    for m, tr in transformers.items():
        if m == 'pca':
            proj.append( tr.transform(data) )
        else:
            proj.append( tr.fit_transform(data) )
    proj = np.concatenate(proj, axis=1)

    # Create column names for output data frame
    cols = []
    for m in method:
        if m == 'pca':
            n = transformers['pca'].n_components + 1
            cols.append([f'PCA {x}' for x in range(1,n)] )
        else:
            cols.append([f'{m.upper()} {x}' for x in range(1,4)])
    cols = [x for sub in cols for x in sub]

    # Prepare dataframe
    proj = pd.DataFrame(
        data=proj,
        columns=cols,
        index=data.index
    ).melt(ignore_index=False)
    proj['comp'] = proj.variable.apply(lambda x: re.search(r'\d+', x)[0]).astype(int)
    proj['variable'] = proj.variable.apply(lambda x: re.search(r'^\w*', x)[0])
    proj = (proj
        .pivot_table(values='value', columns='comp',
                     index=list(data.index.names) + ['variable'])
        .reset_index())
    proj.columns = proj.columns.astype(str)

    if 'pca' in method:
        exvar = transformers['pca'].explained_variance_ratio_ * 100
        return proj, exvar

    return proj


def assign_groups(features, groups=groups):
    """
    Assign a list (array-like) of features to various groups

    Parameters
    ----------
    features : list or np.array
        Array-like of features
    groups : dict
        Dictionary defining regular expressions and colors for each group.
    """

    # Check input
    try:
        features = np.asarray(features)
    except Exception:
        raise ValueError('features must be array-like')
    assert isinstance(groups, dict), "groups must be a dictionary"

    # Assign features to groups
    feature_groups = {f: ['Other'] for f in features}
    for grp, prop in groups.items():
        r = np.vectorize(lambda x: bool(re.search(prop['rgx'], x)))
        for f in features[r(features)]:
            feature_groups[f] = [grp]

    # Define colors for each group
    group_colors = {k: v['col'] for k, v in groups.items()}
    group_colors['Other'] = 'black'

    return feature_groups, group_colors


def parse_content(contents, filename):
    """
    Parse content uploaded via a dcc.Upload component
    """

    # Extract content
    _, content_string = contents.split(',')

    # Extract file extension
    ext = pathlib.Path(filename).suffix

    # Decode & parse data
    decoded = base64.b64decode(content_string)
    try:
        if ext in ['.csv', '.tsv', '.txt']:
            data = pd.read_csv(
                io.StringIO(decoded.decode('utf-8')),
                sep=None, engine='python'
            )
        elif 'xls' in ext:
            data = pd.read_excel(io.BytesIO(decoded))
        else:
            raise ValueError('Unsupported file type')
    except ValueError:
        return {'error': ['bad-ftype']}
    except Exception as e:
        return {'error': ['loading-error', repr(e)]}

    return data


def search_opts(opts, ptn):
    """
    Search a list of options for a value that matches a regex
    """

    return np.any([ bool(re.search(ptn, x)) for x in opts ])

def drop_high_corr(data: pandas.core.frame.DataFrame, thresh: float = 0.95):

Remove features with high Pearson correlations (> thresh) from a dataframe.
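
A minimal usage sketch; the small frame below is made up for illustration. Two of its columns are perfectly correlated, so one of the pair is removed:

import pandas as pd
from hcitools.process import drop_high_corr

df = pd.DataFrame({'a': [1, 2, 3, 4],
                   'b': [2, 4, 6, 8],   # perfectly correlated with 'a'
                   'c': [4, 1, 3, 2]})

reduced, dropped = drop_high_corr(df, thresh=0.95)
print(dropped)                # one of the correlated pair, e.g. ['a']
print(list(reduced.columns))  # the surviving columns, e.g. ['b', 'c']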

def drop_low_variance( data: pandas.core.frame.DataFrame, thresh: float = 0.0, na_replacement: int = -999):

Remove all low-variance features from a dataframe.
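
A short sketch with a made-up frame: the constant column has zero variance and is removed. NAs are filled with na_replacement before fitting the selector, so they do not block the variance computation:

import numpy as np
import pandas as pd
from hcitools.process import drop_low_variance

df = pd.DataFrame({'const': [1.0, 1.0, 1.0],
                   'varies': [0.1, 0.5, np.nan]})

kept = drop_low_variance(df, thresh=0.0)
print(list(kept.columns))   # ['varies']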

def printif(cond: bool, *args, **kwargs):

Print only if cond is true.
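
For example, as a lightweight verbose-logging switch (the verbose flag below is hypothetical):

from hcitools.process import printif

verbose = True
printif(verbose, 'Loaded data with shape', (384, 120))  # prints
printif(False, 'This message is suppressed')            # prints nothing
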
def intersperse(lst, item):

Insert item between each entry in lst.
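
A quick illustration with made-up values:

from hcitools.process import intersperse

print(intersperse(['a', 'b', 'c'], '|'))   # ['a', '|', 'b', '|', 'c']
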
def normalize_df(df, method='MinMax'):

Normalize a data frame
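
A minimal sketch on a made-up frame, showing both options ('MinMax' rescales each column to the [0, 1] range; 'z' standardizes each column to zero mean and unit variance):

import pandas as pd
from hcitools.process import normalize_df

df = pd.DataFrame({'x': [1.0, 2.0, 3.0], 'y': [10.0, 20.0, 40.0]})

minmax = normalize_df(df, method='MinMax')   # each column now spans 0..1
zscored = normalize_df(df, method='z')       # each column now has mean 0, unit variance
print(minmax['x'].tolist())                  # [0.0, 0.5, 1.0]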

def dim_reduction( data, method=['PCA', 'tSNE', 'UMAP'], pca_kws=None, tsne_kws=None, umap_kws=None) -> pandas.core.frame.DataFrame:

Perform dimensionality reduction on data and return the projections.

Parameters
  • data (pd.DataFrame): Data set. Should have any metadata columns in its index only.
  • method (str | list): What dimensionality reduction technique to use. Must be one of, or a combination of: 'PCA', 'tSNE', 'UMAP'
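
A hedged sketch on random data; the feature names, well index, and values below are all made up. Metadata belongs in a (named) index, and only PCA is requested here to keep the example small. When PCA is among the methods, the function returns the projections plus the explained variance per component:

import numpy as np
import pandas as pd
from hcitools.process import dim_reduction

rng = np.random.default_rng(0)
data = pd.DataFrame(rng.normal(size=(50, 8)),
                    columns=[f'feat_{i}' for i in range(8)],
                    index=pd.Index([f'well_{i}' for i in range(50)], name='well'))

proj, exvar = dim_reduction(data, method='PCA', pca_kws=dict(n_components=3))
print(proj.columns.tolist())   # ['well', 'variable', '1', '2', '3']
print(exvar)                   # % variance explained by each PCA component
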
def assign_groups( features, groups={'Nucleus': {'rgx': 'Nucleus|Number of Objects|HOECHST 33342', 'col': '#008AC9'}, 'TMRM': {'rgx': 'Alexa 594', 'col': '#FFC000'}, 'ROS': {'rgx': 'Alexa 488', 'col': '#70AD47'}, 'Mito Mass': {'rgx': 'Alexa 647|spots|MCTracker Deep Red', 'col': '#D75156'}}):

Assign a list (array-like) of features to various groups

Parameters
  • features (list or np.array): Array-like of features
  • groups (dict): Dictionary defining regular expressions and colors for each group.
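
A brief sketch with made-up feature names; anything matching none of the group regexes falls into the 'Other' group:

from hcitools.process import assign_groups

features = ['Nucleus Area', 'Intensity Alexa 594 Mean', 'Cell Roundness']
feature_groups, group_colors = assign_groups(features)

print(feature_groups['Nucleus Area'])              # ['Nucleus']
print(feature_groups['Intensity Alexa 594 Mean'])  # ['TMRM']
print(feature_groups['Cell Roundness'])            # ['Other']
print(group_colors['TMRM'])                        # '#FFC000'
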
def parse_content(contents, filename):

Parse content uploaded via a dcc.Upload component
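
A hedged, self-contained sketch: it builds the kind of base64 data-URL string that dcc.Upload passes to a callback (the CSV content here is made up) and parses it back into a DataFrame:

import base64
from hcitools.process import parse_content

csv_bytes = b'compound,conc,feat_1\nDMSO,0.1,1.23\nDrugA,1.0,4.56\n'
contents = 'data:text/csv;base64,' + base64.b64encode(csv_bytes).decode()

df = parse_content(contents, 'example.csv')
print(df.shape)   # (2, 3)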

def search_opts(opts, ptn):

Search a list of options for a value that matches a regex
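
For example, with a made-up option list:

from hcitools.process import search_opts

opts = ['compound', 'conc', 'plate_id']
print(search_opts(opts, r'^conc'))   # True
print(search_opts(opts, r'dose'))    # False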