pychemstation.analysis.spec_utils
Module contains various utility functions for spectral data processing and analysis.
1""" 2Module contains various utility function for spectral data processing and 3analysis. 4""" 5 6import numpy as np 7import scipy 8 9from .utils import find_nearest_value_index 10 11 12def create_binary_peak_map(data): 13 """Return binary map of the peaks within data points. 14 15 True values are assigned to potential peak points, False - to baseline. 16 17 Args: 18 data (:obj:np.array): 1D array with data points. 19 20 Returns: 21 :obj:np.array, dtype=bool: Mapping of data points, where True is 22 potential peak region point, False - baseline. 23 """ 24 # copying array 25 data_c = np.copy(data) 26 27 # placeholder for the peak mapping 28 peak_map = np.full_like(data_c, False, dtype=bool) 29 30 for _ in range(100500): # shouldn't take more iterations 31 32 # looking for peaks 33 peaks_found = np.logical_or( 34 data_c > np.mean(data_c) + np.std(data_c) * 3, 35 data_c < np.mean(data_c) - np.std(data_c) * 3, 36 ) 37 38 # merging with peak mapping 39 np.logical_or(peak_map, peaks_found, out=peak_map) 40 41 # if no peaks found - break 42 if not peaks_found.any(): 43 break 44 45 # setting values to 0 and iterating again 46 data_c[peaks_found] = 0 47 48 return peak_map 49 50 51def combine_map_to_regions(mapping): 52 """Combine True values into their indexes arrays. 53 54 Args: 55 mapping (:obj:np.array): Boolean mapping array to extract the indexes 56 from. 57 58 Returns: 59 :obj:np.array: 2D array with left and right borders of regions, where 60 mapping is True. 61 62 Example: 63 >>> combine_map_to_regions(np.array([True, True, False, True, False])) 64 array([[0, 1], 65 [3, 3]]) 66 """ 67 68 # No peaks identified, i.e. 
mapping is all False 69 if not mapping.any(): 70 return np.array([], dtype="int64") 71 72 # region borders 73 region_borders = np.diff(mapping) 74 75 # corresponding indexes 76 border_indexes = np.argwhere(region_borders) 77 78 lefts = border_indexes[::2] + 1 # because diff was used to get the index 79 80 # edge case, where first peak doesn't have left border 81 if mapping[border_indexes][0]: 82 # just preppend 0 as first left border 83 # mind the vstack, as np.argwhere produces a vector array 84 lefts = np.vstack((0, lefts)) 85 86 rights = border_indexes[1::2] 87 88 # another edge case, where last peak doesn't have a right border 89 if mapping[-1]: # True if last point identified as potential peak 90 # just append -1 as last peak right border 91 rights = np.vstack((rights, -1)) 92 93 # columns as borders, rows as regions, i.e. 94 # :output:[0] -> first peak region 95 return np.hstack((lefts, rights)) 96 97 98def filter_regions(x_data, peaks_regions): 99 """Filter peak regions. 100 101 Peak regions are filtered to remove potential false positives (e.g. noise 102 spikes). 103 104 Args: 105 x_data (:obj:np.array): X data points, needed to pick up the data 106 resolution and map the region indexes to the corresponding data 107 points. 108 y_data (:obj:np.array): Y data points, needed to validate if the peaks 109 are actually present in the region and remove invalid regions. 110 peaks_regions (:obj:np.array): 2D Nx2 array with peak regions indexes 111 (rows) as left and right borders (columns). 112 113 Returns: 114 :obj:np.array: 2D Mx2 array with filtered peak regions indexes(rows) as 115 left and right borders (columns). 116 """ 117 118 # filter peaks where region is smaller than spectrum resolution 119 # like single spikes, e.g. 
noise 120 # compute the regions first 121 x_data_regions = np.copy(x_data[peaks_regions]) 122 123 # get arguments where absolute difference is greater than data resolution 124 resolution = np.absolute(np.mean(np.diff(x_data))) 125 126 # (N, 1) array! 127 valid_regions_map = np.absolute(np.diff(x_data_regions)) > resolution 128 129 # get their indexes, mind the flattening of all arrays! 130 valid_regions_indexes = np.argwhere(valid_regions_map.flatten()).flatten() 131 132 # filtering! 133 peaks_regions = peaks_regions[valid_regions_indexes] 134 135 return peaks_regions 136 137 138def filter_noisy_regions(y_data, peaks_regions): 139 """Remove noisy regions from given regions array. 140 141 Peak regions are filtered to remove false positive noise regions, e.g. 142 incorrectly assigned due to curvy baseline. Filtering is performed by 143 computing average peak points/data points ratio. 144 145 Args: 146 y_data (:obj:np.array): Y data points, needed to validate if the peaks 147 are actually present in the region and remove invalid regions. 148 peaks_regions (:obj:np.array): 2D Nx2 array with peak regions indexes 149 (rows) as left and right borders (columns). 150 151 Returns: 152 :obj:np.array: 2D Mx2 array with filtered peak regions indexes(rows) as 153 left and right borders (columns). 154 """ 155 156 # compute the actual regions data points 157 y_data_regions = [] 158 for region in peaks_regions: 159 y_data_regions.append(y_data[region[0] : region[-1]]) 160 161 # compute noise data regions, i.e. 
in between peak regions 162 noise_data_regions = [] 163 for row, _ in enumerate(peaks_regions): 164 try: 165 noise_data_regions.append( 166 y_data[peaks_regions[row][1] : peaks_regions[row + 1][0]] 167 ) 168 except IndexError: 169 # exception for the last row -> discard 170 pass 171 172 # compute average peaks/data points ratio for noisy regions 173 noise_peaks_ratio = [] 174 for region in noise_data_regions: 175 # protection from empty regions 176 if region.size != 0: 177 # minimum height is pretty low to ensure enough noise is picked 178 peaks, _ = scipy.signal.find_peaks(region, height=region.max() * 0.2) 179 noise_peaks_ratio.append(peaks.size / region.size) 180 181 # compute average with weights equal to the region length 182 noise_peaks_ratio = np.average( 183 noise_peaks_ratio, weights=[region.size for region in noise_data_regions] 184 ) 185 186 # filtering! 187 valid_regions_indexes = [] 188 for row, region in enumerate(y_data_regions): 189 peaks, _ = scipy.signal.find_peaks(region, height=region.max() * 0.2) 190 if peaks.size != 0 and peaks.size / region.size < noise_peaks_ratio: 191 valid_regions_indexes.append(row) 192 193 # protecting from complete cleaning 194 if not valid_regions_indexes: 195 return peaks_regions 196 197 peaks_regions = peaks_regions[np.array(valid_regions_indexes)] 198 199 return peaks_regions 200 201 202def merge_regions(x_data, peaks_regions, d_merge, recursively=True): 203 """Merge peak regions if distance between is less than delta. 204 205 Args: 206 x_data (:obj:np.array): X data points. 207 peaks_regions (:obj:np.array): 2D Nx2 array with peak regions indexes 208 (rows) as left and right borders (columns). 209 d_merge (float): Minimum distance in X data points to merge two or more 210 regions together. 211 recursively (bool, optional): If True - will repeat the procedure until 212 all regions with distance < than d_merge will merge. 
213 214 Returns: 215 :obj:np.array: 2D Mx2 array with peak regions indexes (rows) as left and 216 right borders (columns), merged according to predefined minimal 217 distance. 218 219 Example: 220 >>> regions = np.array([ 221 [1, 10], 222 [11, 20], 223 [25, 45], 224 [50, 75], 225 [100, 120], 226 [122, 134] 227 ]) 228 >>> data = np.ones_like(regions) # ones as example 229 >>> merge_regions(data, regions, 1) 230 array([[ 1, 20], 231 [ 25, 45], 232 [ 50, 75], 233 [100, 120], 234 [122, 134]]) 235 >>> merge_regions(data, regions, 20, True) 236 array([[ 1, 75], 237 [100, 134]]) 238 """ 239 # the code is pretty ugly but works 240 merged_regions = [] 241 242 # converting to list to drop the data of the fly 243 regions = peaks_regions.tolist() 244 245 for i, _ in enumerate(regions): 246 try: 247 # check left border of i regions with right of i+1 248 if abs(x_data[regions[i][-1]] - x_data[regions[i + 1][0]]) <= d_merge: 249 # if lower append merge the regions 250 merged_regions.append([regions[i][0], regions[i + 1][-1]]) 251 # drop the merged one 252 regions.pop(i + 1) 253 else: 254 # if nothing to merge, just append the current region 255 merged_regions.append(regions[i]) 256 except IndexError: 257 # last row 258 merged_regions.append(regions[i]) 259 260 merged_regions = np.array(merged_regions) 261 262 if not recursively: 263 return merged_regions 264 265 # if recursively, check for the difference 266 if (merged_regions == regions).all(): 267 # done 268 return merged_regions 269 270 return merge_regions(x_data, merged_regions, d_merge, recursively=True) 271 272 273def expand_regions(x_data, peaks_regions, d_expand): 274 """Expand the peak regions by the desired value. 275 276 Args: 277 x_data (:obj:np.array): X data points. 278 peaks_regions (:obj:np.array): 2D Nx2 array with peak regions indexes 279 (rows) as left and right borders (columns). 280 d_expand (float): Value to expand borders to (in X data scale). 
281 282 Returns: 283 :obj:np.array: 2D Nx2 array with expanded peak regions indexes (rows) as 284 left and right borders (columns). 285 """ 286 287 data_regions = np.copy(x_data[peaks_regions]) 288 289 # determine scale orientation, i.e. decreasing (e.g. ppm on NMR spectrum) 290 # or increasing (e.g. wavelength on UV spectrum) 291 if (data_regions[:, 0] - data_regions[:, 1]).mean() > 0: 292 # ppm-like scale 293 data_regions[:, 0] += d_expand 294 data_regions[:, -1] -= d_expand 295 else: 296 # wavelength-like scale 297 data_regions[:, 0] -= d_expand 298 data_regions[:, -1] += d_expand 299 300 # converting new values to new indexes 301 for index_, value in np.ndenumerate(data_regions): 302 data_regions[index_] = find_nearest_value_index(x_data, value)[1] 303 304 return data_regions.astype(int)
def create_binary_peak_map(data):
    """Return binary map of the peaks within data points.

    True values are assigned to potential peak points, False - to baseline.

    Args:
        data (:obj:np.array): 1D array with data points.

    Returns:
        :obj:np.array, dtype=bool: Mapping of data points, where True is
            potential peak region point, False - baseline.
    """
    # work on a copy, the caller's array is left untouched
    residual = np.copy(data)

    # accumulator for the peak mapping
    is_peak = np.full_like(residual, False, dtype=bool)

    # iteratively knock out >3-sigma outliers until none remain; the
    # generous upper bound guards against a non-converging input
    for _ in range(100500):

        # points deviating from the baseline by more than 3 sigma
        deviation = np.abs(residual - np.mean(residual))
        outliers = deviation > np.std(residual) * 3

        # merge into the accumulated mapping
        is_peak |= outliers

        # nothing flagged on this pass - converged
        if not outliers.any():
            break

        # flatten found outliers so the next pass sees a tighter baseline
        residual[outliers] = 0

    return is_peak
Return binary map of the peaks within data points.
True values are assigned to potential peak points, False - to baseline.
Args: data (:obj:np.array): 1D array with data points.
Returns: :obj:np.array, dtype=bool: Mapping of data points, where True is potential peak region point, False - baseline.
def combine_map_to_regions(mapping):
    """Combine True values into their indexes arrays.

    Args:
        mapping (:obj:np.array): Boolean mapping array to extract the indexes
            from.

    Returns:
        :obj:np.array: 2D array with left and right borders of regions, where
            mapping is True.

    Example:
        >>> combine_map_to_regions(np.array([True, True, False, True, False]))
        array([[0, 1],
               [3, 3]])
    """

    # no peaks identified, i.e. mapping is all False
    if not mapping.any():
        return np.array([], dtype="int64")

    # pad with False on both sides so rising and falling edges are always
    # paired, even when the mapping starts or ends inside a peak region
    # (the unpadded diff/argwhere approach produced mismatched lefts/rights
    # and crashed in np.hstack on a leading True region)
    padded = np.concatenate(([False], mapping, [False]))

    # +1 marks a rising edge (baseline -> peak), -1 a falling edge
    edges = np.diff(padded.astype(np.int8))

    # rising edge at i -> mapping[i] is the first True of a region
    lefts = np.argwhere(edges == 1)

    # falling edge at i -> mapping[i - 1] is the last True of a region
    rights = np.argwhere(edges == -1) - 1

    # columns as borders, rows as regions, i.e.
    # :output:[0] -> first peak region
    return np.hstack((lefts, rights))
Combine True values into their indexes arrays.
Args: mapping (:obj:np.array): Boolean mapping array to extract the indexes from.
Returns: :obj:np.array: 2D array with left and right borders of regions, where mapping is True.
Example:
combine_map_to_regions(np.array([True, True, False, True, False])) array([[0, 1], [3, 3]])
def filter_regions(x_data, peaks_regions):
    """Filter peak regions.

    Peak regions are filtered to remove potential false positives (e.g. noise
    spikes): any region narrower than the spectrum resolution is dropped.

    Args:
        x_data (:obj:np.array): X data points, needed to pick up the data
            resolution and map the region indexes to the corresponding data
            points.
        peaks_regions (:obj:np.array): 2D Nx2 array with peak regions indexes
            (rows) as left and right borders (columns).

    Returns:
        :obj:np.array: 2D Mx2 array with filtered peak regions indexes (rows)
            as left and right borders (columns).
    """

    # map region border indexes onto the actual X values
    x_data_regions = np.copy(x_data[peaks_regions])

    # average spacing of the X scale, i.e. the data resolution
    resolution = np.absolute(np.mean(np.diff(x_data)))

    # (N, 1) array: keep only regions wider than one resolution step
    valid_regions_map = np.absolute(np.diff(x_data_regions)) > resolution

    # get their indexes, mind the flattening of all arrays!
    valid_regions_indexes = np.argwhere(valid_regions_map.flatten()).flatten()

    # filtering!
    return peaks_regions[valid_regions_indexes]
Filter peak regions.
Peak regions are filtered to remove potential false positives (e.g. noise spikes).
Args: x_data (:obj:np.array): X data points, needed to pick up the data resolution and map the region indexes to the corresponding data points. y_data (:obj:np.array): Y data points, needed to validate if the peaks are actually present in the region and remove invalid regions. peaks_regions (:obj:np.array): 2D Nx2 array with peak regions indexes (rows) as left and right borders (columns).
Returns: :obj:np.array: 2D Mx2 array with filtered peak regions indexes(rows) as left and right borders (columns).
def filter_noisy_regions(y_data, peaks_regions):
    """Remove noisy regions from given regions array.

    Peak regions are filtered to remove false positive noise regions, e.g.
    incorrectly assigned due to curvy baseline. Filtering is performed by
    computing average peak points/data points ratio.

    Args:
        y_data (:obj:np.array): Y data points, needed to validate if the peaks
            are actually present in the region and remove invalid regions.
        peaks_regions (:obj:np.array): 2D Nx2 array with peak regions indexes
            (rows) as left and right borders (columns).

    Returns:
        :obj:np.array: 2D Mx2 array with filtered peak regions indexes (rows)
            as left and right borders (columns).
    """

    # compute the actual regions data points
    y_data_regions = []
    for region in peaks_regions:
        y_data_regions.append(y_data[region[0] : region[-1]])

    # compute noise data regions, i.e. in between peak regions
    noise_data_regions = []
    for row, _ in enumerate(peaks_regions):
        try:
            noise_data_regions.append(
                y_data[peaks_regions[row][1] : peaks_regions[row + 1][0]]
            )
        except IndexError:
            # exception for the last row -> discard
            pass

    # drop empty noise regions (adjacent peak regions produce zero-length
    # slices) so that the ratios and weights below stay aligned; the
    # original built weights from ALL regions and np.average raised a
    # ValueError whenever any gap between peak regions was empty
    noise_data_regions = [
        region for region in noise_data_regions if region.size != 0
    ]

    # with no noise data the ratio cannot be estimated -> nothing to filter
    if not noise_data_regions:
        return peaks_regions

    # compute average peaks/data points ratio for noisy regions
    noise_peaks_ratio = []
    for region in noise_data_regions:
        # minimum height is pretty low to ensure enough noise is picked
        peaks, _ = scipy.signal.find_peaks(region, height=region.max() * 0.2)
        noise_peaks_ratio.append(peaks.size / region.size)

    # compute average with weights equal to the region length; lengths match
    # because empty regions were dropped above
    noise_peaks_ratio = np.average(
        noise_peaks_ratio, weights=[region.size for region in noise_data_regions]
    )

    # filtering!
    valid_regions_indexes = []
    for row, region in enumerate(y_data_regions):
        # degenerate (empty) peak region: region.max() would fail -> invalid
        if region.size == 0:
            continue
        peaks, _ = scipy.signal.find_peaks(region, height=region.max() * 0.2)
        if peaks.size != 0 and peaks.size / region.size < noise_peaks_ratio:
            valid_regions_indexes.append(row)

    # protecting from complete cleaning
    if not valid_regions_indexes:
        return peaks_regions

    return peaks_regions[np.array(valid_regions_indexes)]
Remove noisy regions from given regions array.
Peak regions are filtered to remove false positive noise regions, e.g. incorrectly assigned due to curvy baseline. Filtering is performed by computing average peak points/data points ratio.
Args: y_data (:obj:np.array): Y data points, needed to validate if the peaks are actually present in the region and remove invalid regions. peaks_regions (:obj:np.array): 2D Nx2 array with peak regions indexes (rows) as left and right borders (columns).
Returns: :obj:np.array: 2D Mx2 array with filtered peak regions indexes(rows) as left and right borders (columns).
def merge_regions(x_data, peaks_regions, d_merge, recursively=True):
    """Merge peak regions if distance between is less than delta.

    Args:
        x_data (:obj:np.array): X data points.
        peaks_regions (:obj:np.array): 2D Nx2 array with peak regions indexes
            (rows) as left and right borders (columns).
        d_merge (float): Minimum distance in X data points to merge two or
            more regions together.
        recursively (bool, optional): If True - will repeat the procedure
            until all regions with distance < than d_merge will merge.

    Returns:
        :obj:np.array: 2D Mx2 array with peak regions indexes (rows) as left
            and right borders (columns), merged according to predefined
            minimal distance.

    Example:
        >>> regions = np.array([
                [1, 10],
                [11, 20],
                [25, 45],
                [50, 75],
                [100, 120],
                [122, 134]
            ])
        >>> data = np.arange(200)  # identity scale as example
        >>> merge_regions(data, regions, 1)
        array([[  1,  20],
               [ 25,  45],
               [ 50,  75],
               [100, 120],
               [122, 134]])
        >>> merge_regions(data, regions, 20, True)
        array([[  1,  75],
               [100, 134]])
    """
    # the code is pretty ugly but works
    merged_regions = []

    # converting to list to drop the merged regions on the fly
    regions = peaks_regions.tolist()

    # NOTE: regions is intentionally mutated (pop) while enumerating - each
    # iteration appends exactly one region, so the result stays complete
    for i, _ in enumerate(regions):
        try:
            # compare the right border of region i with the left border of
            # region i + 1 on the X scale
            if abs(x_data[regions[i][-1]] - x_data[regions[i + 1][0]]) <= d_merge:
                # if closer than d_merge - merge the two regions
                merged_regions.append([regions[i][0], regions[i + 1][-1]])
                # drop the merged one
                regions.pop(i + 1)
            else:
                # if nothing to merge, just append the current region
                merged_regions.append(regions[i])
        except IndexError:
            # last row
            merged_regions.append(regions[i])

    merged_regions = np.array(merged_regions)

    if not recursively:
        return merged_regions

    # if recursively, repeat until a pass changes nothing: after any merge,
    # the (shrunk) regions list differs from merged_regions, so recurse
    if (merged_regions == regions).all():
        # done
        return merged_regions

    return merge_regions(x_data, merged_regions, d_merge, recursively=True)
Merge peak regions if distance between is less than delta.
Args: x_data (:obj:np.array): X data points. peaks_regions (:obj:np.array): 2D Nx2 array with peak regions indexes (rows) as left and right borders (columns). d_merge (float): Minimum distance in X data points to merge two or more regions together. recursively (bool, optional): If True - will repeat the procedure until all regions with distance < than d_merge will merge.
Returns: :obj:np.array: 2D Mx2 array with peak regions indexes (rows) as left and right borders (columns), merged according to predefined minimal distance.
Example:
regions = np.array([ [1, 10], [11, 20], [25, 45], [50, 75], [100, 120], [122, 134] ]) data = np.ones_like(regions) # ones as example merge_regions(data, regions, 1) array([[ 1, 20], [ 25, 45], [ 50, 75], [100, 120], [122, 134]]) merge_regions(data, regions, 20, True) array([[ 1, 75], [100, 134]])
def expand_regions(x_data, peaks_regions, d_expand):
    """Expand the peak regions by the desired value.

    Args:
        x_data (:obj:np.array): X data points.
        peaks_regions (:obj:np.array): 2D Nx2 array with peak regions indexes
            (rows) as left and right borders (columns).
        d_expand (float): Value to expand borders to (in X data scale).

    Returns:
        :obj:np.array: 2D Nx2 array with expanded peak regions indexes (rows)
            as left and right borders (columns).
    """

    # region borders as X values rather than indexes
    border_values = np.copy(x_data[peaks_regions])

    # scale orientation: decreasing (e.g. ppm on NMR spectrum) means the
    # left border carries the larger value, so it must grow on expansion;
    # increasing (e.g. wavelength on UV spectrum) is the opposite
    descending = (border_values[:, 0] - border_values[:, 1]).mean() > 0
    shift = d_expand if descending else -d_expand
    border_values[:, 0] += shift
    border_values[:, -1] -= shift

    # map the expanded border values back onto the nearest x_data indexes
    for pos, value in np.ndenumerate(border_values):
        border_values[pos] = find_nearest_value_index(x_data, value)[1]

    return border_values.astype(int)
Expand the peak regions by the desired value.
Args: x_data (:obj:np.array): X data points. peaks_regions (:obj:np.array): 2D Nx2 array with peak regions indexes (rows) as left and right borders (columns). d_expand (float): Value to expand borders to (in X data scale).
Returns: :obj:np.array: 2D Nx2 array with expanded peak regions indexes (rows) as left and right borders (columns).