pychemstation.analysis.spec_utils

Module contains various utility functions for spectral data processing and analysis.

  1"""
  2Module contains various utility function for spectral data processing and
  3analysis.
  4"""
  5
  6import numpy as np
  7import scipy
  8
  9from .utils import find_nearest_value_index
 10
 11
 12def create_binary_peak_map(data):
 13    """Return binary map of the peaks within data points.
 14
 15    True values are assigned to potential peak points, False - to baseline.
 16
 17    Args:
 18        data (:obj:np.array): 1D array with data points.
 19
 20    Returns:
 21        :obj:np.array, dtype=bool: Mapping of data points, where True is
 22            potential peak region point, False - baseline.
 23    """
 24    # copying array
 25    data_c = np.copy(data)
 26
 27    # placeholder for the peak mapping
 28    peak_map = np.full_like(data_c, False, dtype=bool)
 29
 30    for _ in range(100500):  # shouldn't take more iterations
 31
 32        # looking for peaks
 33        peaks_found = np.logical_or(
 34            data_c > np.mean(data_c) + np.std(data_c) * 3,
 35            data_c < np.mean(data_c) - np.std(data_c) * 3,
 36        )
 37
 38        # merging with peak mapping
 39        np.logical_or(peak_map, peaks_found, out=peak_map)
 40
 41        # if no peaks found - break
 42        if not peaks_found.any():
 43            break
 44
 45        # setting values to 0 and iterating again
 46        data_c[peaks_found] = 0
 47
 48    return peak_map
 49
 50
 51def combine_map_to_regions(mapping):
 52    """Combine True values into their indexes arrays.
 53
 54    Args:
 55        mapping (:obj:np.array): Boolean mapping array to extract the indexes
 56            from.
 57
 58    Returns:
 59        :obj:np.array: 2D array with left and right borders of regions, where
 60            mapping is True.
 61
 62    Example:
 63        >>> combine_map_to_regions(np.array([True, True, False, True, False]))
 64        array([[0, 1],
 65                [3, 3]])
 66    """
 67
 68    # No peaks identified, i.e. mapping is all False
 69    if not mapping.any():
 70        return np.array([], dtype="int64")
 71
 72    # region borders
 73    region_borders = np.diff(mapping)
 74
 75    # corresponding indexes
 76    border_indexes = np.argwhere(region_borders)
 77
 78    lefts = border_indexes[::2] + 1  # because diff was used to get the index
 79
 80    # edge case, where first peak doesn't have left border
 81    if mapping[border_indexes][0]:
 82        # just preppend 0 as first left border
 83        # mind the vstack, as np.argwhere produces a vector array
 84        lefts = np.vstack((0, lefts))
 85
 86    rights = border_indexes[1::2]
 87
 88    # another edge case, where last peak doesn't have a right border
 89    if mapping[-1]:  # True if last point identified as potential peak
 90        # just append -1 as last peak right border
 91        rights = np.vstack((rights, -1))
 92
 93    # columns as borders, rows as regions, i.e.
 94    # :output:[0] -> first peak region
 95    return np.hstack((lefts, rights))
 96
 97
 98def filter_regions(x_data, peaks_regions):
 99    """Filter peak regions.
100
101    Peak regions are filtered to remove potential false positives (e.g. noise
102        spikes).
103
104    Args:
105        x_data (:obj:np.array): X data points, needed to pick up the data
106            resolution and map the region indexes to the corresponding data
107            points.
108        y_data (:obj:np.array): Y data points, needed to validate if the peaks
109            are actually present in the region and remove invalid regions.
110        peaks_regions (:obj:np.array): 2D Nx2 array with peak regions indexes
111            (rows) as left and right borders (columns).
112
113    Returns:
114        :obj:np.array: 2D Mx2 array with filtered peak regions indexes(rows) as
115            left and right borders (columns).
116    """
117
118    # filter peaks where region is smaller than spectrum resolution
119    # like single spikes, e.g. noise
120    # compute the regions first
121    x_data_regions = np.copy(x_data[peaks_regions])
122
123    # get arguments where absolute difference is greater than data resolution
124    resolution = np.absolute(np.mean(np.diff(x_data)))
125
126    # (N, 1) array!
127    valid_regions_map = np.absolute(np.diff(x_data_regions)) > resolution
128
129    # get their indexes, mind the flattening of all arrays!
130    valid_regions_indexes = np.argwhere(valid_regions_map.flatten()).flatten()
131
132    # filtering!
133    peaks_regions = peaks_regions[valid_regions_indexes]
134
135    return peaks_regions
136
137
def filter_noisy_regions(y_data, peaks_regions):
    """Remove noisy regions from given regions array.

    Peak regions are filtered to remove false positive noise regions, e.g.
        incorrectly assigned due to curvy baseline. Filtering is performed by
        computing average peak points/data points ratio.

    Args:
        y_data (:obj:np.array): Y data points, needed to validate if the peaks
            are actually present in the region and remove invalid regions.
        peaks_regions (:obj:np.array): 2D Nx2 array with peak regions indexes
            (rows) as left and right borders (columns).

    Returns:
        :obj:np.array: 2D Mx2 array with filtered peak regions indexes (rows)
            as left and right borders (columns).
    """

    # compute the actual regions data points
    y_data_regions = [y_data[region[0] : region[-1]] for region in peaks_regions]

    # compute noise data regions, i.e. in between consecutive peak regions
    # (the area after the last region is deliberately discarded)
    noise_data_regions = [
        y_data[peaks_regions[row][1] : peaks_regions[row + 1][0]]
        for row in range(len(peaks_regions) - 1)
    ]

    # drop empty noise regions (adjacent peak regions) up front, so the
    # ratio list and the weights list below stay aligned; previously the
    # weights included empty regions while the ratios skipped them, making
    # np.average raise on a length mismatch
    noise_data_regions = [region for region in noise_data_regions if region.size != 0]

    # no noise data to estimate the ratio from (e.g. a single peak region or
    # all regions adjacent) - nothing to filter against, return unchanged
    if not noise_data_regions:
        return peaks_regions

    # compute average peaks/data points ratio for noisy regions
    noise_peaks_ratio = []
    for region in noise_data_regions:
        # minimum height is pretty low to ensure enough noise is picked
        peaks, _ = scipy.signal.find_peaks(region, height=region.max() * 0.2)
        noise_peaks_ratio.append(peaks.size / region.size)

    # compute average with weights equal to the region length
    noise_peaks_ratio = np.average(
        noise_peaks_ratio, weights=[region.size for region in noise_data_regions]
    )

    # filtering!
    valid_regions_indexes = []
    for row, region in enumerate(y_data_regions):
        # skip degenerate (empty) peak regions - region.max() would raise
        if region.size == 0:
            continue
        peaks, _ = scipy.signal.find_peaks(region, height=region.max() * 0.2)
        if peaks.size != 0 and peaks.size / region.size < noise_peaks_ratio:
            valid_regions_indexes.append(row)

    # protecting from complete cleaning
    if not valid_regions_indexes:
        return peaks_regions

    return peaks_regions[np.array(valid_regions_indexes)]
200
201
def merge_regions(x_data, peaks_regions, d_merge, recursively=True):
    """Merge peak regions if distance between is less than delta.

    Args:
        x_data (:obj:np.array): X data points.
        peaks_regions (:obj:np.array): 2D Nx2 array with peak regions indexes
            (rows) as left and right borders (columns).
        d_merge (float): Minimum distance in X data points to merge two or more
            regions together.
        recursively (bool, optional): If True - will repeat the procedure until
            all regions with distance < than d_merge will merge.

    Returns:
        :obj:np.array: 2D Mx2 array with peak regions indexes (rows) as left and
            right borders (columns), merged according to predefined minimal
            distance.

    Example:
        >>> regions = np.array([[1, 4], [6, 9], [20, 25]])
        >>> x_data = np.arange(30)
        >>> merge_regions(x_data, regions, 2)
        array([[ 1,  9],
               [20, 25]])
    """
    merged_regions = []

    regions = peaks_regions.tolist()
    n_regions = len(regions)

    # single merging pass: compare the right border of region `row` with the
    # left border of region `row + 1`; a merged pair is consumed as a whole,
    # so the next comparison starts right after it. Using an explicit index
    # avoids the previous pattern of popping items from the list being
    # iterated, which only worked by an accidental length invariant.
    row = 0
    while row < n_regions:
        if (
            row + 1 < n_regions
            and abs(x_data[regions[row][-1]] - x_data[regions[row + 1][0]]) <= d_merge
        ):
            # close enough - merge the pair into one region
            merged_regions.append([regions[row][0], regions[row + 1][-1]])
            row += 2
        else:
            # nothing to merge, just keep the current region
            merged_regions.append(list(regions[row]))
            row += 1

    merged_regions = np.array(merged_regions)

    if not recursively:
        return merged_regions

    # nothing was merged during this pass (lengths unchanged) - done
    if len(merged_regions) == n_regions:
        return merged_regions

    # newly merged regions may now be within d_merge of their neighbours
    return merge_regions(x_data, merged_regions, d_merge, recursively=True)
271
272
def expand_regions(x_data, peaks_regions, d_expand):
    """Expand the peak regions by the desired value.

    Args:
        x_data (:obj:np.array): X data points.
        peaks_regions (:obj:np.array): 2D Nx2 array with peak regions indexes
            (rows) as left and right borders (columns).
        d_expand (float): Value to expand borders to (in X data scale).

    Returns:
        :obj:np.array: 2D Nx2 array with expanded peak regions indexes (rows) as
            left and right borders (columns).
    """

    # region borders mapped to the X data scale
    borders = np.copy(x_data[peaks_regions])

    # determine scale orientation, i.e. decreasing (e.g. ppm on NMR spectrum)
    # or increasing (e.g. wavelength on UV spectrum)
    decreasing_scale = (borders[:, 0] - borders[:, 1]).mean() > 0
    direction = 1 if decreasing_scale else -1

    # widen each region on both sides, respecting the scale direction
    borders[:, 0] += direction * d_expand
    borders[:, -1] -= direction * d_expand

    # map the expanded border values back to their nearest x_data indexes
    for position, border_value in np.ndenumerate(borders):
        borders[position] = find_nearest_value_index(x_data, border_value)[1]

    return borders.astype(int)
def create_binary_peak_map(data):
13def create_binary_peak_map(data):
14    """Return binary map of the peaks within data points.
15
16    True values are assigned to potential peak points, False - to baseline.
17
18    Args:
19        data (:obj:np.array): 1D array with data points.
20
21    Returns:
22        :obj:np.array, dtype=bool: Mapping of data points, where True is
23            potential peak region point, False - baseline.
24    """
25    # copying array
26    data_c = np.copy(data)
27
28    # placeholder for the peak mapping
29    peak_map = np.full_like(data_c, False, dtype=bool)
30
31    for _ in range(100500):  # shouldn't take more iterations
32
33        # looking for peaks
34        peaks_found = np.logical_or(
35            data_c > np.mean(data_c) + np.std(data_c) * 3,
36            data_c < np.mean(data_c) - np.std(data_c) * 3,
37        )
38
39        # merging with peak mapping
40        np.logical_or(peak_map, peaks_found, out=peak_map)
41
42        # if no peaks found - break
43        if not peaks_found.any():
44            break
45
46        # setting values to 0 and iterating again
47        data_c[peaks_found] = 0
48
49    return peak_map

Return binary map of the peaks within data points.

True values are assigned to potential peak points, False - to baseline.

Args: data (:obj:np.array): 1D array with data points.

Returns: :obj:np.array, dtype=bool: Mapping of data points, where True is potential peak region point, False - baseline.

def combine_map_to_regions(mapping):
52def combine_map_to_regions(mapping):
53    """Combine True values into their indexes arrays.
54
55    Args:
56        mapping (:obj:np.array): Boolean mapping array to extract the indexes
57            from.
58
59    Returns:
60        :obj:np.array: 2D array with left and right borders of regions, where
61            mapping is True.
62
63    Example:
64        >>> combine_map_to_regions(np.array([True, True, False, True, False]))
65        array([[0, 1],
66                [3, 3]])
67    """
68
69    # No peaks identified, i.e. mapping is all False
70    if not mapping.any():
71        return np.array([], dtype="int64")
72
73    # region borders
74    region_borders = np.diff(mapping)
75
76    # corresponding indexes
77    border_indexes = np.argwhere(region_borders)
78
79    lefts = border_indexes[::2] + 1  # because diff was used to get the index
80
81    # edge case, where first peak doesn't have left border
82    if mapping[border_indexes][0]:
83        # just preppend 0 as first left border
84        # mind the vstack, as np.argwhere produces a vector array
85        lefts = np.vstack((0, lefts))
86
87    rights = border_indexes[1::2]
88
89    # another edge case, where last peak doesn't have a right border
90    if mapping[-1]:  # True if last point identified as potential peak
91        # just append -1 as last peak right border
92        rights = np.vstack((rights, -1))
93
94    # columns as borders, rows as regions, i.e.
95    # :output:[0] -> first peak region
96    return np.hstack((lefts, rights))

Combine True values into their indexes arrays.

Args: mapping (:obj:np.array): Boolean mapping array to extract the indexes from.

Returns: :obj:np.array: 2D array with left and right borders of regions, where mapping is True.

Example:

combine_map_to_regions(np.array([True, True, False, True, False])) array([[0, 1], [3, 3]])

def filter_regions(x_data, peaks_regions):
 99def filter_regions(x_data, peaks_regions):
100    """Filter peak regions.
101
102    Peak regions are filtered to remove potential false positives (e.g. noise
103        spikes).
104
105    Args:
106        x_data (:obj:np.array): X data points, needed to pick up the data
107            resolution and map the region indexes to the corresponding data
108            points.
109        y_data (:obj:np.array): Y data points, needed to validate if the peaks
110            are actually present in the region and remove invalid regions.
111        peaks_regions (:obj:np.array): 2D Nx2 array with peak regions indexes
112            (rows) as left and right borders (columns).
113
114    Returns:
115        :obj:np.array: 2D Mx2 array with filtered peak regions indexes(rows) as
116            left and right borders (columns).
117    """
118
119    # filter peaks where region is smaller than spectrum resolution
120    # like single spikes, e.g. noise
121    # compute the regions first
122    x_data_regions = np.copy(x_data[peaks_regions])
123
124    # get arguments where absolute difference is greater than data resolution
125    resolution = np.absolute(np.mean(np.diff(x_data)))
126
127    # (N, 1) array!
128    valid_regions_map = np.absolute(np.diff(x_data_regions)) > resolution
129
130    # get their indexes, mind the flattening of all arrays!
131    valid_regions_indexes = np.argwhere(valid_regions_map.flatten()).flatten()
132
133    # filtering!
134    peaks_regions = peaks_regions[valid_regions_indexes]
135
136    return peaks_regions

Filter peak regions.

Peak regions are filtered to remove potential false positives (e.g. noise spikes).

Args: x_data (:obj:np.array): X data points, needed to pick up the data resolution and map the region indexes to the corresponding data points. y_data (:obj:np.array): Y data points, needed to validate if the peaks are actually present in the region and remove invalid regions. peaks_regions (:obj:np.array): 2D Nx2 array with peak regions indexes (rows) as left and right borders (columns).

Returns: :obj:np.array: 2D Mx2 array with filtered peak regions indexes(rows) as left and right borders (columns).

def filter_noisy_regions(y_data, peaks_regions):
139def filter_noisy_regions(y_data, peaks_regions):
140    """Remove noisy regions from given regions array.
141
142    Peak regions are filtered to remove false positive noise regions, e.g.
143        incorrectly assigned due to curvy baseline. Filtering is performed by
144        computing average peak points/data points ratio.
145
146    Args:
147        y_data (:obj:np.array): Y data points, needed to validate if the peaks
148            are actually present in the region and remove invalid regions.
149        peaks_regions (:obj:np.array): 2D Nx2 array with peak regions indexes
150            (rows) as left and right borders (columns).
151
152    Returns:
153        :obj:np.array: 2D Mx2 array with filtered peak regions indexes(rows) as
154            left and right borders (columns).
155    """
156
157    # compute the actual regions data points
158    y_data_regions = []
159    for region in peaks_regions:
160        y_data_regions.append(y_data[region[0] : region[-1]])
161
162    # compute noise data regions, i.e. in between peak regions
163    noise_data_regions = []
164    for row, _ in enumerate(peaks_regions):
165        try:
166            noise_data_regions.append(
167                y_data[peaks_regions[row][1] : peaks_regions[row + 1][0]]
168            )
169        except IndexError:
170            # exception for the last row -> discard
171            pass
172
173    # compute average peaks/data points ratio for noisy regions
174    noise_peaks_ratio = []
175    for region in noise_data_regions:
176        # protection from empty regions
177        if region.size != 0:
178            # minimum height is pretty low to ensure enough noise is picked
179            peaks, _ = scipy.signal.find_peaks(region, height=region.max() * 0.2)
180            noise_peaks_ratio.append(peaks.size / region.size)
181
182    # compute average with weights equal to the region length
183    noise_peaks_ratio = np.average(
184        noise_peaks_ratio, weights=[region.size for region in noise_data_regions]
185    )
186
187    # filtering!
188    valid_regions_indexes = []
189    for row, region in enumerate(y_data_regions):
190        peaks, _ = scipy.signal.find_peaks(region, height=region.max() * 0.2)
191        if peaks.size != 0 and peaks.size / region.size < noise_peaks_ratio:
192            valid_regions_indexes.append(row)
193
194    # protecting from complete cleaning
195    if not valid_regions_indexes:
196        return peaks_regions
197
198    peaks_regions = peaks_regions[np.array(valid_regions_indexes)]
199
200    return peaks_regions

Remove noisy regions from given regions array.

Peak regions are filtered to remove false positive noise regions, e.g. incorrectly assigned due to curvy baseline. Filtering is performed by computing average peak points/data points ratio.

Args: y_data (:obj:np.array): Y data points, needed to validate if the peaks are actually present in the region and remove invalid regions. peaks_regions (:obj:np.array): 2D Nx2 array with peak regions indexes (rows) as left and right borders (columns).

Returns: :obj:np.array: 2D Mx2 array with filtered peak regions indexes(rows) as left and right borders (columns).

def merge_regions(x_data, peaks_regions, d_merge, recursively=True):
203def merge_regions(x_data, peaks_regions, d_merge, recursively=True):
204    """Merge peak regions if distance between is less than delta.
205
206    Args:
207        x_data (:obj:np.array): X data points.
208        peaks_regions (:obj:np.array): 2D Nx2 array with peak regions indexes
209            (rows) as left and right borders (columns).
210        d_merge (float): Minimum distance in X data points to merge two or more
211            regions together.
212        recursively (bool, optional): If True - will repeat the procedure until
213            all regions with distance < than d_merge will merge.
214
215    Returns:
216        :obj:np.array: 2D Mx2 array with peak regions indexes (rows) as left and
217            right borders (columns), merged according to predefined minimal
218            distance.
219
220    Example:
221        >>> regions = np.array([
222                [1, 10],
223                [11, 20],
224                [25, 45],
225                [50, 75],
226                [100, 120],
227                [122, 134]
228            ])
229        >>> data = np.ones_like(regions) # ones as example
230        >>> merge_regions(data, regions, 1)
231        array([[  1,  20],
232               [ 25,  45],
233               [ 50,  75],
234               [100, 120],
235               [122, 134]])
236        >>> merge_regions(data, regions, 20, True)
237        array([[  1,  75],
238               [100, 134]])
239    """
240    # the code is pretty ugly but works
241    merged_regions = []
242
243    # converting to list to drop the data of the fly
244    regions = peaks_regions.tolist()
245
246    for i, _ in enumerate(regions):
247        try:
248            # check left border of i regions with right of i+1
249            if abs(x_data[regions[i][-1]] - x_data[regions[i + 1][0]]) <= d_merge:
250                # if lower append merge the regions
251                merged_regions.append([regions[i][0], regions[i + 1][-1]])
252                # drop the merged one
253                regions.pop(i + 1)
254            else:
255                # if nothing to merge, just append the current region
256                merged_regions.append(regions[i])
257        except IndexError:
258            # last row
259            merged_regions.append(regions[i])
260
261    merged_regions = np.array(merged_regions)
262
263    if not recursively:
264        return merged_regions
265
266    # if recursively, check for the difference
267    if (merged_regions == regions).all():
268        # done
269        return merged_regions
270
271    return merge_regions(x_data, merged_regions, d_merge, recursively=True)

Merge peak regions if distance between is less than delta.

Args: x_data (:obj:np.array): X data points. peaks_regions (:obj:np.array): 2D Nx2 array with peak regions indexes (rows) as left and right borders (columns). d_merge (float): Minimum distance in X data points to merge two or more regions together. recursively (bool, optional): If True - will repeat the procedure until all regions with distance < than d_merge will merge.

Returns: :obj:np.array: 2D Mx2 array with peak regions indexes (rows) as left and right borders (columns), merged according to predefined minimal distance.

Example:

regions = np.array([ [1, 10], [11, 20], [25, 45], [50, 75], [100, 120], [122, 134] ]) data = np.ones_like(regions) # ones as example merge_regions(data, regions, 1) array([[ 1, 20], [ 25, 45], [ 50, 75], [100, 120], [122, 134]]) merge_regions(data, regions, 20, True) array([[ 1, 75], [100, 134]])

def expand_regions(x_data, peaks_regions, d_expand):
274def expand_regions(x_data, peaks_regions, d_expand):
275    """Expand the peak regions by the desired value.
276
277    Args:
278        x_data (:obj:np.array): X data points.
279        peaks_regions (:obj:np.array): 2D Nx2 array with peak regions indexes
280            (rows) as left and right borders (columns).
281        d_expand (float): Value to expand borders to (in X data scale).
282
283    Returns:
284        :obj:np.array: 2D Nx2 array with expanded peak regions indexes (rows) as
285            left and right borders (columns).
286    """
287
288    data_regions = np.copy(x_data[peaks_regions])
289
290    # determine scale orientation, i.e. decreasing (e.g. ppm on NMR spectrum)
291    # or increasing (e.g. wavelength on UV spectrum)
292    if (data_regions[:, 0] - data_regions[:, 1]).mean() > 0:
293        # ppm-like scale
294        data_regions[:, 0] += d_expand
295        data_regions[:, -1] -= d_expand
296    else:
297        # wavelength-like scale
298        data_regions[:, 0] -= d_expand
299        data_regions[:, -1] += d_expand
300
301    # converting new values to new indexes
302    for index_, value in np.ndenumerate(data_regions):
303        data_regions[index_] = find_nearest_value_index(x_data, value)[1]
304
305    return data_regions.astype(int)

Expand the peak regions by the desired value.

Args: x_data (:obj:np.array): X data points. peaks_regions (:obj:np.array): 2D Nx2 array with peak regions indexes (rows) as left and right borders (columns). d_expand (float): Value to expand borders to (in X data scale).

Returns: :obj:np.array: 2D Nx2 array with expanded peak regions indexes (rows) as left and right borders (columns).