# Source code for spike.Algo.sane_old

#!/usr/bin/env python 
# encoding: utf-8

"""
sane.py
#########
Algorithm for denoising time series, named sane (standing for "Support Selection for Noise Elimination")

The main function is
sane(data, k)
data : the series to be denoised
k : the rank of the analysis

Copyright (c) 2015 IGBMC. All rights reserved.
Marc-Andr\'e Delsuc <madelsuc@unistra.fr>
Lionel Chiron <lionel.chiron@gmail.com>

This software is a computer program whose purpose is to compute sane denoising.

This software is governed by the CeCILL  license under French law and
abiding by the rules of distribution of free software.  You can  use, 
modify and/ or redistribute the software under the terms of the CeCILL
license as circulated by CEA, CNRS and INRIA at the following URL
"http://www.cecill.info". 

As a counterpart to the access to the source code and  rights to copy,
modify and redistribute granted by the license, users are provided only
with a limited warranty  and the software's author,  the holder of the
economic rights,  and the successive licensors  have only  limited
liability. 

In this respect, the user's attention is drawn to the risks associated
with loading,  using,  modifying and/or developing or reproducing the
software by the user in light of its specific status of free software,
that may mean  that it is complicated to manipulate,  and  that  also
therefore means  that it is reserved for developers  and  experienced
professionals having in-depth computer knowledge. Users are therefore
encouraged to load and test the software's suitability as regards their
requirements in conditions enabling the security of their systems and/or 
data to be ensured and,  more generally, to use and operate it in the 
same conditions as regards security. 

The fact that you are presently reading this means that you have had
knowledge of the CeCILL license and that you accept its terms.

First created by Lionel Chiron and Marc-Andr\'e Delsuc in September 2015.

associated publications
- Bray, F., Bouclon, J., Chiron, L., Witt, M., Delsuc, M.-A., & Rolando, C. (2017).
  Nonuniform Sampling Acquisition of Two-Dimensional Fourier Transform Ion Cyclotron Resonance Mass Spectrometry for Increased Mass Resolution of Tandem Mass Spectrometry Precursor Ions.
  Analytical Chemistry, acs.analchem.7b01850. http://doi.org/10.1021/acs.analchem.7b01850

"""

from __future__ import print_function, division
import numpy as np
import numpy.linalg as linalg
from numpy.fft import fft, ifft
import unittest
import time
from scipy.linalg import norm
from math import sqrt
from ..util.signal_tools import findnoiselevel, mfft, mrfft

debug = 0 # set to 1 to print debugging messages
###################
# The following code is used to speed up the FFTs (it finds FFT-friendly sizes);
# it is borrowed from scipy.signal
def _next_regular(target):
    """
    Find the next regular number greater than or equal to target.
    Regular numbers are composites of the prime factors 2, 3, and 5.
    Also known as 5-smooth numbers or Hamming numbers, these are the optimal
    size for inputs to FFTPACK.

    Target must be a positive integer.
    """
    if target <= 6:
        return target
    # Quickly check if it's already a power of 2
    if not (target & (target-1)):
        return target
    match = float('inf')  # Anything found will be smaller
    p5 = 1
    while p5 < target:
        p35 = p5
        while p35 < target:
            # Ceiling integer division, avoiding conversion to float
            # (quotient = ceil(target / p35))
            quotient = -(-target // p35)
            # Quickly find next power of 2 >= quotient
            try:
                p2 = 2**((quotient - 1).bit_length())
            except AttributeError:
                # Fallback for Python <2.7
                p2 = 2**(len(bin(quotient - 1)) - 2)
            N = p2 * p35
            if N == target:
                return N
            elif N < match:
                match = N
            p35 *= 3
            if p35 == target:
                return p35
        if p35 < match:
            match = p35
        p5 *= 5
        if p5 == target:
            return p5
    if p5 < match:
        match = p5
    return match
#####################
def sane(data, k, orda = None, iterations = 1, trick = True, optk = False, ktrick = False):
    """
    sane algorithm. Name stands for Support Selection for Noise Elimination.
    From a data series, return a denoised series.

    data : the series to be denoised - a (normally complex) numpy buffer
    k : the rank of the analysis
    orda : the order of the analysis
        internally, a Hankel matrix (M,N) is handled (through fast FFT products),
        with M = orda and N = len(data)-orda+1
        if None (default) orda = len(data)//2
    iterations : the number of times the operation should be repeated
    optk : if set to True, the rank k is replaced by the value estimated by
        OPTK(data, orda).find_best_rank() (automatic noise level estimate).
    trick : enhances the denoising by using a cleaned signal as the projective
        space ("Support Selection"): on the second pass the original data are
        projected on the basis computed from the already denoised data.
    ktrick : if a value is given, it is used as the rank of the second pass.
        The idea is that a rank large enough to compensate for the noise is used
        on the first pass, while a lower rank can be used on the second one.

    returns the denoised series, with the same length as data
        (only the real part when data.dtype is float).
        If data is entirely zero, data itself is returned untouched.

    raises ValueError if orda is larger than half the (padded) data length,
        or if the rank is not smaller than orda.

    ########
    values are such that
    orda <= (len(data)+1)//2
    k < orda
    N = len(data)-orda+1
    Omega is (N x k)

    Sane is based on the same idea as urQRd, with a cleverer selection of the
    basis on which the random projection is performed, which allows a better
    recovery of small signals buried into the noise.

    The flags trick, optk, ktrick control the program;
    when all are false, sane folds back to the urQRd algorithm.
    Optimal is certainly trick = True, optk = True, ktrick = True, but this is
    not fully tested yet.

    ##########
    A rank a little above the number of peaks has to be given. After a first
    pass this produces a filtering basis containing almost only signal; on the
    second pass the full signal is projected on this new filtering subspace.
    A higher number of iterations decreases the smallest amplitudes even more.
    """
    if not orda:
        orda = (data.size)//2                       # default order
    # Nothing to denoise in an empty (all-zero) series: leave early, before any
    # rank estimation is attempted on it.
    if np.allclose(data, 0.0):
        return data
    if optk:                                        # optimal rank, ensures that all the signal part is retrieved
        optrank = OPTK(data, orda)
        k = optrank.find_best_rank()

    #### optimize sizes to next regular (FFT friendly) values
    L = len(data)
    if L > 340:     # last big step is _next_regular(325) == 360; then max extension is 6%
        Lr = _next_regular(L)
        orda_r = 2*Lr - _next_regular(2*Lr - orda)
        if debug > 0 and (L != Lr or orda != orda_r):
            print("SANE regularisation %d %d => %d %d"%(L, orda, Lr, orda_r))
    else:
        Lr = L
        orda_r = orda
    if L != Lr:
        data_r = np.concatenate((data, np.zeros(Lr-L)))    # zero-padded copy
    else:
        data_r = data                                      # just a link

    if 2*orda_r > data_r.size:                      # checks that orda is not too large
        raise ValueError('order is too large')
    if k >= orda_r:                                 # checks that the rank is not too large
        raise ValueError('rank is too large, or orda is too small')

    N = len(data_r) - orda_r + 1
    dd = data_r.copy()
    for i in range(iterations + 1):
        second_pass = (i == 1)
        rank = ktrick if (second_pass and ktrick) else k
        Omega = np.random.normal(size = (N, rank))  # random real gaussian matrix, N x rank
        if second_pass and not trick:
            # classical urQRd: pass 1 is skipped. Omega is still drawn above so
            # that the sequence of random draws is the same as it always was.
            continue
        if second_pass:
            dataproj = data_r.copy()    # project the original data on the denoised basis "dd"
        else:
            dataproj = dd.copy()        # plain urQRd iteration (first passage when trick is on)
        Q, QstarH = saneCore(dd, dataproj, Omega)   # Projection : H = QQ*H
        dd = Fast_Hankel2dt(Q, QstarH)

    denoised = dd
    if data.dtype == "float":   # kludge: a complex data-set is expected - use the analytic signal if your data are real
        denoised = np.real(denoised)
    return denoised[:L]
def saneCore(dd, data, Omega):
    '''
    Core of the sane algorithm.

    dd : series generating the Hankel matrix that is sketched with Omega
    data : series generating the Hankel matrix H that is projected
    Omega : matrix used for the sketch (one column per rank)

    returns (Q, QstarH) where Q comes from the QR decomposition of the sketch
    and QstarH is Q* H, so that the approximation of H is given by Q QstarH
    '''
    sketch = FastHankel_prod_mat_mat(dd, Omega)
    # only the orthonormal factor of the QR decomposition is needed
    Q = linalg.qr(sketch)[0]
    # conjugating before and after the product turns the Hankel product into Q* H
    projected = FastHankel_prod_mat_mat(data.conj(), Q)
    QstarH = projected.conj().T
    return Q, QstarH
def vec_mean(M, L):
    '''
    Vector for calculating the mean from the sum on the antidiagonal.
    data = vec_sum*vec_mean

    M : number of rows of the Hankel matrix
    L : number of antidiagonals, i.e. L = M+N-1 for a (M x N) matrix

    returns a float array of length L holding 1/(number of entries) for each
    antidiagonal.

    The count of antidiagonal i is min(i+1, L-i, M, N); this is valid for any
    matrix shape (including M >= N), not only for L >= 2*M.

    raises ValueError if M is not in the range 1..L
    '''
    N = L - M + 1                       # number of columns
    if M < 1 or N < 1:
        raise ValueError('M should be between 1 and L')
    idx = np.arange(L)
    # ramp up / ramp down, capped by the smallest matrix dimension
    counts = np.minimum(np.minimum(idx + 1, L - idx), min(M, N))
    return 1.0/counts
def FastHankel_prod_mat_mat(gene_vect, matrix):
    '''
    Fast Hankel structured matrix - matrix product.

    gene_vect : generator vector of the Hankel matrix H, H[i,j] = gene_vect[i+j]
    matrix : (N x K) array to be multiplied by H

    returns the (M x K) complex array H.matrix, with M = len(gene_vect)-N+1.
    H is never built: each column is obtained with an FFT based circular
    convolution, the same computation as FastHankel_prod_mat_vec.

    raises ValueError if gene_vect is too short for the matrix (M < 1)
    '''
    N, K = matrix.shape
    L = len(gene_vect)
    M = L - N + 1
    if M < 1:
        raise ValueError('generator vector is too short for this matrix')
    data = np.zeros(shape = (M, K), dtype = complex)
    gene_fft = fft(gene_vect)                   # does not depend on the column: computed once
    padded = np.zeros(L, dtype = matrix.dtype)  # the M-1 leading zeros are shared by all columns
    for k in range(K):
        padded[M-1:] = matrix[::-1, k]          # reversed column, completed with zeros to length L
        conv = ifft(gene_fft*fft(padded))       # circular convolution
        # equivalent to np.roll(conv, +1)[:M]
        data[0, k] = conv[-1]
        data[1:, k] = conv[:M-1]
    return data
def FastHankel_prod_mat_vec(gene_vect, prod_vect):
    """
    Product of the Hankel matrix generated by gene_vect with the vector prod_vect.

    The Hankel matrix H (H[i,j] = gene_vect[i+j]) is never built; the product
    is obtained from a circular convolution computed with FFTs.

    returns a vector of length M = len(gene_vect) - len(prod_vect) + 1
    """
    n_gene = len(gene_vect)                 # length of the generator vector
    n_prod = len(prod_vect)                 # length of the multiplied vector
    n_out = n_gene - n_prod + 1             # M, length of the result
    # reversed vector, preceded by zeros so that it has the generator length
    padded = np.concatenate((np.zeros(n_out - 1), prod_vect[::-1]))
    # convolution theorem: product in Fourier space, then back
    circular = ifft(fft(gene_vect)*fft(padded))
    # shifting by one sample brings the Hankel product to the first M entries
    return np.roll(circular, +1)[:n_out]
def Fast_Hankel2dt(Q,QH):
    '''
    Goes back to a data series from Q (M x K) and QstarH (K x N).

    For each rank k, the sums along the antidiagonals of the outer product
    of column k of Q with row k of QH are computed with FastHankel_prod_mat_vec
    (used here as a fast Toeplitz product). The sums are accumulated over k and
    finally turned into means with vec_mean.

    returns a complex series of length L = M+N-1
    '''
    M, K = Q.shape
    K, N = QH.shape
    L = M + N - 1
    zero_pad = np.zeros(N - 1)
    antidiag_sum = np.zeros((L,), dtype = complex)
    for k in range(K):
        # column of Q surrounded by zeros : generator vector for the Toeplitz matrix
        generator = np.concatenate((zero_pad, Q[:, k], zero_pad))
        antidiag_sum += FastHankel_prod_mat_vec(generator, QH[k, ::-1])
    # from the sum on each antidiagonal to the mean
    return antidiag_sum*vec_mean(M, L)
class OPTK(object):
    '''
    Class for finding the best rank for classical sane.
    The rank is calculated so as to permit the retrieving of the signal power at high enough level.
    Passed parameters are the Fid "fid", the order "orda", the fraction "prec" of the
    signal subspace that should be retrieved, and a "debug" flag.
    The number of lines "estim_nbpeaks" is not passed, it is estimated from the spectrum.
    '''
    def __init__(self, fid, orda, prec = 0.9, debug = False):
        self.norm_show = False                      # Show the evolution of the norm
        self.list_xis = []                          # Subspace signal
        self.list_xin = []                          # Subspace noise
        self.psig = 0                               # signal power
        self.pnoise = 0                             # noise power
        self.fid = fid                              # initial Fid.
        self.xis = 0                                # initial signal dimension retrieved
        self.xin = 0                                # initial noise dimension retrieved
        self.above_noise = 4                        # correction for noise level.. threshold = noiselevel*above_noise
        self.step_k = 20                            # rank step for calculating the norm
        self.list_sane_norm = []                    # list of norm values with sane.
        self.estim_nbpeaks = None                   # estimation of the number of lines.
        self.orda = orda
        self.prec = prec
        self.debug = debug

    def subspace_filling_with_rank(self):
        '''
        Estimation of the signal and noise subspace filling in function of the rank.
        Fills self.list_xis and self.list_xin with one value per rank, from 1 to orda.
        '''
        self.separate_signal_from_noise()
        self.psig = norm(self.spec_trunc)**2                    # Calculation of the signal power
        self.pnoise = norm(self.spec - self.spec_trunc)**2      # Calculation of the noise power
        self._fill_subspaces()

    def _fill_subspaces(self):
        '''
        Iterates the filling of the signal and noise subspaces, from self.psig,
        self.pnoise, self.estim_nbpeaks and self.orda.

        raises ValueError if no peak was detected, or if orda is not larger
        than the number of peaks (the recurrence divides by both dimensions).
        '''
        nb_signal = self.estim_nbpeaks              # dimension of the signal subspace
        if not nb_signal:
            raise ValueError('no peak detected, the rank cannot be optimized')
        nb_noise = self.orda - nb_signal            # dimension of the noise subspace
        if nb_noise <= 0:
            raise ValueError('orda should be larger than the estimated number of peaks')
        for _ in range(self.orda):
            sig_left = self.psig*(1 - self.xis)**2          # signal power not yet retrieved
            noise_left = self.pnoise*(1 - self.xin)**2      # noise power not yet retrieved
            empty = sig_left + noise_left                   # power not yet retrieved.
            # true division is required here: the ratios lie in [0, 1],
            # floor division would freeze xis and xin at 0
            self.xis += sig_left/empty/nb_signal            # part of signal dimension retrieved
            self.xin += noise_left/empty/nb_noise           # part of noise dimension retrieved
            self.list_xis.append(self.xis)
            self.list_xin.append(self.xin)

    def separate_signal_from_noise(self):
        '''
        Signal is separated from Noise and kept in self.spec_trunc
        self.spec : spectrum of the fid
        self.noiselev : level of the noise
        self.estim_nbpeaks : estimation of the number of peaks in the spectrum
        '''
        # NOTE: only the 'complex' and 'float' dtypes are handled,
        # self.spec is not set for any other dtype.
        if self.fid.dtype == 'complex':
            self.spec = mfft(self.fid)
            if self.debug:
                print("complex fid")
        elif self.fid.dtype == 'float':
            self.spec = mrfft(self.fid)
            if self.debug:
                print("real fid")
        self.spec_trunc = self.spec.copy()
        self.noiselev = findnoiselevel(self.spec, nbseg = 10)   # finds noise level
        ###
        nbseg = 20
        less = len(self.spec)%nbseg         # rest of division of length of data by nb of segment
        restpeaks = self.spec[less:]        # remove the points that prevent dividing the data in segments of same size.
        mean_level = restpeaks.mean()
        self.noiselev += mean_level
        threshold = self.above_noise*self.noiselev
        if self.debug:
            print("noiseleve found is ", self.noiselev)
        self.spec_trunc[self.spec < threshold] = 0              # truncates spectrum under noise level
        peaks = self.peaks1d(self.spec, threshold = threshold)
        self.estim_nbpeaks = len(peaks)
        if self.debug:
            print("self.estim_nbpeaks ", self.estim_nbpeaks)
            print("self.above_noise*self.noiselev ", self.above_noise*self.noiselev)

    def find_best_rank(self):
        '''
        Finds the optimal rank
        optk : optimal rank, index of the first value of list_xis closest to self.prec
        '''
        self.subspace_filling_with_rank()                       # Subspaces filling
        diff = abs(np.array(self.list_xis) - self.prec)
        minval = (diff).min()
        optk = list(diff).index(minval)
        if self.debug:
            print("optimal rank is ", optk)
        return optk

    def peaks1d(self, fid, threshold = 0.1):
        '''
        Extracts peaks from a 1d array:
        points above threshold that are larger than both neighbours
        (neighbours are taken circularly, with np.roll).
        Returns the array of peak indices, listpk[0]
        '''
        listpk = np.where((fid > threshold) &                   # thresholding
                          (fid > np.roll(fid,  1, 0)) &
                          (fid > np.roll(fid, -1, 0)))          # roll 1 and -1 on axis 0
        return listpk[0]
def test_sane_gene(
        lendata = 10000,
        rank = 100,
        orda = 4000,
        nbpeaks = 2,
        noise = 50.0,
        noisetype = "additive",
        nb_iterat = 1,
        trick = False ):
    """
    ============== example of use of sane on a synthetic data-set ===============

    Builds a noisy and a noise-free synthetic signal with fid_signoise_type,
    denoises the noisy one with sane, prints the SNR before and after, and
    displays signals and spectra.

    returns (iSNR, fSNR), the SNR in dB before and after denoising
    """
    from ..Display import testplot
    plt = testplot.plot()
    from ..util.dynsubplot import subpl
    from ..util.signal_tools import fid_signoise, fid_signoise_type, SNR_dB, mfft
    superimpose = False     # switch on to draw the denoised spectrum under the noisy one

    print("=== Running rQR algo ===", "lendata:", lendata, " orda:", orda, ' rank:', rank)

    # noisy signal first, then the same signal generated without noise
    noisy = fid_signoise_type(nbpeaks, lendata, noise, noisetype)
    noisy_spec = mfft(noisy)
    clean = fid_signoise_type(nbpeaks, lendata, 0, noisetype)
    clean_spec = mfft(clean)
    iSNR = SNR_dB(noisy, clean)
    print("Initial Noisy Data SNR: %.2f dB - noise type : %s"%(iSNR, noisetype))

    # denoising, with timing
    start = time.time()
    denoised = sane(noisy, k = rank, orda = orda, optk = False, iterations = nb_iterat, trick = trick)
    elapsed = time.time() - start
    denoised_spec = mfft(denoised)
    print("=== Result ===")
    fSNR = SNR_dB(denoised, clean)
    gain = fSNR - iSNR
    print("Denoised SNR: %.2f dB - processing gain : %.2f dB"%(fSNR, gain))
    print("processing time for sane : %f sec"%elapsed)
    print(gain)

    # display : signals on the left, spectra on the right
    sub = subpl(2, plt)
    sub.next()
    sub.plot(clean, 'b', label = "clean signal")
    sub.title('data series')
    sub.next()
    sub.plot(clean_spec, 'b', label = "clean spectrum")
    sub.title('FFT spectrum')
    sub.next()
    sub.plot(noisy, 'k', label = "noisy signal")
    sub.next()
    if superimpose:
        sub.plot(denoised_spec, 'r', label = 'sane {} iteration(s)'.format(nb_iterat))
    sub.plot(noisy_spec, 'k', label = "noisy spectrum")
    sub.next()
    sub.plot(denoised, 'r', label = 'sane filtered signal')
    sub.next()
    sub.plot(denoised_spec, 'r', label = 'sane filtered spectrum')
    sub.title("Noise type : " + noisetype)
    sub.show()
    return (iSNR, fSNR)
def test_sane(
        lendata = 10000,
        rank = 100,
        orda = 4000,
        noise = 200.0,
        iterations=1,
        noisetype = "additive"):
    """
    ============== example of use of Sane on a synthetic data-set ===============

    Builds a synthetic signal of 8 damped complex lines, degrades it according
    to noisetype ("additive", "multiplicative", "sampling" or "missing points"),
    denoises it with sane, prints the SNR before and after, and plots the result.

    raises Exception for any other noisetype
    """
    import time
    import matplotlib.pyplot as plt
    from matplotlib.ticker import MaxNLocator

    def plot_param(fig,fignumb):
        "adds subplot fignumb to fig, with at most 4 major ticks on each axis"
        axes = fig.add_subplot(fignumb)
        for axis in (axes.xaxis, axes.yaxis):
            axis.set_major_locator(MaxNLocator(4))

    def mfft(v):
        "utility that returns the modulus of the fft of v"
        import scipy.fftpack as fft
        spectrum = fft.fft(v)
        return np.real(np.sqrt(spectrum*spectrum.conj()))

    def SNR(noisy,target):
        "computes and return SNR value, in dB"
        signal_power = sum(abs(target)**2)
        error_power = sum(abs(noisy - target)**2)
        return 10*np.log10(signal_power/error_power)

    def lines(times):
        "sum of the damped lines, sampled at the given times"
        fid = np.zeros(lendata,dtype=complex)
        for amp, freq in zip(Amp, Freq):
            fid += amp * np.exp(freq*times) * np.exp(-LB*times)
        return fid

    ################################################
    # Create the data
    nbpeaks = 8     # number of simulated signals
    LB = 1.11       # linewidth
    Freq = [(i+1+np.sqrt(10))*np.pi*500.0j for i in range(nbpeaks)]    # frequencies
    Amp = [(i+1)*20 for i in range(nbpeaks)]                           # amplitudes

    # refuse unknown noise types before anything is computed
    if noisetype not in ("additive", "multiplicative", "sampling", "missing points"):
        raise Exception("unknown noise type")

    x = np.arange(lendata*1.0)/lendata      # time series
    if noisetype == "sampling":
        # jittered time axis, drawn before the signals are built
        xn = x + 0.5*np.random.randn(x.size)/lendata
    data0 = lines(x)                        # noise-free reference

    if noisetype == "additive":
        # additive complex noise
        data = data0 + noise*(np.random.randn(x.size)+1j*np.random.randn(x.size))
    elif noisetype == "multiplicative":
        # noise on the amplitude and on the frequency of each line
        data = np.zeros(lendata,dtype=complex)
        Anoise = noise/2
        Fnoise = noise/200
        for i in range(nbpeaks):
            nAmp = Amp[i] + Anoise*np.random.randn(x.size)
            nFreq = Freq[i] + Fnoise*np.random.randn(x.size)
            data += nAmp * np.exp(nFreq*x) * np.exp(-LB*x)
    elif noisetype == "sampling":
        # same lines, sampled on the jittered time axis
        data = lines(xn)
    else:
        # "missing points" : each point is randomly kept or zeroed
        miss = np.random.randint(2, size=len(x))
        data = data0*miss

    iSNR = SNR(data,data0)
    print ("Initial Noisy Data SNR: %.2f dB - noise type : %s"%(iSNR,noisetype))

    fdata = mfft(data0)         # FFT of noiseless signal
    fdatanoise = mfft(data)     # FFT of noisy signal

    ###########
    print ("=== Running Sane algo ===")
    print ("lendata:",lendata)
    print (" orda:",orda)
    print (' rank:',rank)
    start = time.time()
    datasane = sane(data, k=rank, orda=orda, iterations=iterations)    # denoise signal
    tsane = time.time() - start
    fdatasane = mfft(datasane)  # FFT of sane denoised signal
    print( "=== Result ===")
    fSNR = SNR(datasane, data0)
    print ("Denoised SNR: %.2f dB - processing gain : %.2f dB"%( fSNR, fSNR-iSNR ))
    print ("processing time for sane : %.2f sec"%tsane)

    ################################################################# Plotting
    fig = plt.figure()
    # (subplot, curve, colour, label, title) - signals on the left, spectra on the right
    panels = [
        (321, data0.real,    'b', "clean signal",         'data series'),
        (323, data.real,     'k', "noisy signal",         None),
        (325, datasane.real, 'r', 'sane filtered signal', None),
        (322, fdata,         'b', "clean spectrum",       'FFT spectrum'),
        (324, fdatanoise,    'k', "noisy spectrum",       None),
    ]
    for position, curve, colour, label, title in panels:
        plot_param(fig, position)
        plt.plot(curve, colour, label=label)
        plt.legend()
        if title is not None:
            plt.title(title)
    plot_param(fig,326)
    plt.plot(fdatasane, 'r', label='sane filtered spectrum')
    plt.suptitle("Noise type : "+noisetype)
    plt.legend()
    plt.show()
class sane_Tests(unittest.TestCase):
    '''
    Unit tests for the sane denoising, based on the synthetic example test_sane_gene.
    '''
    def test_sane(self):
        '''
        Makes sane without trick and 1 iteration.
        Checks the SNR before denoising and a lower bound of the SNR after.
        '''
        settings = dict(lendata = 10000, rank = 30, orda = 4000, nbpeaks = 2,
                        noise = 10.0, noisetype = "additive", nb_iterat = 1)
        snr_before, snr_after = test_sane_gene(**settings)
        self.assertAlmostEqual(snr_before, 6, 0)
        self.assertTrue(snr_after > 28)

    def test_sane_iter_trick(self):
        '''
        Makes sane with trick and varying the number of iterations (1 to 3).
        Only checks that it runs.
        '''
        settings = dict(lendata = 10000, rank = 4, orda = 4000, nbpeaks = 2,
                        noise = 50.0, noisetype = "additive", trick = True)
        for n_iter in (1, 2, 3):
            test_sane_gene(nb_iterat = n_iter, **settings)

    def _test_optim(self):
        '''
        Test of the rank optimization.
        The algorithm finds the minimal rank restituting the signal with complete power.
        The leading underscore keeps this test out of the automatic test discovery.
        '''
        from ..Display import testplot
        from numpy.fft import fft
        plt = testplot.plot()
        from ..util.signal_tools import fid_signoise
        lengthfid = 10000       # length of the Fid.
        # 15 peaks of amplitude 2, with a white noise of amplitude 20
        fid = fid_signoise(15, 2, lengthfid = lengthfid, noise = 20)
        plt.plot(abs(fft(fid)))
        ########
        estimator = OPTK(fid, orda = lengthfid//4, debug = True)
        optk = estimator.find_best_rank()       # optimal rank estimation.
        print("optk", optk)
        self.assertAlmostEqual(optk, 66, 0)
        plt.show()