# Source code for frontend.features

# -*- coding: utf-8 -*-
#
# This file is part of SIDEKIT.
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#    
# SIDEKIT is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the License, 
# or (at your option) any later version.
#
# SIDEKIT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with SIDEKIT.  If not, see <http://www.gnu.org/licenses/>.

"""
Copyright 2014-2015 Anthony Larcher and Sylvain Meignier

:mod:`frontend` provides methods to process an audio signal in order to extract
useful parameters for speaker verification.
"""

__author__ = "Anthony Larcher and Sylvain Meignier"
__copyright__ = "Copyright 2014-2015 Anthony Larcher and Sylvain Meignier"
__license__ = "LGPL"
__version__ = "1.0"
__maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'


import numpy as np
import multiprocessing
from scipy.signal import hamming
from scipy.fftpack.realtransforms import dct
from sidekit.frontend.vad import *
from sidekit.frontend.io import *
from sidekit.frontend.normfeat import *
#from memory_profiler import profile
import gc

def hz2mel(f):
    """Convert frequencies in Hz to the mel scale.

    :param f: frequency to convert; a scalar or any array-like of
        frequencies in Hertz.

    :return: the equivalent value(s) on the mel scale, computed as
        ``1127.01048 * ln(1 + f / 700)``.
    """
    # np.asarray lets plain lists/tuples through; dividing by a float
    # literal guarantees true division even for integer input.
    return 1127.01048 * np.log(np.asarray(f) / 700.0 + 1)
def mel2hz(m):
    """Convert mel values to frequencies in Hz.

    :param m: mel value(s) to convert; a scalar or any array-like.

    :return: the equivalent value(s) in Hertz, computed as
        ``700 * (exp(m / 1127.01048) - 1)``.
    """
    # np.asarray lets plain lists/tuples through (a bare list cannot be
    # divided by a float).
    return (np.exp(np.asarray(m) / 1127.01048) - 1) * 700
def compute_delta(features, win=3, method='filter',
                  filt=np.array([.25, .5, .25, 0, -.25, -.5, -.25])):
    """Compute delta (temporal derivative) coefficients of a feature stream.

    ``features`` is a 2D array in which each row is a frame.

    :param features: the feature frames to compute the delta coefficients
    :param win: parameter that sets the length of the computation window.
        The size of the window is (win x 2) + 1
    :param method: method used to compute the delta coefficients;
        ``'diff'`` builds a two-tap difference filter of length
        ``2 * win + 1``, any other value uses ``filt``
    :param filt: definition of the filter to use in "filter" mode.
        A filter similar to SPRO4 would be
        ``np.array([.2, .1, 0, -.1, -.2])``

    :return: the delta coefficients computed on the original features,
        one row per input frame.
    """
    features = np.asarray(features, dtype=float)

    if method == 'diff':
        # NOTE(review): applied through a convolution, this filter yields
        # x[t - win] - x[t + win], i.e. the opposite sign of the default
        # "filter" mode. Kept as in the original; confirm it is intended.
        filt = np.zeros(2 * win + 1)
        filt[0] = -1
        filt[-1] = 1
    else:
        filt = np.asarray(filt, dtype=float)

    n_frames = features.shape[0]
    if n_frames == 0:
        # Nothing to differentiate; avoid indexing a non-existent frame.
        return np.zeros(features.shape)

    # The first and last frames are replicated at the beginning and the end
    # of the stream to avoid border effects. The padding must cover half of
    # the filter so that no implicit zero enters the convolution.
    pad = max(win, len(filt) // 2)
    padded = np.pad(features, ((pad, pad), (0, 0)), mode='edge')

    delta = np.empty(features.shape)
    for i in range(features.shape[1]):
        # 'same' keeps the filter centred on each frame; the slice then
        # drops the rows that correspond to the replicated padding.
        smoothed = np.convolve(padded[:, i], filt, mode='same')
        delta[:, i] = smoothed[pad:pad + n_frames]

    return delta
def trfbank(fs, nfft, lowfreq, maxfreq, nlinfilt, nlogfilt, midfreq=1000):
    """Compute triangular filterbank for cepstral coefficient computation.

    :param fs: sampling frequency of the original signal.
    :param nfft: number of points for the Fourier Transform
    :param lowfreq: lower limit of the frequency band filtered
    :param maxfreq: higher limit of the frequency band filtered
    :param nlinfilt: number of linear filters to use in low frequencies
    :param nlogfilt: number of log-linear filters to use in high frequencies
    :param midfreq: frequency boundary between linear and log-linear filters

    :return: a tuple ``(fbank, freqs)``: the filter bank as an array of
        shape ``(nlinfilt + nlogfilt, nfft // 2 + 1)`` and the array of the
        ``nlinfilt + nlogfilt + 2`` edge/central frequencies of the filters.
    """
    # Total number of filters
    nfilt = nlinfilt + nlogfilt

    # ------------------------
    # Compute the filter bank
    # ------------------------
    # Compute start/middle/end points of the triangular filters in the
    # spectral domain
    freqs = np.zeros(nfilt + 2)
    if nlogfilt == 0:
        # Linear filters only
        linsc = (maxfreq - lowfreq) / float(nlinfilt + 1)
        freqs[:nlinfilt + 2] = lowfreq + np.arange(nlinfilt + 2) * linsc
    elif nlinfilt == 0:
        # Mel-spaced filters only
        low_mel = hz2mel(lowfreq)
        max_mel = hz2mel(maxfreq)
        melsc = (max_mel - low_mel) / (nfilt + 1)
        mels = low_mel + np.arange(nlogfilt + 2) * melsc
        # Back to the frequency domain
        freqs = mel2hz(mels)
    else:
        # Compute linear filters on [lowfreq; midfreq]
        boundary = min(midfreq, maxfreq)
        linsc = (boundary - lowfreq) / float(nlinfilt + 1)
        freqs[:nlinfilt] = lowfreq + np.arange(nlinfilt) * linsc

        # Compute log-linear filters on [midfreq; maxfreq]. The boundary is
        # the same one used for the linear part (it was hard-coded to
        # 1000 Hz, which only matched the default value of midfreq).
        low_mel = hz2mel(boundary)
        max_mel = hz2mel(maxfreq)
        melsc = (max_mel - low_mel) / (nlogfilt + 1)

        # Verify that mel2hz(melsc) > linsc
        while mel2hz(melsc) < linsc:
            # This configuration does not work: turn one log filter into
            # a linear filter and recompute the mel spacing.
            logging.debug('nlinfilt = %s nlogfilt = %s ne fonctionne pas',
                          nlinfilt, nlogfilt)
            nlinfilt += 1
            nlogfilt -= 1
            freqs[:nlinfilt] = lowfreq + np.arange(nlinfilt) * linsc
            low_mel = hz2mel(freqs[nlinfilt - 1] + 2 * linsc)
            melsc = (max_mel - low_mel) / (nlogfilt + 1)

        mels = low_mel + np.arange(nlogfilt + 2) * melsc
        # Back to the frequency domain
        freqs[nlinfilt:] = mel2hz(mels)

    # Height of each triangle, derived from the width of its base
    heights = 2. / (freqs[2:] - freqs[:-2])

    # Compute filterbank coeff (in fft domain, in bins). The number of
    # columns must be an integer: a float shape is rejected by NumPy.
    fbank = np.zeros((nfilt, int(np.floor(nfft / 2.)) + 1))
    # FFT bins (in Hz)
    nfreqs = np.arange(nfft) / (1. * nfft) * fs

    for i in range(nfilt):
        low = freqs[i]
        cen = freqs[i + 1]
        hi = freqs[i + 2]

        low_bin = int(np.floor(low * nfft / fs)) + 1
        cen_bin = int(np.floor(cen * nfft / fs)) + 1
        hi_bin = min(int(np.floor(hi * nfft / fs)) + 1, nfft)

        # Bins on the rising and on the falling edge of the triangle
        lid = np.arange(low_bin, cen_bin, dtype=int)
        rid = np.arange(cen_bin, hi_bin, dtype=int)
        lslope = heights[i] / (cen - low)
        rslope = heights[i] / (hi - cen)

        fbank[i][lid] = lslope * (nfreqs[lid] - low)
        # As in the original implementation, the last bin of the falling
        # edge is left at zero.
        fbank[i][rid[:-1]] = rslope * (hi - nfreqs[rid[:-1]])

    return fbank, freqs
def mfcc(input, lowfreq=100, maxfreq=8000, nlinfilt=0, nlogfilt=24,
         nwin=256, nfft=512, fs=16000, nceps=13, midfreq=1000,
         shift=0.01, get_spec=False, get_mspec=False):
    """Compute Mel Frequency Cepstral Coefficients.

    :param input: input signal from which the coefficients are computed.
        Input audio is supposed to be RAW PCM 16bits
    :param lowfreq: lower limit of the frequency band filtered.
        Default is 100Hz.
    :param maxfreq: higher limit of the frequency band filtered.
        Default is 8000Hz.
    :param nlinfilt: number of linear filters to use in low frequencies.
        Default is 0.
    :param nlogfilt: number of log-linear filters to use in high
        frequencies. Default is 24.
    :param nwin: length of the sliding window. Default is 256.
    :param nfft: number of points for the Fourier Transform. Default is 512.
    :param fs: sampling frequency of the original signal.
        Default is 16000Hz.
    :param nceps: number of cepstral coefficients to extract.
        Default is 13.
    :param midfreq: frequency boundary between linear and log-linear
        filters. Default is 1000Hz.
    :param shift: shift between two analyses. Default is 0.01 (10ms).
    :param get_spec: if True, the magnitude spectrum is returned as the
        third element of the result, otherwise None is returned there.
    :param get_mspec: if True, the log10 filter-bank spectrum is returned
        as the fourth element of the result, otherwise None.

    :return: a list ``[ceps, log_energy, spec, mspec]`` where ``ceps``
        holds the cepstral coefficients (one row per frame, C0 removed),
        ``log_energy`` the log-energy of each frame, and ``spec`` /
        ``mspec`` are ndarrays or None depending on ``get_spec`` and
        ``get_mspec``.

    .. note:: MFCC are computed as follows:

            - Pre-processing in time-domain (pre-emphasizing)
            - Compute the spectrum amplitude by windowing with a Hamming
              window
            - Filter the signal in the spectral domain with a triangular
              filter-bank, whose filters are approximately linearly spaced
              on the mel scale, and have equal bandwidth in the mel scale
            - Compute the DCT of the log-spectrum
            - Log-energy is returned separately, as the second element of
              the result.

        For more details, refer to [Davis80]_.
    """
    # Pre-emphasis factor (to take into account the -6dB/octave rolloff of
    # the radiation at the lips level)
    prefac = 0.97

    logging.debug('pre emphasis')
    extract = pre_emphasis(input, prefac)

    # Compute the overlap of frames and cut the signal in frames of length
    # nwin overlapping by "overlap" samples
    logging.debug('axis')
    w = hamming(nwin, sym=0)
    overlap = nwin - int(shift * fs)
    framed = segment_axis(extract, nwin, overlap)

    n_frames = framed.shape[0]
    # rfft returns nfft // 2 + 1 bins; the shape has to be an integer.
    spec = np.ones((n_frames, int(nfft // 2) + 1))
    log_energy = np.ones(n_frames)

    # Frames are processed by chunks so that the windowed copy of the
    # signal stays small.
    chunk = 10000
    for start in range(0, n_frames, chunk):
        stop = min(start + chunk, n_frames)
        # Compute the spectrum magnitude
        tmp = framed[start:stop, :] * w
        spec[start:stop, :] = np.abs(np.fft.rfft(tmp, nfft, axis=-1))
        # Compute the log-energy of each frame
        log_energy[start:stop] = 2.0 * np.log(
            np.sqrt(np.sum(np.square(tmp), axis=1)))
    del framed
    del extract

    logging.debug('log10')
    # Filter the spectrum through the triangle filterbank. midfreq is
    # forwarded so that the caller's choice is actually honoured.
    logging.debug('trf bank')
    fbank = trfbank(fs, nfft, lowfreq, maxfreq, nlinfilt, nlogfilt,
                    midfreq)[0]
    mspec = np.log10(np.dot(spec, fbank.T))
    del fbank

    logging.debug('dct')
    # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum
    # domain). The C0 term is removed as it is the constant term.
    ceps = dct(mspec, type=2, norm='ortho', axis=-1)[:, 1:nceps + 1]

    return [ceps,
            log_energy,
            spec if get_spec else None,
            mspec if get_mspec else None]