# -*- coding: utf-8 -*-
#
# This file is part of SIDEKIT.
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is free software: you can redistribute it and/or modify
# it under the terms of the GNU LLesser General Public License as
# published by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#
# SIDEKIT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
"""
Copyright 2014-2015 Anthony Larcher and Sylvain Meignier
:mod:`frontend` provides methods to process an audio signal in order to extract
useful parameters for speaker verification.
"""
__author__ = "Anthony Larcher and Sylvain Meignier"
__copyright__ = "Copyright 2014-2015 Anthony Larcher and Sylvain Meignier"
__license__ = "LGPL"
__version__ = "1.0"
__maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'
import numpy as np
import scipy.stats as stats
from scipy.signal import lfilter
[docs]def rasta_filt(x):
"""Apply RASTA filtering to the input signal.
:param x: the input audio signal to filter.
cols of x = critical bands, rows of x = frame
same for y but after filtering
default filter is single pole at 0.94
"""
x = x.T
numer = np.arange(.2,-.3,-.1)
denom = np.array([1,-0.98])
# Initialize the state. This avoids a big spike at the beginning
# resulting from the dc offset level in each band.
# (this is effectively what rasta/rasta_filt.c does).
# Because Matlab uses a DF2Trans implementation, we have to
# specify the FIR part to get the state right (but not the IIR part)
y = np.zeros(x.shape)
zf = np.zeros((x.shape[0], 4))
for i in range(y.shape[0]):
#y[i, :4], zf[i, :4] = lfilter(numer, 1, x[i, :4], axis=-1, zi=[0, 0, 0, 0])
y[i, :4], zf[i, :4] = lfilter(numer, 1, x[i, :4], axis=-1, zi=[0, 0, 0, 0])
# .. but don't keep any of these values, just output zero at the beginning
y = np.zeros(x.shape)
# Apply the full filter to the rest of the signal, append it
for i in range(y.shape[0]):
y[i, 4:] = lfilter(numer, denom, x[i, 4:], axis=-1, zi=zf[i, :])[0]
return y.T
[docs]def cms(features, label=[]):
"""Performs cepstral mean subtraction
:param features: a feature stream of dimension dim x nframes
where dim is the dimension of the acoustic features and nframes the
number of frames in the stream
:param label: a logical verctor
:return: a feature stream
"""
# If no label file as input: all speech are speech
if label == []:
label = np.ones(features.shape[0]).astype(bool)
if all(label == False):
normFeatures = features
else:
speechFeatures = features[label, :]
mu = speechFeatures.mean(0)
normFeatures = features - mu
return normFeatures
[docs]def cmvn(features, label=[]):
"""Performs mean and variance normalization
:param features: a feature stream of dimension dim x nframes
where dim is the dimension of the acoustic features and nframes the
number of frames in the stream
:param label: a logical verctor
:return: a sequence of features
"""
# If no label file as input: all speech are speech
if label == []:
label = np.ones(features.shape[0]).astype(bool)
if all(label == False):
normFeatures = features
else:
speechFeatures = features[label, :]
mu = speechFeatures.mean(0)
stdev = np.std(speechFeatures, axis=0)
normFeatures = features - mu
normFeatures = normFeatures / stdev
return normFeatures
[docs]def stg(features, label=[], win=301):
"""Performs feature warping on a sliding window
:param features: a feature stream of dimension dim x nframes
where dim is the dimension of the acoustic features and nframes the
number of frames in the stream
:return: a sequence of features
"""
# If no label file as input: all speech are speech
if label == []:
label = np.ones(features.shape[0]).astype(bool)
speechFeatures = features[label, :]
add_a_feature = False
if win % 2 == 1:
# one feature per line
nframes, dim = np.shape(speechFeatures)
# If the number of frames is not enough for one window
if nframes < win:
# if the number of frames is not odd, duplicate the last frame
#if nframes % 2 == 1:
if not nframes % 2 == 1:
nframes += 1
add_a_feature = True
speechFeatures = np.concatenate((speechFeatures, [speechFeatures[-1, ]]))
win = nframes
# create the output feature stream
stgFeatures = np.zeros(np.shape(speechFeatures))
# Process first window
R = np.argsort(speechFeatures[:win, ], axis=0)
R = np.argsort(R, axis=0)
arg = (R[: (win - 1) / 2] + 0.5) / win
stgFeatures[: (win - 1) / 2, :] = stats.norm.ppf(arg, 0, 1)
# process all follwing windows except the last one
for m in range(int((win - 1) / 2), int(nframes - (win - 1) / 2)):
idx = list(range(int(m - (win - 1) / 2),
int(m + (win - 1) / 2 + 1)))
foo = speechFeatures[idx, :]
R = np.sum(foo < foo[(win - 1) / 2], axis=0) + 1
arg = (R - 0.5) / win
stgFeatures[m, :] = stats.norm.ppf(arg, 0, 1)
# Process the last window
R = np.argsort(speechFeatures[list(range(nframes - win, nframes)), ], axis=0)
R = np.argsort(R, axis=0)
arg = (R[(win + 1) / 2: win, :] + 0.5) / win
stgFeatures[list(range(int(nframes - (win - 1) / 2), nframes)), ] \
= stats.norm.ppf(arg, 0, 1)
else:
# Raise an exception
raise Exception('Sliding window should have an odd length')
wrapFeatures = np.copy(features)
if add_a_feature:
stgFeatures = stgFeatures[:-1]
wrapFeatures[label, :] = stgFeatures
return wrapFeatures
[docs]def normalize_feature_stream(features, label=[], mode='cmvn', win='301',
normVar=False, keepAllFeatures=False):
"""Normalize features from a feature stream by using either
'cms', 'cmvn' or 'stg'
:param features: a feature stream to normalize
:param label: a logical vector True if the frame should be processed.
By default, all frames are considered True.
:param mode: normalization to apply: 'cms', 'cmvn' or 'stg'.
Default is 'cmvn'.
:param win: for 'stg' mode only, size of the sliding window.
Default is 301.
:param normVar: for 'cmvn' mode only, if True normalize the variance.
Default is False.
:param keepAllFeatures: boolean, if True, keep also non-processed features
:return: a sequence of features
"""
# if no label, use all features
if label == []:
label = np.ones(features.shape[0], dtype='bool')
if mode == 'cmvn':
features = cmvn(features, label=label)
elif mode == 'cms':
features = cms(features, label=label)
elif mode == 'stg':
speechFeatures = features[label, :]
speechFeatures = stg(speechFeatures, win)
features[label, :] = speechFeatures
if keepAllFeatures:
normfeatures = features
else:
normfeatures = features[label, :]
return normfeatures