Package timeside :: Package analyzer :: Module irit_speech_4hz
[hide private]
[frames | no frames]

Source Code for Module timeside.analyzer.irit_speech_4hz

  1  # -*- coding: utf-8 -*- 
  2  # 
  3  # Copyright (c) 2013 Maxime Le Coz <lecoz@irit.fr> 
  4   
  5  # This file is part of TimeSide. 
  6   
  7  # TimeSide is free software: you can redistribute it and/or modify 
  8  # it under the terms of the GNU General Public License as published by 
  9  # the Free Software Foundation, either version 2 of the License, or 
 10  # (at your option) any later version. 
 11   
 12  # TimeSide is distributed in the hope that it will be useful, 
 13  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 14  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 15  # GNU General Public License for more details. 
 16   
 17  # You should have received a copy of the GNU General Public License 
 18  # along with TimeSide.  If not, see <http://www.gnu.org/licenses/>. 
 19   
 20  # Author: Maxime Le Coz <lecoz@irit.fr> 
 21   
 22  from timeside.core import implements, interfacedoc 
 23  from timeside.analyzer.core import Analyzer 
 24  from timeside.analyzer.utils import melFilterBank, computeModulation 
 25  from timeside.analyzer.utils import segmentFromValues 
 26  from timeside.api import IAnalyzer 
 27  from numpy import array, hamming, dot, mean, float 
 28  from numpy.fft import rfft 
 29  from scipy.signal import firwin, lfilter 
class IRITSpeech4Hz(Analyzer):
    '''
    Segmentor based on the analysis of the 4Hz energy modulation.

    Properties:
        - energy4hz (list) : List of the 4Hz energy by frame for the
          modulation computation
        - threshold (float) : Threshold for the classification
          Speech/NonSpeech
        - frequency_center (float) : Center of the frequency range where
          the energy is extracted
        - frequency_width (float) : Width of the frequency range where the
          energy is extracted
        - orderFilter (int) : Order of the band-pass filter extracting the
          frequency range
        - normalizeEnergy (boolean) : Whether the energy must be normalized
          or not
        - nFFT (int) : Number of points for the FFT.
          Better if 512 <= nFFT <= 2048
        - nbFilters (int) : Length of the Mel Filter bank
        - melFilter (numpy array) : Mel Filter bank
        - modulLen (float) : Length (in seconds) of the modulation
          computation window
    '''
    # The docstring above must stay the first statement of the class body,
    # otherwise it is not bound to IRITSpeech4Hz.__doc__.
    implements(IAnalyzer)

    # NOTE: the @interfacedoc-decorated methods below are documented with
    # comments rather than docstrings, on the assumption that the decorator
    # supplies their documentation -- TODO confirm against timeside.core.

    @interfacedoc
    def setup(self, channels=None, samplerate=None, blocksize=None,
              totalframes=None):
        # Forward the stream description to the base class, then reset the
        # accumulator and (re)initialise every tunable property.
        super(IRITSpeech4Hz, self).setup(
            channels, samplerate, blocksize, totalframes)
        self.energy4hz = []

        # Classification
        self.threshold = 2.0

        # Band-pass filter
        self.frequency_center = 4.0
        self.frequency_width = 0.5
        self.orderFilter = 100

        self.normalizeEnergy = True
        self.nFFT = 2048
        self.nbFilters = 30
        self.modulLen = 2.0
        self.melFilter = melFilterBank(self.nbFilters, self.nFFT, samplerate)

    @staticmethod
    @interfacedoc
    def id():
        return "irit_speech_4hz"

    @staticmethod
    @interfacedoc
    def name():
        return "IRIT Speech 4Hz Modulation"

    @staticmethod
    @interfacedoc
    def unit():
        return ""

    def __str__(self):
        return "Speech confidences indexes"

    def process(self, frames, eod=False):
        '''
        Compute the Mel-band energies of one block and store them.

        The block is reduced to ``frames.T[0]`` (presumably the first
        channel -- TODO confirm the layout of ``frames``), weighted by a
        Hamming window, and its magnitude spectrum is projected on
        ``self.melFilter``.  The resulting energy vector is appended to
        ``self.energy4hz``.

        Returns the tuple ``(frames, eod)``.
        '''
        # NOTE(review): `frames` is rebound here, so the value returned
        # below is the extracted column, not the block that was passed in.
        # Confirm that downstream processors expect this.
        frames = frames.T[0]

        # Windowing of the frame (could be a changeable property)
        w = frames * hamming(len(frames))

        # Mel scale spectrum extraction: FFT on 2 * nFFT points, of which
        # only the first nFFT bins are kept.
        f = abs(rfft(w, n=2 * self.nFFT)[0:self.nFFT])
        e = dot(f ** 2, self.melFilter)

        self.energy4hz.append(e)

        return frames, eod

    def post_process(self):
        '''
        Turn the accumulated energies into the analyzer results.

        Two results are added to ``self._results``:
            - ``<id>.energy_confidence`` : framewise confidence index
              derived from the energy modulation and ``self.threshold``
            - ``<id>.segments`` : Speech / nonSpeech segments obtained by
              thresholding the energy modulation
        '''
        samplerate = self.samplerate()
        blocksize = self.blocksize()

        # Creation of the band-pass filter centred on frequency_center.
        # NOTE(review): the band edges are normalised by the audio sample
        # rate, while the filtered sequence holds one value per block --
        # confirm this normalisation against the firwin cutoff convention.
        Wo = self.frequency_center / samplerate
        Wn = [Wo - (self.frequency_width / 2) / samplerate,
              Wo + (self.frequency_width / 2) / samplerate]
        num = firwin(self.orderFilter, Wn, pass_zero=False)

        # Energy on the frequency range
        self.energy4hz = array(self.energy4hz)
        # NOTE(review): the last positional argument of lfilter is `axis`,
        # so the transposed array is filtered along axis 0 -- confirm this
        # is the intended (time) axis.
        energy = lfilter(num, 1, self.energy4hz.T, 0)
        # Builtin sum: adds up the rows of the filtered array.
        energy = sum(energy)

        # Normalization
        if self.normalizeEnergy:
            energy = energy / mean(energy)

        # Energy Modulation, over windows of modulLen seconds expressed in
        # number of blocks.
        frameLenModulation = int(self.modulLen * samplerate / blocksize)
        modEnergyValue = computeModulation(energy, frameLenModulation, True)

        # Confidence Index: relative distance to the threshold, capped at 1
        conf = array(modEnergyValue - self.threshold) / self.threshold
        conf[conf > 1] = 1

        modEnergy = self.new_result(data_mode='value', time_mode='framewise')
        modEnergy.id_metadata.id += '.' + 'energy_confidence'
        modEnergy.id_metadata.name += ' ' + 'Energy Confidence'

        modEnergy.data_object.value = conf

        self._results.add(modEnergy)

        # Segment
        convert = {False: 0, True: 1}
        label = {0: 'nonSpeech', 1: 'Speech'}

        segList = segmentFromValues(modEnergyValue > self.threshold)

        segs = self.new_result(data_mode='label', time_mode='segment')
        segs.id_metadata.id += '.' + 'segments'
        segs.id_metadata.name += ' ' + 'Segments'

        segs.label_metadata.label = label

        # Each segment s is indexed as (start, stop, value), with start and
        # stop counted in blocks; they are converted to seconds below.
        # Keep the "* blocksize / samplerate" operand order so that the
        # float conversion happens before the division.
        segs.data_object.label = [convert[s[2]] for s in segList]
        segs.data_object.time = [float(s[0]) * blocksize / samplerate
                                 for s in segList]
        segs.data_object.duration = [
            float(s[1] - s[0]) * blocksize / samplerate
            for s in segList]

        self._results.add(segs)
163