1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 from timeside.core import implements, interfacedoc
23 from timeside.analyzer.core import Analyzer
24 from timeside.analyzer.utils import melFilterBank, computeModulation
25 from timeside.analyzer.utils import segmentFromValues
26 from timeside.api import IAnalyzer
27 from numpy import array, hamming, dot, mean, float
28 from numpy.fft import rfft
29 from scipy.signal import firwin, lfilter
33 implements(IAnalyzer)
34 '''
35 Segmentor based on the analysis of the 4Hz energy modulation.
36
37 Properties:
38 - energy4hz (list) : List of the 4Hz energy by frame for the modulation computation
39 - threshold (float) : Threshold for the classification Speech/NonSpeech
40 - frequency_center (float) : Center of the frequency range where the energy is extracted
41 - frequency_width (float) : Width of the frequency range where the energy is extracted
- orderFilter (int) : Order of the band-pass filter extracting the frequency range
43 - normalizeEnergy (boolean) : Whether the energy must be normalized or not
44 - nFFT (int) : Number of points for the FFT. Better if 512 <= nFFT <= 2048
45 - nbFilters (int) : Length of the Mel Filter bank
46 - melFilter (numpy array) : Mel Filter bank
- modulLen (float) : Length (in seconds) of the modulation computation window
48 '''
49
@interfacedoc
def setup(self, channels=None, samplerate=None, blocksize=None,
          totalframes=None):
    # Kept without a docstring, exactly like the original definition
    # (presumably @interfacedoc supplies the interface documentation --
    # confirm before adding one).
    super(IRITSpeech4Hz, self).setup(
        channels, samplerate, blocksize, totalframes)

    # Accumulator for the per-block energy vectors appended by process().
    self.energy4hz = []

    # Decision settings read by post_process(): speech / non-speech
    # threshold, optional normalisation of the energy by its mean, and the
    # length of the modulation window.
    self.threshold = 2.0
    self.normalizeEnergy = True
    self.modulLen = 2.0

    # Design parameters of the FIR filter built in post_process().
    self.frequency_center = 4.0
    self.frequency_width = 0.5
    self.orderFilter = 100

    # Spectral analysis settings used by process(); the filter bank is
    # built last because it depends on the two values set just above and
    # on the `samplerate` argument.
    self.nFFT = 2048
    self.nbFilters = 30
    self.melFilter = melFilterBank(self.nbFilters, self.nFFT, samplerate)
68
69 @staticmethod
70 @interfacedoc
72 return "irit_speech_4hz"
73
74 @staticmethod
75 @interfacedoc
77 return "IRIT Speech 4Hz Modulation"
78
79 @staticmethod
80 @interfacedoc
83
85 return "Speech confidences indexes"
86
def process(self, frames, eod=False):
    '''
    Accumulate the filter-bank energies of one block of audio.

    ``frames.T[0]`` is multiplied by a Hamming window of the same length,
    transformed with a real FFT, and the squared magnitude spectrum is
    combined with ``self.melFilter`` through a dot product. The resulting
    energy vector is appended to ``self.energy4hz``.

    Parameters:
    - frames : block of samples, indexed as ``frames.T[0]``
      (assumes a 2-D (samples, channels) array, so that only the first
      channel is analysed -- TODO confirm against the caller)
    - eod (boolean) : end-of-data flag, passed through untouched

    Returns:
    - (frames, eod) : the block exactly as it was received, and the flag.
      The analysed channel is kept under a separate local name so that
      the caller gets back the block it passed in rather than the
      channel-0 slice.
    '''
    # Work on a separate name: rebinding `frames` here would make the
    # return statement hand back only this slice.
    mono = frames.T[0]

    # Window the block to limit spectral leakage.
    windowed = mono * hamming(len(mono))

    # rfft with n = 2 * nFFT yields nFFT + 1 bins; the slice keeps the
    # first nFFT of them so the spectrum length matches self.nFFT.
    spectrum = abs(rfft(windowed, n=2 * self.nFFT)[0:self.nFFT])

    # Power spectrum combined with the filter bank.
    self.energy4hz.append(dot(spectrum ** 2, self.melFilter))

    return frames, eod
103
def post_process(self):
    '''
    Turn the energies accumulated by process() into two results.

    Steps, in order:
    1. design an FIR filter with firwin from frequency_center,
       frequency_width and orderFilter, apply it with lfilter to the
       transposed energy matrix and add the filtered rows together;
    2. if normalizeEnergy is set, divide that energy by its mean;
    3. call computeModulation on it with a window of modulLen seconds
       converted to a number of blocks;
    4. add a framewise 'energy_confidence' result (relative distance of
       the modulation to the threshold, capped at 1) to self._results;
    5. add a 'segments' result labelled 'Speech' / 'nonSpeech', built from
       segmentFromValues(modulation > threshold), to self._results.

    As a side effect self.energy4hz is replaced by its array form.
    Returns None.
    '''

    def blocks_to_seconds(count):
        # Block count -> seconds, using the analyzer's own block size and
        # sample rate accessors.
        return float(count) * self.blocksize() / self.samplerate()

    # Filter design: centre and half width of the band, both divided by
    # the value of self.samplerate().
    # NOTE(review): firwin reads cutoffs relative to the Nyquist rate of
    # the sequence being filtered, and energy4hz holds one vector per
    # processed block -- confirm this normalisation is the intended one.
    centre = self.frequency_center / self.samplerate()
    half_band = (self.frequency_width / 2) / self.samplerate()
    taps = firwin(self.orderFilter,
                  [centre - half_band, centre + half_band],
                  pass_zero=False)

    # Stack the per-block vectors, filter, then collapse the first axis
    # with the builtin sum (row-by-row addition).
    # NOTE(review): the trailing 0 is lfilter's `axis` argument, i.e. the
    # filter runs along axis 0 of the transposed matrix -- confirm that
    # this is the axis meant to be filtered.
    self.energy4hz = array(self.energy4hz)
    filtered = lfilter(taps, 1, self.energy4hz.T, 0)
    energy = sum(filtered)

    if self.normalizeEnergy:
        energy = energy / mean(energy)

    # Modulation over windows of modulLen seconds, expressed in blocks.
    blocks_per_window = int(
        self.modulLen * self.samplerate() / self.blocksize())
    modulation = computeModulation(energy, blocks_per_window, True)

    # Confidence: relative distance to the threshold, capped at 1.
    confidence = array(modulation - self.threshold) / self.threshold
    confidence[confidence > 1] = 1

    conf_result = self.new_result(data_mode='value', time_mode='framewise')
    conf_result.id_metadata.id += '.' + 'energy_confidence'
    conf_result.id_metadata.name += ' ' + 'Energy Confidence'
    conf_result.data_object.value = confidence
    self._results.add(conf_result)

    # Segmentation of the thresholded modulation. Each segment is indexed
    # as seg[0], seg[1], seg[2]: the first two feed the time and duration
    # conversion, the third is the boolean mapped to a label index
    # (presumably start, stop and speech flag -- confirm against
    # segmentFromValues).
    segments = segmentFromValues(modulation > self.threshold)
    flag_to_index = {False: 0, True: 1}

    seg_result = self.new_result(data_mode='label', time_mode='segment')
    seg_result.id_metadata.id += '.' + 'segments'
    seg_result.id_metadata.name += ' ' + 'Segments'
    seg_result.label_metadata.label = {0: 'nonSpeech', 1: 'Speech'}

    seg_result.data_object.label = [flag_to_index[seg[2]]
                                    for seg in segments]
    seg_result.data_object.time = [blocks_to_seconds(seg[0])
                                   for seg in segments]
    seg_result.data_object.duration = [blocks_to_seconds(seg[1] - seg[0])
                                       for seg in segments]

    self._results.add(seg_result)
163