-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcomp_wss.m
executable file
·303 lines (249 loc) · 11.4 KB
/
comp_wss.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
function wss_dist = comp_wss(cleanFile, enhancedFile, fs)
% ----------------------------------------------------------------------
%
%      Weighted Spectral Slope (WSS) Objective Speech Quality Measure
%
%      This function implements the Weighted Spectral Slope (WSS)
%      distance measure originally proposed in [1].  The algorithm
%      works by first decomposing the speech signal into a set of
%      frequency bands (this is done for both the test and reference
%      frame).  The intensities within each critical band are
%      measured.  Then, a weighted distance between the measured
%      slopes of the log-critical band spectra is computed.
%      This measure is also described in Section 2.2.9 (pages 56-58)
%      of [2].
%
%      Whereas Klatt's original measure used 36 critical-band
%      filters to estimate the smoothed short-time spectrum, this
%      implementation considers a bank of 25 filters spanning
%      the 4 kHz bandwidth.
%
%   Usage:  wss_dist = comp_wss(cleanFile, enhancedFile, fs)
%
%         cleanFile     - clean (reference) speech samples, a vector.
%                         Despite the parameter name this is the signal
%                         itself, NOT a file name: the wavread calls
%                         below are commented out.
%         enhancedFile  - enhanced (processed) speech samples, a vector
%         fs            - sampling rate in Hz shared by both signals
%         wss_dist      - computed spectral slope distance (scalar).
%                         NaN when no analysis frame could be formed
%                         (mean of an empty set).
%
%      The two signals are truncated to the shorter of the two lengths.
%      The per-frame distances are sorted in ascending order and only
%      the lowest alpha (95%) of them are averaged, i.e. the frames
%      with the largest distances are discarded.
%
%   Example call:
%         [x, fs] = audioread('sp04.wav');
%         y       = audioread('enhanced.wav');
%         ws      = comp_wss(x, y, fs)
%
%  References:
%
%     [1] D. H. Klatt, "Prediction of Perceived Phonetic Distance
%         from Critical-Band Spectra: A First Step", Proc. IEEE
%         ICASSP'82, Volume 2, pp. 1278-1281, May, 1982.
%
%     [2] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements,
%         Objective Measures of Speech Quality.  Prentice Hall
%         Advanced Reference Series, Englewood Cliffs, NJ, 1988,
%         ISBN: 0-13-629056-6.
%
%  Authors: Bryan L. Pellom and John H. L. Hansen (July 1998)
%  Modified by: Philipos C. Loizou  (Oct 2006)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
%
% ----------------------------------------------------------------------
if nargin~=3
    % The usage text now matches the real three-argument signature.
    fprintf('USAGE: WSS=comp_wss(cleanSignal, enhancedSignal, fs)\n');
    fprintf('For more help, type: help comp_wss\n\n');
    % Give the output a value only when the caller asked for one, so that
    % "x = comp_wss(...)" does not fail with an unassigned-output error
    % while a bare interactive call still prints nothing but the usage.
    if nargout > 0
        wss_dist = NaN;
    end
    return;
end

alpha = 0.95;   % fraction of (lowest-distance) frames kept in the average

% File reading is intentionally disabled; the samples are passed in directly.
% [data1, Srate1, Nbits1]= wavread(cleanFile);
% [data2, Srate2, Nbits2]= wavread(enhancedFile);
% if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
%     error( 'The two files do not match!\n');
% end

% Force column orientation: wss() multiplies each frame element-wise by a
% column window, which only works as intended for column signals.
% NOTE(review): inputs are assumed to be single-channel vectors - confirm
% against callers.
data1  = cleanFile(:);
data2  = enhancedFile(:);
Srate1 = fs;

% Compare only the overlapping part; eps is added to every sample as in
% the original implementation.
len   = min( length( data1), length( data2));
data1 = data1( 1: len)+eps;
data2 = data2( 1: len)+eps;

wss_dist_vec = wss( data1, data2,Srate1);
wss_dist_vec = sort( wss_dist_vec);
wss_dist     = mean( wss_dist_vec( 1: round( length( wss_dist_vec)*alpha)));
function distortion = wss(clean_speech, processed_speech,sample_rate)
% WSS  Frame-by-frame Weighted Spectral Slope distance.
%
%   distortion = wss(clean_speech, processed_speech, sample_rate)
%
%     clean_speech     - reference signal samples (vector)
%     processed_speech - signal under test (vector with the same number
%                        of samples as clean_speech)
%     sample_rate      - sampling rate in Hz
%     distortion       - row vector holding one WSS value per analysis
%                        frame; empty (1-by-0) when the signal is too
%                        short to yield a frame
%
%   The signals are analysed with a 30 ms window advanced by a quarter
%   of the window length.  An error is raised when the two inputs differ
%   in length.

% Force column orientation so that "frame .* window" below is a plain
% element-wise product regardless of how the caller stored the samples.
clean_speech     = clean_speech(:);
processed_speech = processed_speech(:);

% ----------------------------------------------------------------------
% Check the length of the clean and processed speech.  Must be the same.
% ----------------------------------------------------------------------
clean_length     = length(clean_speech);
processed_length = length(processed_speech);
if (clean_length ~= processed_length)
    % Previously this printed a message and returned without assigning
    % the output, which surfaced as an unrelated "output not assigned"
    % error in the caller.
    error('comp_wss:lengthMismatch', 'Error: Files must have same length.');
end

% ----------------------------------------------------------------------
% Global Variables
% ----------------------------------------------------------------------
winlength = round(30*sample_rate/1000);   % window length in samples
skiprate  = floor(winlength/4);           % window skip in samples
max_freq  = sample_rate/2;                % maximum bandwidth
num_crit  = 25;                           % number of critical bands
USE_FFT_SPECTRUM = 1;                     % 1: FFT power spectrum (default)
                                          % 0: 10th order LP spectrum
n_fft     = 2^nextpow2(2*winlength);
n_fftby2  = n_fft/2;                      % FFT size/2
Kmax      = 20;                           % value suggested by Klatt, pg 1280
Klocmax   = 1;                            % value suggested by Klatt, pg 1280

% ----------------------------------------------------------------------
% Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz)
% ----------------------------------------------------------------------
cent_freq = [  50.0000  120.000  190.000  260.000  330.000 ...
              400.000   470.000  540.000  617.372  703.378 ...
              798.717   904.128  1020.38  1148.30  1288.72 ...
              1442.54   1610.70  1794.16  1993.93  2211.08 ...
              2446.71   2701.97  2978.04  3276.17  3597.63 ];
bandwidth = [  70.0000  70.0000  70.0000  70.0000  70.0000 ...
               70.0000  70.0000  77.3724  86.0056  95.3398 ...
              105.411   116.256  127.914  140.423  153.823 ...
              168.154   183.457  199.776  217.153  235.631 ...
              255.255   276.072  298.126  321.465  346.136 ];
bw_min = bandwidth (1);                   % minimum critical bandwidth

% ----------------------------------------------------------------------
% Set up the critical band filters.  Note here that Gaussianly shaped
% filters are used.  Also, the sum of the filter weights are equivalent
% for each critical band filter.  Filter less than -30 dB and set to
% zero.
% ----------------------------------------------------------------------
min_factor  = exp (-30.0 / (2.0 * 2.303));      % -30 dB point of filter
crit_filter = zeros(num_crit, n_fftby2);        % one row per critical band
j = 0:1:n_fftby2-1;                             % FFT bin indices (loop invariant)
for i = 1:num_crit
    f0 = (cent_freq (i) / max_freq) * (n_fftby2);   % centre, in FFT bins
    bw = (bandwidth (i) / max_freq) * (n_fftby2);   % bandwidth, in FFT bins
    norm_factor = log(bw_min) - log(bandwidth(i));
    crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor);
    % zero out everything below the -30 dB threshold
    crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor);
end

% ----------------------------------------------------------------------
% For each frame of input speech, calculate the Weighted Spectral
% Slope Measure
% ----------------------------------------------------------------------
% The frame count is computed exactly as in the original code (it may be
% fractional; the colon operator then stops at the last whole frame) so
% that results stay identical to earlier runs.
num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames
frame_idx  = 1:num_frames;
start      = 1;                                          % starting sample
window     = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));

% Preallocate so the output is always defined, even when the signal is
% too short for a single frame (frame_idx empty -> 1-by-0 result).
distortion         = zeros(1, numel(frame_idx));
clean_loc_peak     = zeros(1, num_crit-1);
processed_loc_peak = zeros(1, num_crit-1);

for frame_count = frame_idx

    % ----------------------------------------------------------
    % (1) Get the Frames for the test and reference speech.
    %     Multiply by Hanning Window.
    % ----------------------------------------------------------
    clean_frame     = clean_speech(start:start+winlength-1);
    processed_frame = processed_speech(start:start+winlength-1);
    clean_frame     = clean_frame.*window;
    processed_frame = processed_frame.*window;

    % ----------------------------------------------------------
    % (2) Compute the Power Spectrum of Clean and Processed
    % ----------------------------------------------------------
    if (USE_FFT_SPECTRUM)
        clean_spec     = (abs(fft(clean_frame,n_fft)).^2);
        processed_spec = (abs(fft(processed_frame,n_fft)).^2);
    else
        % LP spectrum = 1 / |A(w)|^2.  The reciprocal must be taken
        % element-wise ("./"); plain "/" is matrix right division.
        a_vec = zeros(1,n_fft);
        a_vec(1:11) = lpc(clean_frame,10);
        clean_spec = 1.0./(abs(fft(a_vec,n_fft)).^2)';

        a_vec = zeros(1,n_fft);
        a_vec(1:11) = lpc(processed_frame,10);
        processed_spec = 1.0./(abs(fft(a_vec,n_fft)).^2)';
    end

    % ----------------------------------------------------------
    % (3) Compute Filterbank Output Energies (in dB scale)
    % ----------------------------------------------------------
    clean_energy     = zeros(1, num_crit);
    processed_energy = zeros(1, num_crit);
    for i = 1:num_crit
        clean_energy(i)     = sum(clean_spec(1:n_fftby2) ...
                                  .*crit_filter(i,:)');
        processed_energy(i) = sum(processed_spec(1:n_fftby2) ...
                                  .*crit_filter(i,:)');
    end
    % floor at 1E-10 before the log to avoid log10(0)
    clean_energy     = 10*log10(max(clean_energy,1E-10));
    processed_energy = 10*log10(max(processed_energy,1E-10));

    % ----------------------------------------------------------
    % (4) Compute Spectral Slope (dB[i+1]-dB[i])
    % ----------------------------------------------------------
    clean_slope     = clean_energy(2:num_crit) - ...
                      clean_energy(1:num_crit-1);
    processed_slope = processed_energy(2:num_crit) - ...
                      processed_energy(1:num_crit-1);

    % ----------------------------------------------------------
    % (5) Find the nearest peak locations in the spectra to
    %     each critical band.  If the slope is negative, we
    %     search to the left.  If positive, we search to the
    %     right.
    % ----------------------------------------------------------
    % NOTE(review): the right-hand search stores energy(n-1) although
    % the loop stops at the first non-rising slope n, which suggests
    % the peak sits at energy(n).  Left unchanged so that scores stay
    % comparable with the reference implementation - confirm before
    % altering.
    for i = 1:num_crit-1

        % find the peaks in the clean speech signal
        if (clean_slope(i)>0)           % search to the right
            n = i;
            while ((n<num_crit) && (clean_slope(n) > 0))
                n = n+1;
            end
            clean_loc_peak(i) = clean_energy(n-1);
        else                            % search to the left
            n = i;
            while ((n>0) && (clean_slope(n) <= 0))
                n = n-1;
            end
            clean_loc_peak(i) = clean_energy(n+1);
        end

        % find the peaks in the processed speech signal
        if (processed_slope(i)>0)       % search to the right
            n = i;
            while ((n<num_crit) && (processed_slope(n) > 0))
                n = n+1;
            end
            processed_loc_peak(i) = processed_energy(n-1);
        else                            % search to the left
            n = i;
            while ((n>0) && (processed_slope(n) <= 0))
                n = n-1;
            end
            processed_loc_peak(i) = processed_energy(n+1);
        end

    end

    % ----------------------------------------------------------
    % (6) Compute the WSS Measure for this frame.  This
    %     includes determination of the weighting function.
    % ----------------------------------------------------------
    dBMax_clean     = max(clean_energy);
    dBMax_processed = max(processed_energy);

    % The weights are calculated by averaging individual
    % weighting factors from the clean and processed frame.
    % These weights W_clean and W_processed should range
    % from 0 to 1 and place more emphasis on spectral
    % peaks and less emphasis on slope differences in spectral
    % valleys.  This procedure is described on page 1280 of
    % Klatt's 1982 ICASSP paper.
    Wmax_clean        = Kmax ./ (Kmax + dBMax_clean - ...
                                 clean_energy(1:num_crit-1));
    Wlocmax_clean     = Klocmax ./ ( Klocmax + clean_loc_peak - ...
                                     clean_energy(1:num_crit-1));
    W_clean           = Wmax_clean .* Wlocmax_clean;

    Wmax_processed    = Kmax ./ (Kmax + dBMax_processed - ...
                                 processed_energy(1:num_crit-1));
    Wlocmax_processed = Klocmax ./ ( Klocmax + processed_loc_peak - ...
                                     processed_energy(1:num_crit-1));
    W_processed       = Wmax_processed .* Wlocmax_processed;

    W = (W_clean + W_processed)./2.0;

    distortion(frame_count) = sum(W.*(clean_slope(1:num_crit-1) - ...
                                      processed_slope(1:num_crit-1)).^2);

    % this normalization is not part of Klatt's paper, but helps
    % to normalize the measure.  Here we scale the measure by the
    % sum of the weights.
    distortion(frame_count) = distortion(frame_count)/sum(W);

    start = start + skiprate;

end