| /* |
| * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. |
| * |
| * Use of this source code is governed by a BSD-style license |
| * that can be found in the LICENSE file in the root of the source |
| * tree. An additional intellectual property rights grant can be found |
| * in the file PATENTS. All contributing project authors may |
| * be found in the AUTHORS file in the root of the source tree. |
| */ |
| |
| #include "vad_core.h" |
| |
| #include "signal_processing_library.h" |
| #include "typedefs.h" |
| #include "vad_defines.h" |
| #include "vad_filterbank.h" |
| #include "vad_gmm.h" |
| #include "vad_sp.h" |
| |
| // Spectrum Weighting |
| static const WebRtc_Word16 kSpectrumWeight[6] = { 6, 8, 10, 12, 14, 16 }; |
| static const WebRtc_Word16 kNoiseUpdateConst = 655; // Q15 |
| static const WebRtc_Word16 kSpeechUpdateConst = 6554; // Q15 |
| static const WebRtc_Word16 kBackEta = 154; // Q8 |
| // Minimum difference between the two models, Q5 |
| static const WebRtc_Word16 kMinimumDifference[6] = { |
| 544, 544, 576, 576, 576, 576 }; |
| // Upper limit of mean value for speech model, Q7 |
| static const WebRtc_Word16 kMaximumSpeech[6] = { |
| 11392, 11392, 11520, 11520, 11520, 11520 }; |
| // Minimum value for mean value in the speech model (one per Gaussian), Q7 |
| static const WebRtc_Word16 kMinimumMean[2] = { 640, 768 }; |
| // Upper limit of mean value for noise model, Q7 |
| static const WebRtc_Word16 kMaximumNoise[6] = { |
| 9216, 9088, 8960, 8832, 8704, 8576 }; |
| // Start values for the Gaussian models, Q7 |
| // Weights for the two Gaussians for the six channels (noise) |
| static const WebRtc_Word16 kNoiseDataWeights[12] = { |
| 34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103 }; |
| // Weights for the two Gaussians for the six channels (speech) |
| static const WebRtc_Word16 kSpeechDataWeights[12] = { |
| 48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81 }; |
| // Means for the two Gaussians for the six channels (noise) |
| static const WebRtc_Word16 kNoiseDataMeans[12] = { |
| 6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362 }; |
| // Means for the two Gaussians for the six channels (speech) |
| static const WebRtc_Word16 kSpeechDataMeans[12] = { |
| 8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483 |
| }; |
| // Stds for the two Gaussians for the six channels (noise) |
| static const WebRtc_Word16 kNoiseDataStds[12] = { |
| 378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455 }; |
| // Stds for the two Gaussians for the six channels (speech) |
| static const WebRtc_Word16 kSpeechDataStds[12] = { |
| 555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 }; |
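| |
| // Reading these Q7 tables: e.g. kSpeechDataMeans[0] = 8306 corresponds to |
| // 8306 / 2^7 = 64.9 in feature units (the feature vector itself is in Q4, |
| // so the same value there would be ~1038). |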
| |
| // Constants used in GmmProbability(). |
| // |
| // Maximum number of counted speech (VAD = 1) frames in a row. |
| static const int16_t kMaxSpeechFrames = 6; |
| // Minimum standard deviation for both speech and noise. |
| static const int16_t kMinStd = 384; |
| |
| static const int kInitCheck = 42; |
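| |
| // Illustrative call sequence for the core functions (normally driven by the |
| // public API wrappers; buffer name and mode below are examples only): |
| // |
| //   VadInstT inst; |
| //   if (WebRtcVad_InitCore(&inst, 0) == 0) {  // mode 0 = quality |
| //     WebRtc_Word16 frame[80];                // 10 ms at 8 kHz |
| //     /* ... fill frame with audio samples ... */ |
| //     WebRtc_Word16 vad = WebRtcVad_CalcVad8khz(&inst, frame, 80); |
| //   } |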
| |
| // Calculates the probabilities for both speech and background noise using |
| // Gaussian Mixture Models. A hypothesis test is performed to decide which type |
| // of signal is most probable. |
| // |
| // - inst [i/o] : Pointer to VAD instance |
| // - feature_vector [i] : Feature vector = log10(energy in frequency band) |
| // - total_power [i] : Total power in audio frame. |
| // - frame_length [i] : Number of input samples |
| // |
| // - returns              : the VAD decision (0 - noise, 1 - speech; values > 1 |
| //                          indicate speech during hangover smoothing). |
| static int16_t GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector, |
| WebRtc_Word16 total_power, int frame_length) |
| { |
| int n, k; |
| WebRtc_Word16 backval; |
| WebRtc_Word16 h0, h1; |
| WebRtc_Word16 ratvec, xval; |
| WebRtc_Word16 vadflag; |
| WebRtc_Word16 shifts0, shifts1; |
| WebRtc_Word16 tmp16, tmp16_1, tmp16_2; |
| WebRtc_Word16 diff, nr, pos; |
| WebRtc_Word16 nmk, nmk2, nmk3, smk, smk2, nsk, ssk; |
| WebRtc_Word16 delt, ndelt; |
| WebRtc_Word16 maxspe, maxmu; |
| WebRtc_Word16 deltaN[NUM_TABLE_VALUES], deltaS[NUM_TABLE_VALUES]; |
| WebRtc_Word16 ngprvec[NUM_TABLE_VALUES], sgprvec[NUM_TABLE_VALUES]; |
| WebRtc_Word32 h0test, h1test; |
| WebRtc_Word32 tmp32_1, tmp32_2; |
| WebRtc_Word32 dotVal; |
| WebRtc_Word32 nmid, smid; |
| WebRtc_Word32 probn[NUM_MODELS], probs[NUM_MODELS]; |
| WebRtc_Word16 *nmean1ptr, *nmean2ptr, *smean1ptr, *smean2ptr, *nstd1ptr, *nstd2ptr, |
| *sstd1ptr, *sstd2ptr; |
| WebRtc_Word16 overhead1, overhead2, individualTest, totalTest; |
| |
| // Set the thresholds to different values based on frame length: 80, 160 or |
| // 240 samples correspond to 10, 20 or 30 ms at 8 kHz |
| if (frame_length == 80) |
| { |
| // 80 input samples |
| overhead1 = inst->over_hang_max_1[0]; |
| overhead2 = inst->over_hang_max_2[0]; |
| individualTest = inst->individual[0]; |
| totalTest = inst->total[0]; |
| } else if (frame_length == 160) |
| { |
| // 160 input samples |
| overhead1 = inst->over_hang_max_1[1]; |
| overhead2 = inst->over_hang_max_2[1]; |
| individualTest = inst->individual[1]; |
| totalTest = inst->total[1]; |
| } else |
| { |
| // 240 input samples |
| overhead1 = inst->over_hang_max_1[2]; |
| overhead2 = inst->over_hang_max_2[2]; |
| individualTest = inst->individual[2]; |
| totalTest = inst->total[2]; |
| } |
| |
| if (total_power > MIN_ENERGY) |
| { // If signal present at all |
| |
| // Set pointers to the gaussian parameters |
| nmean1ptr = &inst->noise_means[0]; |
| nmean2ptr = &inst->noise_means[NUM_CHANNELS]; |
| smean1ptr = &inst->speech_means[0]; |
| smean2ptr = &inst->speech_means[NUM_CHANNELS]; |
| nstd1ptr = &inst->noise_stds[0]; |
| nstd2ptr = &inst->noise_stds[NUM_CHANNELS]; |
| sstd1ptr = &inst->speech_stds[0]; |
| sstd2ptr = &inst->speech_stds[NUM_CHANNELS]; |
| |
| vadflag = 0; |
| dotVal = 0; |
| for (n = 0; n < NUM_CHANNELS; n++) |
| { // For all channels |
| |
| pos = WEBRTC_SPL_LSHIFT_W16(n, 1); |
| xval = feature_vector[n]; |
| |
| // Probability for Noise, Q7 * Q20 = Q27 |
| tmp32_1 = WebRtcVad_GaussianProbability(xval, *nmean1ptr++, *nstd1ptr++, |
| &deltaN[pos]); |
| probn[0] = (WebRtc_Word32)(kNoiseDataWeights[n] * tmp32_1); |
| tmp32_1 = WebRtcVad_GaussianProbability(xval, *nmean2ptr++, *nstd2ptr++, |
| &deltaN[pos + 1]); |
| probn[1] = (WebRtc_Word32)(kNoiseDataWeights[n + NUM_CHANNELS] * tmp32_1); |
| h0test = probn[0] + probn[1]; // Q27 |
| h0 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(h0test, 12); // Q15 |
| |
| // Probability for Speech |
| tmp32_1 = WebRtcVad_GaussianProbability(xval, *smean1ptr++, *sstd1ptr++, |
| &deltaS[pos]); |
| probs[0] = (WebRtc_Word32)(kSpeechDataWeights[n] * tmp32_1); |
| tmp32_1 = WebRtcVad_GaussianProbability(xval, *smean2ptr++, *sstd2ptr++, |
| &deltaS[pos + 1]); |
| probs[1] = (WebRtc_Word32)(kSpeechDataWeights[n + NUM_CHANNELS] * tmp32_1); |
| h1test = probs[0] + probs[1]; // Q27 |
| h1 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(h1test, 12); // Q15 |
| |
| // Get likelihood ratio. Approximate log2(H1/H0) with shifts0 - shifts1 |
| shifts0 = WebRtcSpl_NormW32(h0test); |
| shifts1 = WebRtcSpl_NormW32(h1test); |
| |
| if ((h0test > 0) && (h1test > 0)) |
| { |
| ratvec = shifts0 - shifts1; |
| } else if (h1test > 0) |
| { |
| ratvec = 31 - shifts1; |
| } else if (h0test > 0) |
| { |
| ratvec = shifts0 - 31; |
| } else |
| { |
| ratvec = 0; |
| } |
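| |
| // Note: WebRtcSpl_NormW32(x) returns the number of left shifts needed to |
| // normalize x, i.e. roughly 31 - log2(x) for x > 0, so shifts0 - shifts1 |
| // approximates log2(h1test) - log2(h0test) = log2(H1/H0). |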
| |
| // VAD decision with spectrum weighting |
| dotVal += WEBRTC_SPL_MUL_16_16(ratvec, kSpectrumWeight[n]); |
| |
| // Individual channel test |
| if ((ratvec << 2) > individualTest) |
| { |
| vadflag = 1; |
| } |
| |
| // Probabilities used when updating model |
| if (h0 > 0) |
| { |
| tmp32_1 = probn[0] & 0xFFFFF000; // Q27 |
| tmp32_2 = WEBRTC_SPL_LSHIFT_W32(tmp32_1, 2); // Q29 |
| ngprvec[pos] = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_2, h0); |
| ngprvec[pos + 1] = 16384 - ngprvec[pos]; |
| } else |
| { |
| ngprvec[pos] = 16384; |
| ngprvec[pos + 1] = 0; |
| } |
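| |
| // For example, if the two Gaussians are equally likely (probn[0] == probn[1]), |
| // ngprvec[pos] becomes ~8192, i.e. 0.5 in Q14. |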
| |
| // Probabilities used when updating model |
| if (h1 > 0) |
| { |
| tmp32_1 = probs[0] & 0xFFFFF000; |
| tmp32_2 = WEBRTC_SPL_LSHIFT_W32(tmp32_1, 2); |
| sgprvec[pos] = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_2, h1); |
| sgprvec[pos + 1] = 16384 - sgprvec[pos]; |
| } else |
| { |
| sgprvec[pos] = 0; |
| sgprvec[pos + 1] = 0; |
| } |
| } |
| |
| // Overall test |
| if (dotVal >= totalTest) |
| { |
| vadflag |= 1; |
| } |
| |
| // Set pointers to the means and standard deviations. |
| nmean1ptr = &inst->noise_means[0]; |
| smean1ptr = &inst->speech_means[0]; |
| nstd1ptr = &inst->noise_stds[0]; |
| sstd1ptr = &inst->speech_stds[0]; |
| |
| maxspe = 12800; |
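| // maxspe = 12800 in Q7 corresponds to 100 in feature units; it is updated |
| // to kMaximumSpeech[n] per channel further down. |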
| |
| // Update the model's parameters |
| for (n = 0; n < NUM_CHANNELS; n++) |
| { |
| |
| pos = WEBRTC_SPL_LSHIFT_W16(n, 1); |
| |
| // Get min value in past which is used for long term correction |
| backval = WebRtcVad_FindMinimum(inst, feature_vector[n], n); // Q4 |
| |
| // Compute the "global" mean, i.e. the weighted sum of the two means |
| nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr); // Q7 * Q7 |
| nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+NUM_CHANNELS], |
| *(nmean1ptr+NUM_CHANNELS)); |
| tmp16_1 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(nmid, 6); // Q8 |
| |
| for (k = 0; k < NUM_MODELS; k++) |
| { |
| |
| nr = pos + k; |
| |
| nmean2ptr = nmean1ptr + k * NUM_CHANNELS; |
| smean2ptr = smean1ptr + k * NUM_CHANNELS; |
| nstd2ptr = nstd1ptr + k * NUM_CHANNELS; |
| sstd2ptr = sstd1ptr + k * NUM_CHANNELS; |
| nmk = *nmean2ptr; |
| smk = *smean2ptr; |
| nsk = *nstd2ptr; |
| ssk = *sstd2ptr; |
| |
| // Update noise mean vector if the frame consists of noise only |
| nmk2 = nmk; |
| if (!vadflag) |
| { |
| // deltaN = (x-mu)/sigma^2 |
| // ngprvec[k] = probn[k]/(probn[0] + probn[1]) |
| |
| delt = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(ngprvec[nr], |
| deltaN[nr], 11); // Q14*Q11 |
| nmk2 = nmk + (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delt, |
| kNoiseUpdateConst, |
| 22); // Q7+(Q14*Q15>>22) |
| } |
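| |
| // Note: kNoiseUpdateConst = 655 in Q15 is ~0.02, so each noise-only frame |
| // nudges the mean by ~2% of the responsibility-weighted (x-mu)/sigma^2 step. |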
| |
| // Long term correction of the noise mean |
| ndelt = WEBRTC_SPL_LSHIFT_W16(backval, 4); |
| ndelt -= tmp16_1; // Q8 - Q8 |
| nmk3 = nmk2 + (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(ndelt, |
| kBackEta, |
| 9); // Q7+(Q8*Q8)>>9 |
| |
| // Make sure the noise mean does not drift too much |
| tmp16 = WEBRTC_SPL_LSHIFT_W16(k+5, 7); |
| if (nmk3 < tmp16) |
| nmk3 = tmp16; |
| tmp16 = WEBRTC_SPL_LSHIFT_W16(72+k-n, 7); |
| if (nmk3 > tmp16) |
| nmk3 = tmp16; |
| *nmean2ptr = nmk3; |
| |
| if (vadflag) |
| { |
| // Update speech mean vector: |
| // deltaS = (x-mu)/sigma^2 |
| // sgprvec[k] = probs[k]/(probs[0] + probs[1]) |
| |
| delt = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(sgprvec[nr], |
| deltaS[nr], |
| 11); // (Q14*Q11)>>11=Q14 |
| tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delt, |
| kSpeechUpdateConst, |
| 21) + 1; |
| smk2 = smk + (tmp16 >> 1); // Q7 + (Q14 * Q15 >> 22) |
| |
| // Make sure the speech mean does not drift too much |
| maxmu = maxspe + 640; |
| if (smk2 < kMinimumMean[k]) |
| smk2 = kMinimumMean[k]; |
| if (smk2 > maxmu) |
| smk2 = maxmu; |
| |
| *smean2ptr = smk2; |
| |
| // (Q7>>3) = Q4 |
| tmp16 = WEBRTC_SPL_RSHIFT_W16((smk + 4), 3); |
| |
| tmp16 = feature_vector[n] - tmp16; // Q4 |
| tmp32_1 = WEBRTC_SPL_MUL_16_16_RSFT(deltaS[nr], tmp16, 3); |
| tmp32_2 = tmp32_1 - (WebRtc_Word32)4096; // Q12 |
| tmp16 = WEBRTC_SPL_RSHIFT_W16((sgprvec[nr]), 2); |
| tmp32_1 = (WebRtc_Word32)(tmp16 * tmp32_2);// (Q15>>3)*(Q14>>2)=Q12*Q12=Q24 |
| |
| tmp32_2 = WEBRTC_SPL_RSHIFT_W32(tmp32_1, 4); // Q20 |
| |
| // 0.1 * Q20 / Q7 = Q13 |
| if (tmp32_2 > 0) |
| tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_2, ssk * 10); |
| else |
| { |
| tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(-tmp32_2, ssk * 10); |
| tmp16 = -tmp16; |
| } |
| // tmp16 is in Q13; adding (tmp16 >> 8) to ssk (Q7) divides by an extra |
| // factor of 4, giving an update factor of 0.1 / 4 = 0.025 |
| tmp16 += 128; // Rounding |
| ssk += WEBRTC_SPL_RSHIFT_W16(tmp16, 8); |
| if (ssk < kMinStd) |
| ssk = kMinStd; |
| *sstd2ptr = ssk; |
| } else |
| { |
| // Update GMM variance vectors |
| // deltaN * (feature_vector[n] - nmk) - 1, Q11 * Q4 |
| tmp16 = feature_vector[n] - WEBRTC_SPL_RSHIFT_W16(nmk, 3); |
| |
| // (Q15>>3) * (Q14>>2) = Q12 * Q12 = Q24 |
| tmp32_1 = WEBRTC_SPL_MUL_16_16_RSFT(deltaN[nr], tmp16, 3) - 4096; |
| tmp16 = WEBRTC_SPL_RSHIFT_W16((ngprvec[nr]+2), 2); |
| tmp32_2 = (WebRtc_Word32)(tmp16 * tmp32_1); |
| tmp32_1 = WEBRTC_SPL_RSHIFT_W32(tmp32_2, 14); |
| // Q20 * approx 0.001 (2^-10=0.0009766) |
| |
| // Q20 / Q7 = Q13 |
| if (tmp32_1 > 0) |
| tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_1, nsk); |
| else |
| { |
| tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(-tmp32_1, nsk); |
| tmp16 = -tmp16; |
| } |
| tmp16 += 32; // Rounding |
| nsk += WEBRTC_SPL_RSHIFT_W16(tmp16, 6); |
| |
| if (nsk < kMinStd) |
| nsk = kMinStd; |
| |
| *nstd2ptr = nsk; |
| } |
| } |
| |
| // Separate models if they are too close - nmid in Q14 |
| nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr); |
| nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+NUM_CHANNELS], *nmean2ptr); |
| |
| // smid in Q14 |
| smid = WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n], *smean1ptr); |
| smid += WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n+NUM_CHANNELS], *smean2ptr); |
| |
| // diff = "global" speech mean - "global" noise mean |
| diff = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(smid, 9); |
| tmp16 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(nmid, 9); |
| diff -= tmp16; |
| |
| if (diff < kMinimumDifference[n]) |
| { |
| |
| tmp16 = kMinimumDifference[n] - diff; // Q5 |
| |
| // tmp16_1 = ~0.8 * (kMinimumDifference - diff) in Q7 (exactly 13/16 = 0.8125) |
| // tmp16_2 = ~0.2 * (kMinimumDifference - diff) in Q7 (exactly 3/16 = 0.1875) |
| tmp16_1 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(13, tmp16, 2); |
| tmp16_2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(3, tmp16, 2); |
| |
| // First Gauss, speech model |
| tmp16 = tmp16_1 + *smean1ptr; |
| *smean1ptr = tmp16; |
| smid = WEBRTC_SPL_MUL_16_16(tmp16, kSpeechDataWeights[n]); |
| |
| // Second Gauss, speech model |
| tmp16 = tmp16_1 + *smean2ptr; |
| *smean2ptr = tmp16; |
| smid += WEBRTC_SPL_MUL_16_16(tmp16, kSpeechDataWeights[n+NUM_CHANNELS]); |
| |
| // First Gauss, noise model |
| tmp16 = *nmean1ptr - tmp16_2; |
| *nmean1ptr = tmp16; |
| |
| nmid = WEBRTC_SPL_MUL_16_16(tmp16, kNoiseDataWeights[n]); |
| |
| // Second Gauss, noise model |
| tmp16 = *nmean2ptr - tmp16_2; |
| *nmean2ptr = tmp16; |
| nmid += WEBRTC_SPL_MUL_16_16(tmp16, kNoiseDataWeights[n+NUM_CHANNELS]); |
| } |
| |
| // Make sure the speech & noise means do not drift too much |
| maxspe = kMaximumSpeech[n]; |
| tmp16_2 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(smid, 7); |
| if (tmp16_2 > maxspe) |
| { // Upper limit of speech model |
| tmp16_2 -= maxspe; |
| |
| *smean1ptr -= tmp16_2; |
| *smean2ptr -= tmp16_2; |
| } |
| |
| tmp16_2 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(nmid, 7); |
| if (tmp16_2 > kMaximumNoise[n]) |
| { |
| tmp16_2 -= kMaximumNoise[n]; |
| |
| *nmean1ptr -= tmp16_2; |
| *nmean2ptr -= tmp16_2; |
| } |
| |
| nmean1ptr++; |
| smean1ptr++; |
| nstd1ptr++; |
| sstd1ptr++; |
| } |
| inst->frame_counter++; |
| } else |
| { |
| vadflag = 0; |
| } |
| |
| // Hangover smoothing |
| if (!vadflag) |
| { |
| if (inst->over_hang > 0) |
| { |
| vadflag = 2 + inst->over_hang; |
| inst->over_hang = inst->over_hang - 1; |
| } |
| inst->num_of_speech = 0; |
| } else |
| { |
| inst->num_of_speech = inst->num_of_speech + 1; |
| if (inst->num_of_speech > kMaxSpeechFrames) |
| { |
| inst->num_of_speech = kMaxSpeechFrames; |
| inst->over_hang = overhead2; |
| } else |
| inst->over_hang = overhead1; |
| } |
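| |
| // Example: after kMaxSpeechFrames (6) consecutive speech frames the longer |
| // hangover (overhead2) is armed; following non-speech frames then return |
| // 2 + over_hang (still counted as speech) while over_hang decrements. |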
| return vadflag; |
| } |
| |
| // Initialize VAD |
| int WebRtcVad_InitCore(VadInstT *inst, short mode) |
| { |
| int i; |
| |
| // Initialization of struct |
| inst->vad = 1; |
| inst->frame_counter = 0; |
| inst->over_hang = 0; |
| inst->num_of_speech = 0; |
| |
| // Initialization of downsampling filter state |
| inst->downsampling_filter_states[0] = 0; |
| inst->downsampling_filter_states[1] = 0; |
| inst->downsampling_filter_states[2] = 0; |
| inst->downsampling_filter_states[3] = 0; |
| |
| // Read initial PDF parameters |
| for (i = 0; i < NUM_TABLE_VALUES; i++) |
| { |
| inst->noise_means[i] = kNoiseDataMeans[i]; |
| inst->speech_means[i] = kSpeechDataMeans[i]; |
| inst->noise_stds[i] = kNoiseDataStds[i]; |
| inst->speech_stds[i] = kSpeechDataStds[i]; |
| } |
| |
| // Initialize the index and minimum value vectors |
| for (i = 0; i < 16 * NUM_CHANNELS; i++) |
| { |
| inst->low_value_vector[i] = 10000; |
| inst->index_vector[i] = 0; |
| } |
| |
| for (i = 0; i < 5; i++) |
| { |
| inst->upper_state[i] = 0; |
| inst->lower_state[i] = 0; |
| } |
| |
| for (i = 0; i < 4; i++) |
| { |
| inst->hp_filter_state[i] = 0; |
| } |
| |
| // Initialize mean value memory, used by WebRtcVad_FindMinimum() |
| inst->mean_value[0] = 1600; |
| inst->mean_value[1] = 1600; |
| inst->mean_value[2] = 1600; |
| inst->mean_value[3] = 1600; |
| inst->mean_value[4] = 1600; |
| inst->mean_value[5] = 1600; |
| |
| if (WebRtcVad_set_mode_core(inst, mode) != 0) { |
| return -1; |
| } |
| |
| inst->init_flag = kInitCheck; |
| |
| return 0; |
| } |
| |
| // Set aggressiveness mode |
| int WebRtcVad_set_mode_core(VadInstT *inst, short mode) |
| { |
| |
| if (mode == 0) |
| { |
| // Quality mode |
| inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst |
| inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst |
| inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst |
| inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst |
| inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst |
| inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst |
| |
| inst->individual[0] = INDIVIDUAL_10MS_Q; |
| inst->individual[1] = INDIVIDUAL_20MS_Q; |
| inst->individual[2] = INDIVIDUAL_30MS_Q; |
| |
| inst->total[0] = TOTAL_10MS_Q; |
| inst->total[1] = TOTAL_20MS_Q; |
| inst->total[2] = TOTAL_30MS_Q; |
| } else if (mode == 1) |
| { |
| // Low bitrate mode |
| inst->over_hang_max_1[0] = OHMAX1_10MS_LBR; // Overhang short speech burst |
| inst->over_hang_max_1[1] = OHMAX1_20MS_LBR; // Overhang short speech burst |
| inst->over_hang_max_1[2] = OHMAX1_30MS_LBR; // Overhang short speech burst |
| inst->over_hang_max_2[0] = OHMAX2_10MS_LBR; // Overhang long speech burst |
| inst->over_hang_max_2[1] = OHMAX2_20MS_LBR; // Overhang long speech burst |
| inst->over_hang_max_2[2] = OHMAX2_30MS_LBR; // Overhang long speech burst |
| |
| inst->individual[0] = INDIVIDUAL_10MS_LBR; |
| inst->individual[1] = INDIVIDUAL_20MS_LBR; |
| inst->individual[2] = INDIVIDUAL_30MS_LBR; |
| |
| inst->total[0] = TOTAL_10MS_LBR; |
| inst->total[1] = TOTAL_20MS_LBR; |
| inst->total[2] = TOTAL_30MS_LBR; |
| } else if (mode == 2) |
| { |
| // Aggressive mode |
| inst->over_hang_max_1[0] = OHMAX1_10MS_AGG; // Overhang short speech burst |
| inst->over_hang_max_1[1] = OHMAX1_20MS_AGG; // Overhang short speech burst |
| inst->over_hang_max_1[2] = OHMAX1_30MS_AGG; // Overhang short speech burst |
| inst->over_hang_max_2[0] = OHMAX2_10MS_AGG; // Overhang long speech burst |
| inst->over_hang_max_2[1] = OHMAX2_20MS_AGG; // Overhang long speech burst |
| inst->over_hang_max_2[2] = OHMAX2_30MS_AGG; // Overhang long speech burst |
| |
| inst->individual[0] = INDIVIDUAL_10MS_AGG; |
| inst->individual[1] = INDIVIDUAL_20MS_AGG; |
| inst->individual[2] = INDIVIDUAL_30MS_AGG; |
| |
| inst->total[0] = TOTAL_10MS_AGG; |
| inst->total[1] = TOTAL_20MS_AGG; |
| inst->total[2] = TOTAL_30MS_AGG; |
| } else if (mode == 3) |
| { |
| // Very aggressive mode |
| inst->over_hang_max_1[0] = OHMAX1_10MS_VAG; // Overhang short speech burst |
| inst->over_hang_max_1[1] = OHMAX1_20MS_VAG; // Overhang short speech burst |
| inst->over_hang_max_1[2] = OHMAX1_30MS_VAG; // Overhang short speech burst |
| inst->over_hang_max_2[0] = OHMAX2_10MS_VAG; // Overhang long speech burst |
| inst->over_hang_max_2[1] = OHMAX2_20MS_VAG; // Overhang long speech burst |
| inst->over_hang_max_2[2] = OHMAX2_30MS_VAG; // Overhang long speech burst |
| |
| inst->individual[0] = INDIVIDUAL_10MS_VAG; |
| inst->individual[1] = INDIVIDUAL_20MS_VAG; |
| inst->individual[2] = INDIVIDUAL_30MS_VAG; |
| |
| inst->total[0] = TOTAL_10MS_VAG; |
| inst->total[1] = TOTAL_20MS_VAG; |
| inst->total[2] = TOTAL_30MS_VAG; |
| } else |
| { |
| return -1; |
| } |
| |
| return 0; |
| } |
| |
| // Calculate the VAD decision by first extracting feature values and then |
| // computing the probability of both speech and background noise. |
| |
| WebRtc_Word16 WebRtcVad_CalcVad32khz(VadInstT *inst, WebRtc_Word16 *speech_frame, |
| int frame_length) |
| { |
| WebRtc_Word16 len, vad; |
| WebRtc_Word16 speechWB[480]; // Downsampled speech frame: max 480 samples (30 ms at 16 kHz) |
| WebRtc_Word16 speechNB[240]; // Downsampled speech frame: max 240 samples (30 ms at 8 kHz) |
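| |
| // For a 30 ms frame at 32 kHz (960 samples) the two downsampling stages below |
| // yield 480 samples (16 kHz) and then 240 samples (8 kHz), the frame length |
| // passed on to WebRtcVad_CalcVad8khz(). |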
| |
| |
| // Downsample signal 32->16->8 before doing VAD |
| WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_states[2]), |
| frame_length); |
| len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1); |
| |
| WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states, len); |
| len = WEBRTC_SPL_RSHIFT_W16(len, 1); |
| |
| // Do VAD on an 8 kHz signal |
| vad = WebRtcVad_CalcVad8khz(inst, speechNB, len); |
| |
| return vad; |
| } |
| |
| WebRtc_Word16 WebRtcVad_CalcVad16khz(VadInstT *inst, WebRtc_Word16 *speech_frame, |
| int frame_length) |
| { |
| WebRtc_Word16 len, vad; |
| WebRtc_Word16 speechNB[240]; // Downsampled speech frame: max 240 samples (30 ms at 8 kHz) |
| |
| // Wideband: Downsample signal before doing VAD |
| WebRtcVad_Downsampling(speech_frame, speechNB, inst->downsampling_filter_states, |
| frame_length); |
| |
| len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1); |
| vad = WebRtcVad_CalcVad8khz(inst, speechNB, len); |
| |
| return vad; |
| } |
| |
| WebRtc_Word16 WebRtcVad_CalcVad8khz(VadInstT *inst, WebRtc_Word16 *speech_frame, |
| int frame_length) |
| { |
| WebRtc_Word16 feature_vector[NUM_CHANNELS], total_power; |
| |
| // Get power in the bands |
| total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length, |
| feature_vector); |
| |
| // Make a VAD |
| inst->vad = GmmProbability(inst, feature_vector, total_power, frame_length); |
| |
| return inst->vad; |
| } |