| /* |
| * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. |
| * |
| * Use of this source code is governed by a BSD-style license |
| * that can be found in the LICENSE file in the root of the source |
| * tree. An additional intellectual property rights grant can be found |
| * in the file PATENTS. All contributing project authors may |
| * be found in the AUTHORS file in the root of the source tree. |
| */ |
| |
| #include "nsx_core.h" |
| |
| #include <arm_neon.h> |
| #include <assert.h> |
| |
| // Update the noise estimation information. |
| static void UpdateNoiseEstimateNeon(NsxInst_t* inst, int offset) { |
| int i = 0; |
| const int16_t kExp2Const = 11819; // Q13 |
| int16_t* ptr_noiseEstLogQuantile = NULL; |
| int16_t* ptr_noiseEstQuantile = NULL; |
| int16x4_t kExp2Const16x4 = vdup_n_s16(kExp2Const); |
| int32x4_t twentyOne32x4 = vdupq_n_s32(21); |
| int32x4_t constA32x4 = vdupq_n_s32(0x1fffff); |
| int32x4_t constB32x4 = vdupq_n_s32(0x200000); |
| |
| int16_t tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset, |
| inst->magnLen); |
| |
| // Guarantee a Q-domain as high as possible and still fit in int16 |
| inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(kExp2Const, |
| tmp16, |
| 21); |
| |
| int32x4_t qNoise32x4 = vdupq_n_s32(inst->qNoise); |
| |
| for (ptr_noiseEstLogQuantile = &inst->noiseEstLogQuantile[offset], |
| ptr_noiseEstQuantile = &inst->noiseEstQuantile[0]; |
| ptr_noiseEstQuantile < &inst->noiseEstQuantile[inst->magnLen - 3]; |
| ptr_noiseEstQuantile += 4, ptr_noiseEstLogQuantile += 4) { |
| |
| // tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const, |
| // inst->noiseEstLogQuantile[offset + i]); |
| int16x4_t v16x4 = vld1_s16(ptr_noiseEstLogQuantile); |
| int32x4_t v32x4B = vmull_s16(v16x4, kExp2Const16x4); |
| |
| // tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac |
| int32x4_t v32x4A = vandq_s32(v32x4B, constA32x4); |
| v32x4A = vorrq_s32(v32x4A, constB32x4); |
| |
| // tmp16 = (int16_t) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21); |
| v32x4B = vshrq_n_s32(v32x4B, 21); |
| |
| // tmp16 -= 21;// shift 21 to get result in Q0 |
| v32x4B = vsubq_s32(v32x4B, twentyOne32x4); |
| |
| // tmp16 += (int16_t) inst->qNoise; |
| // shift to get result in Q(qNoise) |
| v32x4B = vaddq_s32(v32x4B, qNoise32x4); |
| |
| // if (tmp16 < 0) { |
| // tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16); |
| // } else { |
| // tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16); |
| // } |
| v32x4B = vshlq_s32(v32x4A, v32x4B); |
| |
| // tmp16 = WebRtcSpl_SatW32ToW16(tmp32no1); |
| v16x4 = vqmovn_s32(v32x4B); |
| |
| //inst->noiseEstQuantile[i] = tmp16; |
| vst1_s16(ptr_noiseEstQuantile, v16x4); |
| } |
| |
| // Last iteration: |
| |
| // inst->quantile[i]=exp(inst->lquantile[offset+i]); |
| // in Q21 |
| int32_t tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const, |
| *ptr_noiseEstLogQuantile); |
| int32_t tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac |
| |
| tmp16 = (int16_t) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21); |
| tmp16 -= 21;// shift 21 to get result in Q0 |
| tmp16 += (int16_t) inst->qNoise; //shift to get result in Q(qNoise) |
| if (tmp16 < 0) { |
| tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16); |
| } else { |
| tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16); |
| } |
| *ptr_noiseEstQuantile = WebRtcSpl_SatW32ToW16(tmp32no1); |
| } |
| |
| // Noise Estimation |
| static void NoiseEstimationNeon(NsxInst_t* inst, |
| uint16_t* magn, |
| uint32_t* noise, |
| int16_t* q_noise) { |
| int16_t lmagn[HALF_ANAL_BLOCKL], counter, countDiv; |
| int16_t countProd, delta, zeros, frac; |
| int16_t log2, tabind, logval, tmp16, tmp16no1, tmp16no2; |
| const int16_t log2_const = 22713; |
| const int16_t width_factor = 21845; |
| |
| int i, s, offset; |
| |
| tabind = inst->stages - inst->normData; |
| assert(tabind < 9); |
| assert(tabind > -9); |
| if (tabind < 0) { |
| logval = -WebRtcNsx_kLogTable[-tabind]; |
| } else { |
| logval = WebRtcNsx_kLogTable[tabind]; |
| } |
| |
| int16x8_t logval_16x8 = vdupq_n_s16(logval); |
| |
| // lmagn(i)=log(magn(i))=log(2)*log2(magn(i)) |
| // magn is in Q(-stages), and the real lmagn values are: |
| // real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages) |
| // lmagn in Q8 |
| for (i = 0; i < inst->magnLen; i++) { |
| if (magn[i]) { |
| zeros = WebRtcSpl_NormU32((uint32_t)magn[i]); |
| frac = (int16_t)((((uint32_t)magn[i] << zeros) |
| & 0x7FFFFFFF) >> 23); |
| assert(frac < 256); |
| // log2(magn(i)) |
| log2 = (int16_t)(((31 - zeros) << 8) |
| + WebRtcNsx_kLogTableFrac[frac]); |
| // log2(magn(i))*log(2) |
| lmagn[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15); |
| // + log(2^stages) |
| lmagn[i] += logval; |
| } else { |
| lmagn[i] = logval; |
| } |
| } |
| |
| int16x4_t Q3_16x4 = vdup_n_s16(3); |
| int16x8_t WIDTHQ8_16x8 = vdupq_n_s16(WIDTH_Q8); |
| int16x8_t WIDTHFACTOR_16x8 = vdupq_n_s16(width_factor); |
| |
| int16_t factor = FACTOR_Q7; |
| if (inst->blockIndex < END_STARTUP_LONG) |
| factor = FACTOR_Q7_STARTUP; |
| |
| // Loop over simultaneous estimates |
| for (s = 0; s < SIMULT; s++) { |
| offset = s * inst->magnLen; |
| |
| // Get counter values from state |
| counter = inst->noiseEstCounter[s]; |
| assert(counter < 201); |
| countDiv = WebRtcNsx_kCounterDiv[counter]; |
| countProd = (int16_t)WEBRTC_SPL_MUL_16_16(counter, countDiv); |
| |
| // quant_est(...) |
| int16_t deltaBuff[8]; |
| int16x4_t tmp16x4_0; |
| int16x4_t tmp16x4_1; |
| int16x4_t countDiv_16x4 = vdup_n_s16(countDiv); |
| int16x8_t countProd_16x8 = vdupq_n_s16(countProd); |
| int16x8_t tmp16x8_0 = vdupq_n_s16(countDiv); |
| int16x8_t prod16x8 = vqrdmulhq_s16(WIDTHFACTOR_16x8, tmp16x8_0); |
| int16x8_t tmp16x8_1; |
| int16x8_t tmp16x8_2; |
| int16x8_t tmp16x8_3; |
| int16x8_t tmp16x8_4; |
| int16x8_t tmp16x8_5; |
| int32x4_t tmp32x4; |
| |
| for (i = 0; i < inst->magnLen - 7; i += 8) { |
| // Compute delta. |
| // Smaller step size during startup. This prevents from using |
| // unrealistic values causing overflow. |
| tmp16x8_0 = vdupq_n_s16(factor); |
| vst1q_s16(deltaBuff, tmp16x8_0); |
| |
| int j; |
| for (j = 0; j < 8; j++) { |
| if (inst->noiseEstDensity[offset + i + j] > 512) { |
| // Get values for deltaBuff by shifting intead of dividing. |
| int factor = WebRtcSpl_NormW16(inst->noiseEstDensity[offset + i + j]); |
| deltaBuff[j] = (int16_t)(FACTOR_Q16 >> (14 - factor)); |
| } |
| } |
| |
| // Update log quantile estimate |
| |
| // tmp16 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14); |
| tmp32x4 = vmull_s16(vld1_s16(&deltaBuff[0]), countDiv_16x4); |
| tmp16x4_1 = vshrn_n_s32(tmp32x4, 14); |
| tmp32x4 = vmull_s16(vld1_s16(&deltaBuff[4]), countDiv_16x4); |
| tmp16x4_0 = vshrn_n_s32(tmp32x4, 14); |
| tmp16x8_0 = vcombine_s16(tmp16x4_1, tmp16x4_0); // Keep for several lines. |
| |
| // prepare for the "if" branch |
| // tmp16 += 2; |
| // tmp16_1 = (Word16)(tmp16>>2); |
| tmp16x8_1 = vrshrq_n_s16(tmp16x8_0, 2); |
| |
| // inst->noiseEstLogQuantile[offset+i] + tmp16_1; |
| tmp16x8_2 = vld1q_s16(&inst->noiseEstLogQuantile[offset + i]); // Keep |
| tmp16x8_1 = vaddq_s16(tmp16x8_2, tmp16x8_1); // Keep for several lines |
| |
| // Prepare for the "else" branch |
| // tmp16 += 1; |
| // tmp16_1 = (Word16)(tmp16>>1); |
| tmp16x8_0 = vrshrq_n_s16(tmp16x8_0, 1); |
| |
| // tmp16_2 = (Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16_1,3,1); |
| tmp32x4 = vmull_s16(vget_low_s16(tmp16x8_0), Q3_16x4); |
| tmp16x4_1 = vshrn_n_s32(tmp32x4, 1); |
| |
| // tmp16_2 = (Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16_1,3,1); |
| tmp32x4 = vmull_s16(vget_high_s16(tmp16x8_0), Q3_16x4); |
| tmp16x4_0 = vshrn_n_s32(tmp32x4, 1); |
| |
| // inst->noiseEstLogQuantile[offset + i] - tmp16_2; |
| tmp16x8_0 = vcombine_s16(tmp16x4_1, tmp16x4_0); // keep |
| tmp16x8_0 = vsubq_s16(tmp16x8_2, tmp16x8_0); |
| |
| // logval is the smallest fixed point representation we can have. Values |
| // below that will correspond to values in the interval [0, 1], which |
| // can't possibly occur. |
| tmp16x8_0 = vmaxq_s16(tmp16x8_0, logval_16x8); |
| |
| // Do the if-else branches: |
| tmp16x8_3 = vld1q_s16(&lmagn[i]); // keep for several lines |
| tmp16x8_5 = vsubq_s16(tmp16x8_3, tmp16x8_2); |
| __asm__("vcgt.s16 %q0, %q1, #0"::"w"(tmp16x8_4), "w"(tmp16x8_5)); |
| __asm__("vbit %q0, %q1, %q2":: |
| "w"(tmp16x8_2), "w"(tmp16x8_1), "w"(tmp16x8_4)); |
| __asm__("vbif %q0, %q1, %q2":: |
| "w"(tmp16x8_2), "w"(tmp16x8_0), "w"(tmp16x8_4)); |
| vst1q_s16(&inst->noiseEstLogQuantile[offset + i], tmp16x8_2); |
| |
| // Update density estimate |
| // tmp16_1 + tmp16_2 |
| tmp16x8_1 = vld1q_s16(&inst->noiseEstDensity[offset + i]); |
| tmp16x8_0 = vqrdmulhq_s16(tmp16x8_1, countProd_16x8); |
| tmp16x8_0 = vaddq_s16(tmp16x8_0, prod16x8); |
| |
| // lmagn[i] - inst->noiseEstLogQuantile[offset + i] |
| tmp16x8_3 = vsubq_s16(tmp16x8_3, tmp16x8_2); |
| tmp16x8_3 = vabsq_s16(tmp16x8_3); |
| tmp16x8_4 = vcgtq_s16(WIDTHQ8_16x8, tmp16x8_3); |
| __asm__("vbit %q0, %q1, %q2":: |
| "w"(tmp16x8_1), "w"(tmp16x8_0), "w"(tmp16x8_4)); |
| vst1q_s16(&inst->noiseEstDensity[offset + i], tmp16x8_1); |
| } // End loop over magnitude spectrum |
| |
| // Last iteration over magnitude spectrum: |
| // compute delta |
| if (inst->noiseEstDensity[offset + i] > 512) { |
| // Get values for deltaBuff by shifting intead of dividing. |
| int factor = WebRtcSpl_NormW16(inst->noiseEstDensity[offset + i]); |
| delta = (int16_t)(FACTOR_Q16 >> (14 - factor)); |
| } else { |
| delta = FACTOR_Q7; |
| if (inst->blockIndex < END_STARTUP_LONG) { |
| // Smaller step size during startup. This prevents from using |
| // unrealistic values causing overflow. |
| delta = FACTOR_Q7_STARTUP; |
| } |
| } |
| // update log quantile estimate |
| tmp16 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14); |
| if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) { |
| // +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2 |
| // CounterDiv=1/(inst->counter[s]+1) in Q15 |
| tmp16 += 2; |
| tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 2); |
| inst->noiseEstLogQuantile[offset + i] += tmp16no1; |
| } else { |
| tmp16 += 1; |
| tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 1); |
| // *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2 |
| tmp16no2 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, 3, 1); |
| inst->noiseEstLogQuantile[offset + i] -= tmp16no2; |
| if (inst->noiseEstLogQuantile[offset + i] < logval) { |
| // logval is the smallest fixed point representation we can have. |
| // Values below that will correspond to values in the interval |
| // [0, 1], which can't possibly occur. |
| inst->noiseEstLogQuantile[offset + i] = logval; |
| } |
| } |
| |
| // update density estimate |
| if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i]) |
| < WIDTH_Q8) { |
| tmp16no1 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( |
| inst->noiseEstDensity[offset + i], countProd, 15); |
| tmp16no2 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( |
| width_factor, countDiv, 15); |
| inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2; |
| } |
| |
| |
| if (counter >= END_STARTUP_LONG) { |
| inst->noiseEstCounter[s] = 0; |
| if (inst->blockIndex >= END_STARTUP_LONG) { |
| UpdateNoiseEstimateNeon(inst, offset); |
| } |
| } |
| inst->noiseEstCounter[s]++; |
| |
| } // end loop over simultaneous estimates |
| |
| // Sequentially update the noise during startup |
| if (inst->blockIndex < END_STARTUP_LONG) { |
| UpdateNoiseEstimateNeon(inst, offset); |
| } |
| |
| for (i = 0; i < inst->magnLen; i++) { |
| noise[i] = (uint32_t)(inst->noiseEstQuantile[i]); // Q(qNoise) |
| } |
| (*q_noise) = (int16_t)inst->qNoise; |
| } |
| |
| // Filter the data in the frequency domain, and create spectrum. |
| static void PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf) { |
| |
| // (1) Filtering. |
| |
| // Fixed point C code for the next block is as follows: |
| // for (i = 0; i < inst->magnLen; i++) { |
| // inst->real[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(inst->real[i], |
| // (int16_t)(inst->noiseSupFilter[i]), 14); // Q(normData-stages) |
| // inst->imag[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(inst->imag[i], |
| // (int16_t)(inst->noiseSupFilter[i]), 14); // Q(normData-stages) |
| // } |
| |
| int16_t* ptr_real = &inst->real[0]; |
| int16_t* ptr_imag = &inst->imag[0]; |
| uint16_t* ptr_noiseSupFilter = &inst->noiseSupFilter[0]; |
| |
| // Filter the rest in the frequency domain. |
| for (; ptr_real < &inst->real[inst->magnLen - 1];) { |
| // Loop unrolled once. Both pointers are incremented by 4 twice. |
| __asm__ __volatile__( |
| "vld1.16 d20, [%[ptr_real]]\n\t" |
| "vld1.16 d22, [%[ptr_imag]]\n\t" |
| "vld1.16 d23, [%[ptr_noiseSupFilter]]!\n\t" |
| "vmull.s16 q10, d20, d23\n\t" |
| "vmull.s16 q11, d22, d23\n\t" |
| "vshrn.s32 d20, q10, #14\n\t" |
| "vshrn.s32 d22, q11, #14\n\t" |
| "vst1.16 d20, [%[ptr_real]]!\n\t" |
| "vst1.16 d22, [%[ptr_imag]]!\n\t" |
| |
| "vld1.16 d18, [%[ptr_real]]\n\t" |
| "vld1.16 d24, [%[ptr_imag]]\n\t" |
| "vld1.16 d25, [%[ptr_noiseSupFilter]]!\n\t" |
| "vmull.s16 q9, d18, d25\n\t" |
| "vmull.s16 q12, d24, d25\n\t" |
| "vshrn.s32 d18, q9, #14\n\t" |
| "vshrn.s32 d24, q12, #14\n\t" |
| "vst1.16 d18, [%[ptr_real]]!\n\t" |
| "vst1.16 d24, [%[ptr_imag]]!\n\t" |
| |
| // Specify constraints. |
| :[ptr_imag]"+r"(ptr_imag), |
| [ptr_real]"+r"(ptr_real), |
| [ptr_noiseSupFilter]"+r"(ptr_noiseSupFilter) |
| : |
| :"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", |
| "q9", "q10", "q11", "q12" |
| ); |
| } |
| |
| // Filter the last pair of elements in the frequency domain. |
| *ptr_real = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(*ptr_real, |
| (int16_t)(*ptr_noiseSupFilter), 14); // Q(normData-stages) |
| *ptr_imag = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(*ptr_imag, |
| (int16_t)(*ptr_noiseSupFilter), 14); // Q(normData-stages) |
| |
| // (2) Create spectrum. |
| |
| // Fixed point C code for the rest of the function is as follows: |
| // freq_buf[0] = inst->real[0]; |
| // freq_buf[1] = -inst->imag[0]; |
| // for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) { |
| // tmp16 = (inst->anaLen << 1) - j; |
| // freq_buf[j] = inst->real[i]; |
| // freq_buf[j + 1] = -inst->imag[i]; |
| // freq_buf[tmp16] = inst->real[i]; |
| // freq_buf[tmp16 + 1] = inst->imag[i]; |
| // } |
| // freq_buf[inst->anaLen] = inst->real[inst->anaLen2]; |
| // freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2]; |
| |
| freq_buf[0] = inst->real[0]; |
| freq_buf[1] = -inst->imag[0]; |
| |
| int offset = -16; |
| int16_t* ptr_realImag1 = &freq_buf[2]; |
| int16_t* ptr_realImag2 = ptr_realImag2 = &freq_buf[(inst->anaLen << 1) - 8]; |
| ptr_real = &inst->real[1]; |
| ptr_imag = &inst->imag[1]; |
| for (; ptr_real < &inst->real[inst->anaLen2 - 11];) { |
| // Loop unrolled once. All pointers are incremented twice. |
| __asm__ __volatile__( |
| "vld1.16 d22, [%[ptr_real]]!\n\t" |
| "vld1.16 d23, [%[ptr_imag]]!\n\t" |
| // Negate and interleave: |
| "vmov.s16 d20, d22\n\t" |
| "vneg.s16 d21, d23\n\t" |
| "vzip.16 d20, d21\n\t" |
| // Write 8 elements to &freq_buf[j] |
| "vst1.16 {d20, d21}, [%[ptr_realImag1]]!\n\t" |
| // Interleave and reverse elements: |
| "vzip.16 d22, d23\n\t" |
| "vrev64.32 d18, d23\n\t" |
| "vrev64.32 d19, d22\n\t" |
| // Write 8 elements to &freq_buf[tmp16] |
| "vst1.16 {d18, d19}, [%[ptr_realImag2]], %[offset]\n\t" |
| |
| "vld1.16 d22, [%[ptr_real]]!\n\t" |
| "vld1.16 d23, [%[ptr_imag]]!\n\t" |
| // Negate and interleave: |
| "vmov.s16 d20, d22\n\t" |
| "vneg.s16 d21, d23\n\t" |
| "vzip.16 d20, d21\n\t" |
| // Write 8 elements to &freq_buf[j] |
| "vst1.16 {d20, d21}, [%[ptr_realImag1]]!\n\t" |
| // Interleave and reverse elements: |
| "vzip.16 d22, d23\n\t" |
| "vrev64.32 d18, d23\n\t" |
| "vrev64.32 d19, d22\n\t" |
| // Write 8 elements to &freq_buf[tmp16] |
| "vst1.16 {d18, d19}, [%[ptr_realImag2]], %[offset]\n\t" |
| |
| // Specify constraints. |
| :[ptr_imag]"+r"(ptr_imag), |
| [ptr_real]"+r"(ptr_real), |
| [ptr_realImag1]"+r"(ptr_realImag1), |
| [ptr_realImag2]"+r"(ptr_realImag2) |
| :[offset]"r"(offset) |
| :"d18", "d19", "d20", "d21", "d22", "d23" |
| ); |
| } |
| for (ptr_realImag2 += 6; |
| ptr_real <= &inst->real[inst->anaLen2]; |
| ptr_real += 1, ptr_imag += 1, ptr_realImag1 += 2, ptr_realImag2 -= 2) { |
| *ptr_realImag1 = *ptr_real; |
| *(ptr_realImag1 + 1) = -(*ptr_imag); |
| *ptr_realImag2 = *ptr_real; |
| *(ptr_realImag2 + 1) = *ptr_imag; |
| } |
| |
| freq_buf[inst->anaLen] = inst->real[inst->anaLen2]; |
| freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2]; |
| } |
| |
| // Denormalize the input buffer. |
| static __inline void DenormalizeNeon(NsxInst_t* inst, int16_t* in, int factor) { |
| int16_t* ptr_real = &inst->real[0]; |
| int16_t* ptr_in = &in[0]; |
| |
| __asm__ __volatile__("vdup.32 q10, %0" :: |
| "r"((int32_t)(factor - inst->normData)) : "q10"); |
| for (; ptr_real < &inst->real[inst->anaLen];) { |
| |
| // Loop unrolled once. Both pointers are incremented. |
| __asm__ __volatile__( |
| // tmp32 = WEBRTC_SPL_SHIFT_W32((int32_t)in[j], |
| // factor - inst->normData); |
| "vld2.16 {d24, d25}, [%[ptr_in]]!\n\t" |
| "vmovl.s16 q12, d24\n\t" |
| "vshl.s32 q12, q10\n\t" |
| // inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0 |
| "vqmovn.s32 d24, q12\n\t" |
| "vst1.16 d24, [%[ptr_real]]!\n\t" |
| |
| // tmp32 = WEBRTC_SPL_SHIFT_W32((int32_t)in[j], |
| // factor - inst->normData); |
| "vld2.16 {d22, d23}, [%[ptr_in]]!\n\t" |
| "vmovl.s16 q11, d22\n\t" |
| "vshl.s32 q11, q10\n\t" |
| // inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0 |
| "vqmovn.s32 d22, q11\n\t" |
| "vst1.16 d22, [%[ptr_real]]!\n\t" |
| |
| // Specify constraints. |
| :[ptr_in]"+r"(ptr_in), |
| [ptr_real]"+r"(ptr_real) |
| : |
| :"d22", "d23", "d24", "d25" |
| ); |
| } |
| } |
| |
| // For the noise supress process, synthesis, read out fully processed segment, |
| // and update synthesis buffer. |
| static void SynthesisUpdateNeon(NsxInst_t* inst, |
| int16_t* out_frame, |
| int16_t gain_factor) { |
| int16_t* ptr_real = &inst->real[0]; |
| int16_t* ptr_syn = &inst->synthesisBuffer[0]; |
| int16_t* ptr_window = &inst->window[0]; |
| |
| // synthesis |
| __asm__ __volatile__("vdup.16 d24, %0" : : "r"(gain_factor) : "d24"); |
| // Loop unrolled once. All pointers are incremented in the assembly code. |
| for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen];) { |
| __asm__ __volatile__( |
| // Load variables. |
| "vld1.16 d22, [%[ptr_real]]!\n\t" |
| "vld1.16 d23, [%[ptr_window]]!\n\t" |
| "vld1.16 d25, [%[ptr_syn]]\n\t" |
| // tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( |
| // inst->window[i], inst->real[i], 14); // Q0, window in Q14 |
| "vmull.s16 q11, d22, d23\n\t" |
| "vrshrn.i32 d22, q11, #14\n\t" |
| // tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13); |
| "vmull.s16 q11, d24, d22\n\t" |
| // tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0 |
| "vqrshrn.s32 d22, q11, #13\n\t" |
| // inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16( |
| // inst->synthesisBuffer[i], tmp16b); // Q0 |
| "vqadd.s16 d25, d22\n\t" |
| "vst1.16 d25, [%[ptr_syn]]!\n\t" |
| |
| // Load variables. |
| "vld1.16 d26, [%[ptr_real]]!\n\t" |
| "vld1.16 d27, [%[ptr_window]]!\n\t" |
| "vld1.16 d28, [%[ptr_syn]]\n\t" |
| // tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( |
| // inst->window[i], inst->real[i], 14); // Q0, window in Q14 |
| "vmull.s16 q13, d26, d27\n\t" |
| "vrshrn.i32 d26, q13, #14\n\t" |
| // tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13); |
| "vmull.s16 q13, d24, d26\n\t" |
| // tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0 |
| "vqrshrn.s32 d26, q13, #13\n\t" |
| // inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16( |
| // inst->synthesisBuffer[i], tmp16b); // Q0 |
| "vqadd.s16 d28, d26\n\t" |
| "vst1.16 d28, [%[ptr_syn]]!\n\t" |
| |
| // Specify constraints. |
| :[ptr_real]"+r"(ptr_real), |
| [ptr_window]"+r"(ptr_window), |
| [ptr_syn]"+r"(ptr_syn) |
| : |
| :"d22", "d23", "d24", "d25", "d26", "d27", "d28", "q11", "q12", "q13" |
| ); |
| } |
| |
| int16_t* ptr_out = &out_frame[0]; |
| ptr_syn = &inst->synthesisBuffer[0]; |
| // read out fully processed segment |
| for (; ptr_syn < &inst->synthesisBuffer[inst->blockLen10ms];) { |
| // Loop unrolled once. Both pointers are incremented in the assembly code. |
| __asm__ __volatile__( |
| // out_frame[i] = inst->synthesisBuffer[i]; // Q0 |
| "vld1.16 {d22, d23}, [%[ptr_syn]]!\n\t" |
| "vld1.16 {d24, d25}, [%[ptr_syn]]!\n\t" |
| "vst1.16 {d22, d23}, [%[ptr_out]]!\n\t" |
| "vst1.16 {d24, d25}, [%[ptr_out]]!\n\t" |
| :[ptr_syn]"+r"(ptr_syn), |
| [ptr_out]"+r"(ptr_out) |
| : |
| :"d22", "d23", "d24", "d25" |
| ); |
| } |
| |
| // Update synthesis buffer. |
| // C code: |
| // WEBRTC_SPL_MEMCPY_W16(inst->synthesisBuffer, |
| // inst->synthesisBuffer + inst->blockLen10ms, |
| // inst->anaLen - inst->blockLen10ms); |
| ptr_out = &inst->synthesisBuffer[0], |
| ptr_syn = &inst->synthesisBuffer[inst->blockLen10ms]; |
| for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen];) { |
| // Loop unrolled once. Both pointers are incremented in the assembly code. |
| __asm__ __volatile__( |
| "vld1.16 {d22, d23}, [%[ptr_syn]]!\n\t" |
| "vld1.16 {d24, d25}, [%[ptr_syn]]!\n\t" |
| "vst1.16 {d22, d23}, [%[ptr_out]]!\n\t" |
| "vst1.16 {d24, d25}, [%[ptr_out]]!\n\t" |
| :[ptr_syn]"+r"(ptr_syn), |
| [ptr_out]"+r"(ptr_out) |
| : |
| :"d22", "d23", "d24", "d25" |
| ); |
| } |
| |
| // C code: |
| // WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer |
| // + inst->anaLen - inst->blockLen10ms, inst->blockLen10ms); |
| __asm__ __volatile__("vdup.16 q10, %0" : : "r"(0) : "q10"); |
| for (; ptr_out < &inst->synthesisBuffer[inst->anaLen];) { |
| // Loop unrolled once. Pointer is incremented in the assembly code. |
| __asm__ __volatile__( |
| "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t" |
| "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t" |
| :[ptr_out]"+r"(ptr_out) |
| : |
| :"d20", "d21" |
| ); |
| } |
| } |
| |
| // Update analysis buffer for lower band, and window data before FFT. |
| static void AnalysisUpdateNeon(NsxInst_t* inst, |
| int16_t* out, |
| int16_t* new_speech) { |
| |
| int16_t* ptr_ana = &inst->analysisBuffer[inst->blockLen10ms]; |
| int16_t* ptr_out = &inst->analysisBuffer[0]; |
| |
| // For lower band update analysis buffer. |
| // WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer, |
| // inst->analysisBuffer + inst->blockLen10ms, |
| // inst->anaLen - inst->blockLen10ms); |
| for (; ptr_out < &inst->analysisBuffer[inst->anaLen - inst->blockLen10ms];) { |
| // Loop unrolled once, so both pointers are incremented by 8 twice. |
| __asm__ __volatile__( |
| "vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t" |
| "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t" |
| "vld1.16 {d22, d23}, [%[ptr_ana]]!\n\t" |
| "vst1.16 {d22, d23}, [%[ptr_out]]!\n\t" |
| :[ptr_ana]"+r"(ptr_ana), |
| [ptr_out]"+r"(ptr_out) |
| : |
| :"d20", "d21", "d22", "d23" |
| ); |
| } |
| |
| // WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer |
| // + inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms); |
| for (ptr_ana = new_speech; ptr_out < &inst->analysisBuffer[inst->anaLen];) { |
| // Loop unrolled once, so both pointers are incremented by 8 twice. |
| __asm__ __volatile__( |
| "vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t" |
| "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t" |
| "vld1.16 {d22, d23}, [%[ptr_ana]]!\n\t" |
| "vst1.16 {d22, d23}, [%[ptr_out]]!\n\t" |
| :[ptr_ana]"+r"(ptr_ana), |
| [ptr_out]"+r"(ptr_out) |
| : |
| :"d20", "d21", "d22", "d23" |
| ); |
| } |
| |
| // Window data before FFT |
| int16_t* ptr_window = &inst->window[0]; |
| ptr_out = &out[0]; |
| ptr_ana = &inst->analysisBuffer[0]; |
| for (; ptr_out < &out[inst->anaLen];) { |
| |
| // Loop unrolled once, so all pointers are incremented by 4 twice. |
| __asm__ __volatile__( |
| "vld1.16 d20, [%[ptr_ana]]!\n\t" |
| "vld1.16 d21, [%[ptr_window]]!\n\t" |
| // out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( |
| // inst->window[i], inst->analysisBuffer[i], 14); // Q0 |
| "vmull.s16 q10, d20, d21\n\t" |
| "vrshrn.i32 d20, q10, #14\n\t" |
| "vst1.16 d20, [%[ptr_out]]!\n\t" |
| |
| "vld1.16 d22, [%[ptr_ana]]!\n\t" |
| "vld1.16 d23, [%[ptr_window]]!\n\t" |
| // out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( |
| // inst->window[i], inst->analysisBuffer[i], 14); // Q0 |
| "vmull.s16 q11, d22, d23\n\t" |
| "vrshrn.i32 d22, q11, #14\n\t" |
| "vst1.16 d22, [%[ptr_out]]!\n\t" |
| |
| // Specify constraints. |
| :[ptr_ana]"+r"(ptr_ana), |
| [ptr_window]"+r"(ptr_window), |
| [ptr_out]"+r"(ptr_out) |
| : |
| :"d20", "d21", "d22", "d23", "q10", "q11" |
| ); |
| } |
| } |
| |
| // Create a complex number buffer (out[]) as the intput (in[]) interleaved with |
| // zeros, and normalize it. |
| static __inline void CreateComplexBufferNeon(NsxInst_t* inst, |
| int16_t* in, |
| int16_t* out) { |
| int16_t* ptr_out = &out[0]; |
| int16_t* ptr_in = &in[0]; |
| |
| __asm__ __volatile__("vdup.16 d25, %0" : : "r"(0) : "d25"); |
| __asm__ __volatile__("vdup.16 q10, %0" : : "r"(inst->normData) : "q10"); |
| for (; ptr_in < &in[inst->anaLen];) { |
| |
| // Loop unrolled once, so ptr_in is incremented by 8 twice, |
| // and ptr_out is incremented by 8 four times. |
| __asm__ __volatile__( |
| // out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData) |
| "vld1.16 {d22, d23}, [%[ptr_in]]!\n\t" |
| "vshl.s16 q11, q10\n\t" |
| "vmov d24, d23\n\t" |
| |
| // out[j + 1] = 0; // Insert zeros in imaginary part |
| "vmov d23, d25\n\t" |
| "vst2.16 {d22, d23}, [%[ptr_out]]!\n\t" |
| "vst2.16 {d24, d25}, [%[ptr_out]]!\n\t" |
| |
| // out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData) |
| "vld1.16 {d22, d23}, [%[ptr_in]]!\n\t" |
| "vshl.s16 q11, q10\n\t" |
| "vmov d24, d23\n\t" |
| |
| // out[j + 1] = 0; // Insert zeros in imaginary part |
| "vmov d23, d25\n\t" |
| "vst2.16 {d22, d23}, [%[ptr_out]]!\n\t" |
| "vst2.16 {d24, d25}, [%[ptr_out]]!\n\t" |
| |
| // Specify constraints. |
| :[ptr_in]"+r"(ptr_in), |
| [ptr_out]"+r"(ptr_out) |
| : |
| :"d22", "d23", "d24", "d25", "q10", "q11" |
| ); |
| } |
| } |
| |
| void WebRtcNsx_InitNeon(void) { |
| WebRtcNsx_NoiseEstimation = NoiseEstimationNeon; |
| WebRtcNsx_PrepareSpectrum = PrepareSpectrumNeon; |
| WebRtcNsx_SynthesisUpdate = SynthesisUpdateNeon; |
| WebRtcNsx_AnalysisUpdate = AnalysisUpdateNeon; |
| WebRtcNsx_Denormalize = DenormalizeNeon; |
| WebRtcNsx_CreateComplexBuffer = CreateComplexBufferNeon; |
| } |