@ blob: a59b6e37f21e7040e62a63f1a44b045a843498ed [file] [log] [blame]
@
@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS. All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@
@ lattice_neon.s
@
@ Contains a function for the core loop in the normalized lattice MA
@ filter routine for iSAC codec, optimized for ARM Neon platform.
@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,
@ int16_t input1,
@ int32_t input2,
@ int32_t* ptr0,
@ int32_t* ptr1,
@ int32_t* __restrict ptr2);
@ It calculates
@ *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
@ *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
@ in Q15 domain.
@
@ Reference code in lattice.c.
@ Output is not bit-exact with the reference C code, because
@ WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 are replaced with
@ Neon instructions, smulwb, and smull. Testing with speech and tone vectors
@ showed no degradation in speech quality.
.arch armv7-a
.fpu neon
#include "settings.h"
.global WebRtcIsacfix_FilterMaLoopNeon
.align 2

@ -----------------------------------------------------------------------
@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,
@                                     int16_t input1,
@                                     int32_t input2,
@                                     int32_t* ptr0,
@                                     int32_t* ptr1,
@                                     int32_t* __restrict ptr2);
@
@ Processes (HALF_SUBFRAMELEN - 1) consecutive samples. For each sample:
@   tmp   = *ptr2 + ((input0 * (*ptr0)) >> 15)
@   *ptr2 = (input2 * tmp) >> 16
@   *ptr1 = ((input1 * (*ptr0)) >> 15) + ((input0 * (*ptr2)) >> 15)
@ where the last line uses the value of *ptr2 that was just written.
@ The Neon paths use rounding narrowing shifts (vrshrn); the scalar path for
@ a single trailing sample truncates (smulwb/smull), so the two paths do not
@ round identically.
@
@ In:    r0 = input0, r1 = input1, r2 = input2, r3 = ptr0,
@        [sp] = ptr1, [sp, #4] = ptr2 (offsets on entry, before the push)
@ Out:   nothing in registers; results are stored through ptr1 and ptr2.
@ Saved: r4-r8 (pushed on entry, popped on exit).
@ Clobb: r3, r12, d0-d7, d16-d30, flags.
@ Assumes HALF_SUBFRAMELEN (from settings.h) is an assemble-time constant
@ that is at least 1 and fits a mov immediate.
@ -----------------------------------------------------------------------
WebRtcIsacfix_FilterMaLoopNeon:
.fnstart
.save {r4-r8}
  push        {r4-r8}             @ 5 words pushed: stack args are at sp + 20

  vdup.32     d28, r0             @ d28 = {input0, input0}
  vdup.32     d29, r1             @ d29 = {input1, input1}
  vdup.32     d30, r2             @ d30 = {input2, input2}
  ldr         r4, [sp, #20]       @ r4  = ptr1
  ldr         r12, [sp, #24]      @ r12 = ptr2

  @ r5 = number of 4-sample loop iterations = (HALF_SUBFRAMELEN - 1) >> 2
  @ r6 = leftover samples after the loop    = (HALF_SUBFRAMELEN - 1) - (r5 << 2)
  @ r6 is in 0..3 and stays intact until the last-sample test below.
  mov         r6, #HALF_SUBFRAMELEN
  sub         r6, r6, #1
  lsr         r5, r6, #2
  sub         r6, r6, r5, lsl #2

  @ The loop tests its counter at the bottom, so it must not be entered with
  @ r5 == 0; otherwise it would process four samples that do not exist.
  cmp         r5, #0
  beq         .Lfilter_ma_tail

  @ Main loop: four samples per iteration, r5 iterations.
.Lfilter_ma_loop:
  vld1.32     {d0, d1}, [r3]!     @ q0 = ptr0[0..3], ptr0 += 4

  vmull.s32   q10, d0, d28        @ input0 * ptr0[0..1], 64-bit products
  vmull.s32   q11, d1, d28        @ input0 * ptr0[2..3]
  vmull.s32   q12, d0, d29        @ input1 * ptr0[0..1]
  vmull.s32   q13, d1, d29        @ input1 * ptr0[2..3]

  vrshrn.i64  d4, q10, #15        @ q2 = tmp32a = (input0 * ptr0) >> 15, rounded
  vrshrn.i64  d5, q11, #15

  vld1.32     {d2, d3}, [r12]     @ q1 = ptr2[0..3] (no writeback; stored below)
  vadd.i32    q3, q2, q1          @ q3 = tmp32b = *ptr2 + tmp32a

  vrshrn.i64  d0, q12, #15        @ d0 = (input1 * ptr0[0..1]) >> 15, rounded

  vmull.s32   q10, d6, d30        @ input2 * tmp32b[0..1]
  vmull.s32   q11, d7, d30        @ input2 * tmp32b[2..3]

  vrshrn.i64  d16, q10, #16       @ q8 = new *ptr2 = (input2 * tmp32b) >> 16
  vrshrn.i64  d17, q11, #16

  vmull.s32   q10, d16, d28       @ input0 * (new *ptr2)[0..1]
  vmull.s32   q11, d17, d28       @ input0 * (new *ptr2)[2..3]

  vrshrn.i64  d1, q13, #15        @ d1 = (input1 * ptr0[2..3]) >> 15, rounded
  vrshrn.i64  d18, q10, #15       @ q9 = (input0 * new *ptr2) >> 15, rounded
  vrshrn.i64  d19, q11, #15

  vst1.32     {d16, d17}, [r12]!  @ store new ptr2[0..3], ptr2 += 4

  vadd.i32    q9, q0, q9          @ q9 = input1 term + input0 term
  subs        r5, r5, #1          @ flags consumed by bgt; vst1 leaves them alone
  vst1.32     {d18, d19}, [r4]!   @ store ptr1[0..3], ptr1 += 4

  bgt         .Lfilter_ma_loop

  @ Leftover handling. r6 (0..3) is only tested here, never modified, so the
  @ pair test and the single-sample test both see the true leftover count.
.Lfilter_ma_tail:
  cmp         r6, #2
  blt         .Lfilter_ma_last    @ fewer than two left: skip the pair

  @ Process two more samples with 64-bit wide Neon operations.
  vld1.32     d0, [r3]!           @ d0 = ptr0[0..1], ptr0 += 2

  vmull.s32   q11, d0, d28        @ input0 * ptr0
  vmull.s32   q13, d0, d29        @ input1 * ptr0

  vld1.32     d18, [r12]          @ d18 = ptr2[0..1]
  vrshrn.i64  d4, q11, #15        @ d4 = tmp32a = (input0 * ptr0) >> 15, rounded

  vadd.i32    d7, d4, d18         @ d7 = tmp32b = *ptr2 + tmp32a
  vmull.s32   q11, d7, d30        @ input2 * tmp32b
  vrshrn.i64  d16, q11, #16       @ d16 = new *ptr2 = (input2 * tmp32b) >> 16

  vmull.s32   q11, d16, d28       @ input0 * (new *ptr2)
  vst1.32     d16, [r12]!         @ store new ptr2[0..1], ptr2 += 2

  vrshrn.i64  d0, q13, #15        @ d0  = (input1 * ptr0) >> 15, rounded
  vrshrn.i64  d19, q11, #15       @ d19 = (input0 * new *ptr2) >> 15, rounded
  vadd.i32    d19, d0, d19

  vst1.32     d19, [r4]!          @ store ptr1[0..1], ptr1 += 2

  @ An odd leftover count (1 or 3) means exactly one sample remains.
.Lfilter_ma_last:
  tst         r6, #1
  beq         .Lfilter_ma_end

  @ *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
  @ r5 and r6 are free to be reused as scratch from here on.
  ldr         r7, [r3]            @ r7 = *ptr0
  ldr         r8, [r12]           @ r8 = *ptr2

  smulwb      r5, r7, r0          @ r5 = (*ptr0 * input0) >> 16
  add         r8, r8, r5, lsl #1  @ tmp32b = *ptr2 + (r5 << 1), i.e. a >> 15 product
  smull       r5, r6, r8, r2      @ r6:r5 = tmp32b * input2, 64 bits
  lsl         r6, r6, #16
  add         r6, r6, r5, lsr #16 @ r6 = bits 47..16 of the product (>> 16)
  str         r6, [r12]           @ *ptr2 = r6

  @ *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
  smulwb      r5, r7, r1          @ r5 = (*ptr0 * input1) >> 16
  smulwb      r6, r6, r0          @ r6 = (new *ptr2 * input0) >> 16
  lsl         r5, r5, #1          @ both terms are doubled to get >> 15 scaling
  add         r5, r5, r6, lsl #1
  str         r5, [r4]            @ *ptr1 = r5

.Lfilter_ma_end:
  pop         {r4-r8}
  bx          lr
.fnend