| @ |
| @ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. |
| @ |
| @ Use of this source code is governed by a BSD-style license |
| @ that can be found in the LICENSE file in the root of the source |
| @ tree. An additional intellectual property rights grant can be found |
| @ in the file PATENTS. All contributing project authors may |
| @ be found in the AUTHORS file in the root of the source tree. |
| @ |
| |
| @ cross_correlation_neon.s |
| @ This file contains the function WebRtcSpl_CrossCorrelation(), |
| @ optimized for ARM Neon platform. |
| @ |
| @ Reference Ccode at end of this file. |
| @ Output is bit-exact with the reference C code, but not with the generic |
| @ C code in file cross_correlation.c, due to reduction of shift operations |
| @ from using Neon registers. |
| |
| @ Register usage: |
| @ |
| @ r0: *cross_correlation (function argument) |
| @ r1: *seq1 (function argument) |
| @ r2: *seq2 (function argument) |
| @ r3: dim_seq (function argument); then, total iteration of LOOP_DIM_SEQ |
| @ r4: counter for LOOP_DIM_CROSS_CORRELATION |
| @ r5: seq2_ptr |
| @ r6: seq1_ptr |
| @ r7: Total iteration of LOOP_DIM_SEQ_RESIDUAL |
| @ r8, r9, r10, r11, r12: scratch |
| |
| .arch armv7-a |
| .fpu neon |
| |
| .align 2 |
| .global WebRtcSpl_CrossCorrelation |
| |
| WebRtcSpl_CrossCorrelation: |
| |
| .fnstart |
| |
| .save {r4-r11} |
| push {r4-r11} |
| |
| @ Put the shift value (-right_shifts) into a Neon register. |
| ldrsh r10, [sp, #36] |
| rsb r10, r10, #0 |
| mov r8, r10, asr #31 |
| vmov.32 d16, r10, r8 |
| |
| @ Initialize loop counters. |
| and r7, r3, #7 @ inner_loop_len2 = dim_seq % 8; |
| asr r3, r3, #3 @ inner_loop_len1 = dim_seq / 8; |
| ldrsh r4, [sp, #32] @ dim_cross_correlation |
| |
| LOOP_DIM_CROSS_CORRELATION: |
| vmov.i32 q9, #0 |
| vmov.i32 q14, #0 |
| movs r8, r3 @ inner_loop_len1 |
| mov r6, r1 @ seq1_ptr |
| mov r5, r2 @ seq2_ptr |
| ble POST_LOOP_DIM_SEQ |
| |
| LOOP_DIM_SEQ: |
| vld1.16 {d20, d21}, [r6]! @ seq1_ptr |
| vld1.16 {d22, d23}, [r5]! @ seq2_ptr |
| subs r8, r8, #1 |
| vmull.s16 q12, d20, d22 |
| vmull.s16 q13, d21, d23 |
| vpadal.s32 q9, q12 |
| vpadal.s32 q14, q13 |
| bgt LOOP_DIM_SEQ |
| |
| POST_LOOP_DIM_SEQ: |
| movs r10, r7 @ Loop counter |
| mov r12, #0 |
| mov r8, #0 |
| ble POST_LOOP_DIM_SEQ_RESIDUAL |
| |
| LOOP_DIM_SEQ_RESIDUAL: |
| ldrh r11, [r6], #2 |
| ldrh r9, [r5], #2 |
| smulbb r11, r11, r9 |
| adds r8, r8, r11 |
| adc r12, r12, r11, asr #31 |
| subs r10, #1 |
| bgt LOOP_DIM_SEQ_RESIDUAL |
| |
| POST_LOOP_DIM_SEQ_RESIDUAL: @ Sum the results up and do the shift. |
| vadd.i64 d18, d19 |
| vadd.i64 d28, d29 |
| vadd.i64 d18, d28 |
| vmov.32 d17[0], r8 |
| vmov.32 d17[1], r12 |
| vadd.i64 d17, d18 |
| vshl.s64 d17, d16 |
| vst1.32 d17[0], [r0]! @ Store the output |
| |
| ldr r8, [sp, #40] @ step_seq2 |
| add r2, r8, lsl #1 @ prepare for seq2_ptr(r5) in the next loop. |
| |
| subs r4, #1 |
| bgt LOOP_DIM_CROSS_CORRELATION |
| |
| pop {r4-r11} |
| bx lr |
| |
| .fnend |
| |
| |
| @ TODO(kma): Place this piece of reference code into a C code file. |
| @ void WebRtcSpl_CrossCorrelation(WebRtc_Word32* cross_correlation, |
| @ WebRtc_Word16* seq1, |
| @ WebRtc_Word16* seq2, |
| @ WebRtc_Word16 dim_seq, |
| @ WebRtc_Word16 dim_cross_correlation, |
| @ WebRtc_Word16 right_shifts, |
| @ WebRtc_Word16 step_seq2) { |
| @ int i = 0; |
| @ int j = 0; |
| @ int inner_loop_len1 = dim_seq >> 3; |
| @ int inner_loop_len2 = dim_seq - (inner_loop_len1 << 3); |
| @ |
| @ assert(dim_cross_correlation > 0); |
| @ assert(dim_seq > 0); |
| @ |
| @ for (i = 0; i < dim_cross_correlation; i++) { |
| @ int16_t *seq1_ptr = seq1; |
| @ int16_t *seq2_ptr = seq2 + (step_seq2 * i); |
| @ int64_t sum = 0; |
| @ |
| @ for (j = inner_loop_len1; j > 0; j -= 1) { |
| @ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); |
| @ seq1_ptr++; |
| @ seq2_ptr++; |
| @ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); |
| @ seq1_ptr++; |
| @ seq2_ptr++; |
| @ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); |
| @ seq1_ptr++; |
| @ seq2_ptr++; |
| @ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); |
| @ seq1_ptr++; |
| @ seq2_ptr++; |
| @ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); |
| @ seq1_ptr++; |
| @ seq2_ptr++; |
| @ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); |
| @ seq1_ptr++; |
| @ seq2_ptr++; |
| @ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); |
| @ seq1_ptr++; |
| @ seq2_ptr++; |
| @ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); |
| @ seq1_ptr++; |
| @ seq2_ptr++; |
| @ } |
| @ |
| @ // Calculate the rest of the samples. |
| @ for (j = inner_loop_len2; j > 0; j -= 1) { |
| @ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); |
| @ seq1_ptr++; |
| @ seq2_ptr++; |
| @ } |
| @ |
| @ *cross_correlation++ = (int32_t)(sum >> right_shifts); |
| @ } |
| @ } |