 @ src/common_audio/signal_processing/cross_correlation_neon.s - vendor/opensource/webrtc - Git at Google

 @
 @ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
 @
 @ Use of this source code is governed by a BSD-style license
 @ that can be found in the LICENSE file in the root of the source
 @ tree. An additional intellectual property rights grant can be found
 @ in the file PATENTS.  All contributing project authors may
 @ be found in the AUTHORS file in the root of the source tree.
 @

 @ cross_correlation_neon.s
 @ This file contains the function WebRtcSpl_CrossCorrelation(),
 @ optimized for ARM Neon platform.
 @
 @ Reference C code at end of this file.
 @ Output is bit-exact with the reference C code, but not with the generic
 @ C code in file cross_correlation.c, due to reduction of shift operations
 @ from using Neon registers.

 @ Register usage:
 @
 @ r0: *cross_correlation (function argument)
 @ r1: *seq1 (function argument)
 @ r2: *seq2 (function argument)
 @ r3: dim_seq (function argument); then, total iteration of LOOP_DIM_SEQ
 @ r4: counter for LOOP_DIM_CROSS_CORRELATION
 @ r5: seq2_ptr
 @ r6: seq1_ptr
 @ r7: Total iteration of LOOP_DIM_SEQ_RESIDUAL
 @ r8, r9, r10, r11, r12: scratch

 .arch armv7-a
 .fpu neon

 .align  2
 .global WebRtcSpl_CrossCorrelation

 WebRtcSpl_CrossCorrelation:
 @ Stores dim_cross_correlation 32-bit results through r0. For each result
 @ the 16x16-bit signed products of seq1[k] and seq2[k] (k < dim_seq) are
 @ summed in 64 bits, the sum is shifted by -right_shifts with vshl.s64
 @ (i.e. a right shift for positive right_shifts) and its low word is stored.
 @ seq2 is then advanced by step_seq2 halfwords for the next result.
 @
 @ In:    r0 = cross_correlation, r1 = seq1, r2 = seq2, r3 = dim_seq
 @        stack on entry: [sp] = dim_cross_correlation,
 @                        [sp, #4] = right_shifts, [sp, #8] = step_seq2
 @ Out:   results written through r0; nothing is returned in a register.
 @ Clobb: r0, r2, r3, r12, d16-d29, flags. r4-r11 are saved and restored.

 .fnstart

 .save {r4-r11}
   push {r4-r11}               @ 32 bytes pushed: stack args are now at sp+32

   @ Return early when dim_cross_correlation <= 0. The outer loop tests its
   @ counter only after a full pass, so without this check one result would
   @ be computed and stored even though the caller asked for none.
   ldrsh r4, [sp, #32]         @ dim_cross_correlation
   cmp r4, #0
   ble CROSS_CORRELATION_DONE

   @ Put the shift value (-right_shifts) into a Neon register.
   ldrsh r10, [sp, #36]
   rsb r10, r10, #0
   mov r8, r10, asr #31        @ r8 = sign word, making r8:r10 a 64-bit value
   vmov.32 d16, r10, r8

   @ Initialize loop counters.
   and r7, r3, #7              @ inner_loop_len2 = dim_seq % 8;
   asr r3, r3, #3              @ inner_loop_len1 = dim_seq / 8;

 LOOP_DIM_CROSS_CORRELATION:
   vmov.i32 q9, #0             @ 64-bit partial sums for elements 0-3
   vmov.i32 q14, #0            @ 64-bit partial sums for elements 4-7
   movs r8, r3                 @ inner_loop_len1 (sets flags for the ble below)
   mov r6, r1                  @ seq1_ptr
   mov r5, r2                  @ seq2_ptr
   ble POST_LOOP_DIM_SEQ       @ fewer than 8 elements: residual loop only

 LOOP_DIM_SEQ:                 @ 8 elements of each sequence per iteration
   vld1.16 {d20, d21}, [r6]!   @ seq1_ptr
   vld1.16 {d22, d23}, [r5]!   @ seq2_ptr
   subs r8, r8, #1             @ flags survive the Neon ops until the bgt
   vmull.s16 q12, d20, d22     @ four 32-bit products
   vmull.s16 q13, d21, d23     @ four 32-bit products
   vpadal.s32 q9, q12          @ add product pairs into the 64-bit lanes
   vpadal.s32 q14, q13
   bgt LOOP_DIM_SEQ

 POST_LOOP_DIM_SEQ:
   movs r10, r7                @ Loop counter
   mov r12, #0                 @ r12:r8 = 64-bit sum of the residual products
   mov r8, #0
   ble POST_LOOP_DIM_SEQ_RESIDUAL

 LOOP_DIM_SEQ_RESIDUAL:        @ one element per iteration
   ldrh r11, [r6], #2
   ldrh r9, [r5], #2
   smulbb r11, r11, r9         @ signed multiply of the two bottom halfwords
   adds r8, r8, r11            @ low word; carry feeds the adc
   adc r12, r12, r11, asr #31  @ high word += sign extension of product + carry
   subs r10, #1
   bgt LOOP_DIM_SEQ_RESIDUAL

 POST_LOOP_DIM_SEQ_RESIDUAL:   @ Sum the results up and do the shift.
   vadd.i64 d18, d19
   vadd.i64 d28, d29
   vadd.i64 d18, d28           @ d18 = total of the vector partial sums
   vmov.32 d17[0], r8          @ d17 = residual sum (low word)
   vmov.32 d17[1], r12         @                    (high word)
   vadd.i64 d17, d18
   vshl.s64 d17, d16           @ shift count in d16 is -right_shifts
   vst1.32 d17[0], [r0]!       @ Store the output

   ldr r8, [sp, #40]           @ step_seq2
   add r2, r8, lsl #1          @ prepare for seq2_ptr(r5) in the next loop.

   subs r4, #1
   bgt LOOP_DIM_CROSS_CORRELATION

 CROSS_CORRELATION_DONE:
   pop {r4-r11}
   bx  lr

 .fnend


 @ TODO(kma): Place this piece of reference code into a C code file.
 @ void WebRtcSpl_CrossCorrelation(WebRtc_Word32* cross_correlation,
 @                                 WebRtc_Word16* seq1,
 @                                 WebRtc_Word16* seq2,
 @                                 WebRtc_Word16 dim_seq,
 @                                 WebRtc_Word16 dim_cross_correlation,
 @                                 WebRtc_Word16 right_shifts,
 @                                 WebRtc_Word16 step_seq2) {
 @   int i = 0;
 @   int j = 0;
 @   int inner_loop_len1 = dim_seq >> 3;
 @   int inner_loop_len2 = dim_seq - (inner_loop_len1 << 3);
 @
 @   assert(dim_cross_correlation > 0);
 @   assert(dim_seq > 0);
 @
 @   for (i = 0; i < dim_cross_correlation; i++) {
 @     int16_t *seq1_ptr = seq1;
 @     int16_t *seq2_ptr = seq2 + (step_seq2 * i);
 @     int64_t sum = 0;
 @
 @     for (j = inner_loop_len1; j > 0; j -= 1) {
 @       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
 @       seq1_ptr++;
 @       seq2_ptr++;
 @       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
 @       seq1_ptr++;
 @       seq2_ptr++;
 @       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
 @       seq1_ptr++;
 @       seq2_ptr++;
 @       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
 @       seq1_ptr++;
 @       seq2_ptr++;
 @       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
 @       seq1_ptr++;
 @       seq2_ptr++;
 @       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
 @       seq1_ptr++;
 @       seq2_ptr++;
 @       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
 @       seq1_ptr++;
 @       seq2_ptr++;
 @       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
 @       seq1_ptr++;
 @       seq2_ptr++;
 @     }
 @
 @     // Calculate the rest of the samples.
 @     for (j = inner_loop_len2; j > 0; j -= 1) {
 @       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
 @       seq1_ptr++;
 @       seq2_ptr++;
 @     }
 @
 @     *cross_correlation++ = (int32_t)(sum >> right_shifts);
 @   }
 @ }
	@
	@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
	@
	@ Use of this source code is governed by a BSD-style license
	@ that can be found in the LICENSE file in the root of the source
	@ tree. An additional intellectual property rights grant can be found
	@ in the file PATENTS. All contributing project authors may
	@ be found in the AUTHORS file in the root of the source tree.
	@

	@ cross_correlation_neon.s
	@ This file contains the function WebRtcSpl_CrossCorrelation(),
	@ optimized for ARM Neon platform.
	@
	@ Reference C code at end of this file.
	@ Output is bit-exact with the reference C code, but not with the generic
	@ C code in file cross_correlation.c, due to reduction of shift operations
	@ from using Neon registers.

	@ Register usage:
	@
	@ r0: *cross_correlation (function argument)
	@ r1: *seq1 (function argument)
	@ r2: *seq2 (function argument)
	@ r3: dim_seq (function argument); then, total iteration of LOOP_DIM_SEQ
	@ r4: counter for LOOP_DIM_CROSS_CORRELATION
	@ r5: seq2_ptr
	@ r6: seq1_ptr
	@ r7: Total iteration of LOOP_DIM_SEQ_RESIDUAL
	@ r8, r9, r10, r11, r12: scratch

	.arch armv7-a
	.fpu neon

	.align 2
	.global WebRtcSpl_CrossCorrelation

	WebRtcSpl_CrossCorrelation:
	@ Stores dim_cross_correlation 32-bit results through r0. For each result
	@ the 16x16-bit signed products of seq1[k] and seq2[k] (k < dim_seq) are
	@ summed in 64 bits, the sum is shifted by -right_shifts with vshl.s64
	@ (i.e. a right shift for positive right_shifts) and its low word is stored.
	@ seq2 is then advanced by step_seq2 halfwords for the next result.
	@
	@ In:    r0 = cross_correlation, r1 = seq1, r2 = seq2, r3 = dim_seq
	@        stack on entry: [sp] = dim_cross_correlation,
	@                        [sp, #4] = right_shifts, [sp, #8] = step_seq2
	@ Out:   results written through r0; nothing is returned in a register.
	@ Clobb: r0, r2, r3, r12, d16-d29, flags. r4-r11 are saved and restored.

	.fnstart

	.save {r4-r11}
	  push {r4-r11}               @ 32 bytes pushed: stack args are now at sp+32

	  @ Return early when dim_cross_correlation <= 0. The outer loop tests its
	  @ counter only after a full pass, so without this check one result would
	  @ be computed and stored even though the caller asked for none.
	  ldrsh r4, [sp, #32]         @ dim_cross_correlation
	  cmp r4, #0
	  ble CROSS_CORRELATION_DONE

	  @ Put the shift value (-right_shifts) into a Neon register.
	  ldrsh r10, [sp, #36]
	  rsb r10, r10, #0
	  mov r8, r10, asr #31        @ r8 = sign word, making r8:r10 a 64-bit value
	  vmov.32 d16, r10, r8

	  @ Initialize loop counters.
	  and r7, r3, #7              @ inner_loop_len2 = dim_seq % 8;
	  asr r3, r3, #3              @ inner_loop_len1 = dim_seq / 8;

	LOOP_DIM_CROSS_CORRELATION:
	  vmov.i32 q9, #0             @ 64-bit partial sums for elements 0-3
	  vmov.i32 q14, #0            @ 64-bit partial sums for elements 4-7
	  movs r8, r3                 @ inner_loop_len1 (sets flags for the ble below)
	  mov r6, r1                  @ seq1_ptr
	  mov r5, r2                  @ seq2_ptr
	  ble POST_LOOP_DIM_SEQ       @ fewer than 8 elements: residual loop only

	LOOP_DIM_SEQ:                 @ 8 elements of each sequence per iteration
	  vld1.16 {d20, d21}, [r6]!   @ seq1_ptr
	  vld1.16 {d22, d23}, [r5]!   @ seq2_ptr
	  subs r8, r8, #1             @ flags survive the Neon ops until the bgt
	  vmull.s16 q12, d20, d22     @ four 32-bit products
	  vmull.s16 q13, d21, d23     @ four 32-bit products
	  vpadal.s32 q9, q12          @ add product pairs into the 64-bit lanes
	  vpadal.s32 q14, q13
	  bgt LOOP_DIM_SEQ

	POST_LOOP_DIM_SEQ:
	  movs r10, r7                @ Loop counter
	  mov r12, #0                 @ r12:r8 = 64-bit sum of the residual products
	  mov r8, #0
	  ble POST_LOOP_DIM_SEQ_RESIDUAL

	LOOP_DIM_SEQ_RESIDUAL:        @ one element per iteration
	  ldrh r11, [r6], #2
	  ldrh r9, [r5], #2
	  smulbb r11, r11, r9         @ signed multiply of the two bottom halfwords
	  adds r8, r8, r11            @ low word; carry feeds the adc
	  adc r12, r12, r11, asr #31  @ high word += sign extension of product + carry
	  subs r10, #1
	  bgt LOOP_DIM_SEQ_RESIDUAL

	POST_LOOP_DIM_SEQ_RESIDUAL:   @ Sum the results up and do the shift.
	  vadd.i64 d18, d19
	  vadd.i64 d28, d29
	  vadd.i64 d18, d28           @ d18 = total of the vector partial sums
	  vmov.32 d17[0], r8          @ d17 = residual sum (low word)
	  vmov.32 d17[1], r12         @                    (high word)
	  vadd.i64 d17, d18
	  vshl.s64 d17, d16           @ shift count in d16 is -right_shifts
	  vst1.32 d17[0], [r0]!       @ Store the output

	  ldr r8, [sp, #40]           @ step_seq2
	  add r2, r8, lsl #1          @ prepare for seq2_ptr(r5) in the next loop.

	  subs r4, #1
	  bgt LOOP_DIM_CROSS_CORRELATION

	CROSS_CORRELATION_DONE:
	  pop {r4-r11}
	  bx lr

	.fnend


	@ TODO(kma): Place this piece of reference code into a C code file.
	@ void WebRtcSpl_CrossCorrelation(WebRtc_Word32* cross_correlation,
	@                                 WebRtc_Word16* seq1,
	@                                 WebRtc_Word16* seq2,
	@                                 WebRtc_Word16 dim_seq,
	@                                 WebRtc_Word16 dim_cross_correlation,
	@                                 WebRtc_Word16 right_shifts,
	@                                 WebRtc_Word16 step_seq2) {
	@   int i = 0;
	@   int j = 0;
	@   int inner_loop_len1 = dim_seq >> 3;
	@   int inner_loop_len2 = dim_seq - (inner_loop_len1 << 3);
	@
	@   assert(dim_cross_correlation > 0);
	@   assert(dim_seq > 0);
	@
	@   for (i = 0; i < dim_cross_correlation; i++) {
	@     int16_t *seq1_ptr = seq1;
	@     int16_t *seq2_ptr = seq2 + (step_seq2 * i);
	@     int64_t sum = 0;
	@
	@     for (j = inner_loop_len1; j > 0; j -= 1) {
	@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
	@       seq1_ptr++;
	@       seq2_ptr++;
	@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
	@       seq1_ptr++;
	@       seq2_ptr++;
	@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
	@       seq1_ptr++;
	@       seq2_ptr++;
	@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
	@       seq1_ptr++;
	@       seq2_ptr++;
	@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
	@       seq1_ptr++;
	@       seq2_ptr++;
	@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
	@       seq1_ptr++;
	@       seq2_ptr++;
	@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
	@       seq1_ptr++;
	@       seq2_ptr++;
	@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
	@       seq1_ptr++;
	@       seq2_ptr++;
	@     }
	@
	@     // Calculate the rest of the samples.
	@     for (j = inner_loop_len2; j > 0; j -= 1) {
	@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
	@       seq1_ptr++;
	@       seq2_ptr++;
	@     }
	@
	@     *cross_correlation++ = (int32_t)(sum >> right_shifts);
	@   }
	@ }