@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS.  All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@

@ This file contains the function WebRtcSpl_FilterARFastQ12(), optimized for
@ the ARMv7 platform. The description header can be found in
@ signal_processing_library.h
@
@ Output is bit-exact with the generic C code as in filter_ar_fast_q12.c, and
@ the reference C code at end of this file.

@ Assumptions:
@ (1) data_length > 0
@ (2) coefficients_length > 1

@ Register usage:
@
@ r0:  &data_in[i]
@ r1:  &data_out[i], for result output
@ r2:  &coefficients[0]
@ r3:  coefficients_length
@ r4:  Iteration counter for the outer loop.
@ r5:  data_out[j] as multiplication inputs
@ r6:  Calculated value for output data_out[]; iteration counter for inner loop
@ r7:  Partial sum of a filtering multiplication results
@ r8:  Partial sum of a filtering multiplication results
@ r9:  &data_out[], for filtering input; data_in[i]
@ r10: coefficients[j]
@ r11: Scratch
@ r12: &coefficients[j]

.arch armv7-a

.align  2
.global WebRtcSpl_FilterARFastQ12
@ Mark the exported symbol as a function in the ELF symbol table so that
@ tools which key off the symbol type (e.g. linker ARM/Thumb interworking)
@ treat it as code.
.type   WebRtcSpl_FilterARFastQ12, %function

@ -----------------------------------------------------------------------------
@ WebRtcSpl_FilterARFastQ12
@
@ Q12 auto-regressive filter; see the reference C code at the end of this
@ file for the exact arithmetic.
@
@ In:    r0        = data_in
@        r1        = data_out (the coefficients_length - 1 samples stored
@                    before this address are read as filter history)
@        r2        = coefficients
@        r3        = coefficients_length  (must be > 1)
@        [sp, #0]  = data_length          (must be > 0), full 32-bit word
@ Out:   data_out[0 .. data_length - 1] written; no return value.
@ Saves: r4-r11 are pushed on entry and popped on exit; r12 is used as
@        scratch and not restored.
@
@ Structure: the main loop produces two output samples per pass (sum1/sum2);
@ a tail section produces the last sample when data_length is odd.
@ -----------------------------------------------------------------------------
WebRtcSpl_FilterARFastQ12:

.fnstart

.save {r4-r11}
  push {r4-r11}

  @ Eight registers (32 bytes) were just pushed, so the fifth argument is at
  @ sp + 32. Load the whole word: the reference C prototype declares
  @ data_length as int, and a halfword load would truncate lengths > 32767.
  ldr r12, [sp, #32]           @ data_length
  subs r4, r12, #1             @ r4 = data_length - 1 (outer loop counter)
  beq ODD_LENGTH               @ jump if data_length == 1

@ Main loop: computes data_out[i] and data_out[i + 1] per iteration.
LOOP_LENGTH:
  add r12, r2, r3, lsl #1
  sub r12, #4                  @ &coefficients[coefficients_length - 2]
  sub r9, r1, r3, lsl #1
  add r9, #2                   @ &data_out[i - coefficients_length + 1]
  ldr r5, [r9], #4             @ data_out[i - coefficients_length + {1,2}]

  mov r7, #0                   @ sum1
  mov r8, #0                   @ sum2
  subs r6, r3, #3              @ Iteration counter for inner loop.
  beq ODD_A_LENGTH             @ branch if coefficients_length == 3
  blt POST_LOOP_A_LENGTH       @ branch if coefficients_length == 2

@ Inner loop: consumes two coefficients (j, j - 1) per iteration for both
@ sum1 and sum2. Each 32-bit load fetches a pair of 16-bit values.
LOOP_A_LENGTH:
  ldr r10, [r12], #-4          @ coefficients[j - 1], coefficients[j]
  subs r6, #2
  smlatt r8, r10, r5, r8       @ sum2 += coefficients[j] * data_out[i - j + 1];
  smlatb r7, r10, r5, r7       @ sum1 += coefficients[j] * data_out[i - j];
  smlabt r7, r10, r5, r7       @ sum1 += coefficients[j - 1] * data_out[i - j + 1];
  ldr r5, [r9], #4             @ data_out[i - j + 2],  data_out[i - j + 3]
  smlabb r8, r10, r5, r8       @ sum2 += coefficients[j - 1] * data_out[i - j + 2];
  bgt LOOP_A_LENGTH
  blt POST_LOOP_A_LENGTH       @ counter < 0: no leftover coefficients[2] term

@ Counter == 0: one unpaired tap, coefficients[2], remains for both sums.
ODD_A_LENGTH:
  ldrsh r10, [r12, #2]         @ Filter coefficients coefficients[2]
  sub r12, #2                  @ &coefficients[0]
  smlabb r7, r10, r5, r7       @ sum1 += coefficients[2] * data_out[i - 2];
  smlabt r8, r10, r5, r8       @ sum2 += coefficients[2] * data_out[i - 1];
  ldr r5, [r9, #-2]            @ data_out[i - 1],  data_out[i]

POST_LOOP_A_LENGTH:
  ldr r10, [r12]               @ coefficients[0], coefficients[1]
  smlatb r7, r10, r5, r7       @ sum1 += coefficients[1] * data_out[i - 1];

  ldr r9, [r0], #4             @ data_in[i], data_in[i + 1]
  smulbb r6, r10, r9           @ output1 = coefficients[0] * data_in[i];
  sub r6, r7                   @ output1 -= sum1;

  @ Round and saturate to int16: the +2048 rounding term is added only when
  @ (output1 >> 12) already fits in 16 bits; otherwise the value is simply
  @ saturated.
  sbfx r11, r6, #12, #16       @ bits [27:12] of output1, sign-extended
  ssat r7, #16, r6, asr #12    @ (output1 >> 12) saturated to 16 bits
  cmp r7, r11
  addeq r6, r6, #2048          @ in range: apply rounding
  ssat r6, #16, r6, asr #12
  strh r6, [r1], #2            @ Store data_out[i]

  smlatb r8, r10, r6, r8       @ sum2 += coefficients[1] * data_out[i];
  smulbt r6, r10, r9           @ output2 = coefficients[0] * data_in[i + 1];
  sub r6, r8                   @ output2 -= sum2;

  @ Same round-and-saturate sequence as for output1.
  sbfx r11, r6, #12, #16
  ssat r7, #16, r6, asr #12
  cmp r7, r11
  addeq r6, r6, #2048
  ssat r6, #16, r6, asr #12
  strh r6, [r1], #2            @ Store data_out[i + 1]

  subs r4, #2
  bgt LOOP_LENGTH
  blt END                      @ For even data_length, it's done. Jump to END.

@ Process i = data_length - 1, for the case of an odd length.
@ Here r7 and r8 each accumulate part of the single sum1; both are
@ subtracted from output1 below.
ODD_LENGTH:
  add r12, r2, r3, lsl #1
  sub r12, #4                  @ &coefficients[coefficients_length - 2]
  sub r9, r1, r3, lsl #1
  add r9, #2                   @ &data_out[i - coefficients_length + 1]
  mov r7, #0                   @ sum1, first partial sum
  mov r8, #0                   @ sum1, second partial sum
  subs r6, r3, #2              @ inner loop counter
  beq EVEN_A_LENGTH            @ branch if coefficients_length == 2

LOOP2_A_LENGTH:
  ldr r10, [r12], #-4          @ coefficients[j - 1], coefficients[j]
  ldr r5, [r9], #4             @ data_out[i - j],  data_out[i - j + 1]
  subs r6, #2
  smlatb r7, r10, r5, r7       @ sum1 += coefficients[j] * data_out[i - j];
  smlabt r8, r10, r5, r8       @ sum1 += coefficients[j - 1] * data_out[i - j + 1];
  bgt LOOP2_A_LENGTH
  addlt r12, #2                @ counter < 0: step r12 back up to &coefficients[0]
  blt POST_LOOP2_A_LENGTH

@ Counter == 0: the coefficients[1] term is still outstanding.
EVEN_A_LENGTH:
  ldrsh r10, [r12, #2]         @ Filter coefficients coefficients[1]
  ldrsh r5, [r9]               @ data_out[i - 1]
  smlabb r7, r10, r5, r7       @ sum1 += coefficients[1] * data_out[i - 1];

POST_LOOP2_A_LENGTH:
  ldrsh r10, [r12]             @ Filter coefficients coefficients[0]
  ldrsh r9, [r0]               @ data_in[i]
  smulbb r6, r10, r9           @ output1 = coefficients[0] * data_in[i];
  sub r6, r7                   @ output1 -= first partial of sum1;
  sub r6, r8                   @ output1 -= second partial of sum1;
  @ Round and saturate to int16, as in the main loop.
  sbfx r8, r6, #12, #16
  ssat r7, #16, r6, asr #12
  cmp r7, r8
  addeq r6, r6, #2048
  ssat r6, #16, r6, asr #12
  strh r6, [r1]                @ Store the data_out[i]

END:
  pop {r4-r11}
  bx  lr

.fnend


@Reference C code:
@
@void WebRtcSpl_FilterARFastQ12(int16_t* data_in,
@                               int16_t* data_out,
@                               int16_t* __restrict coefficients,
@                               int coefficients_length,
@                               int data_length) {
@  int i = 0;
@  int j = 0;
@
@  for (i = 0; i < data_length - 1; i += 2) {
@    int32_t output1 = 0;
@    int32_t sum1 = 0;
@    int32_t output2 = 0;
@    int32_t sum2 = 0;
@
@    for (j = coefficients_length - 1; j > 2; j -= 2) {
@      sum1 += coefficients[j]      * data_out[i - j];
@      sum1 += coefficients[j - 1]  * data_out[i - j + 1];
@      sum2 += coefficients[j]     * data_out[i - j + 1];
@      sum2 += coefficients[j - 1] * data_out[i - j + 2];
@    }
@
@    if (j == 2) {
@      sum1 += coefficients[2] * data_out[i - 2];
@      sum2 += coefficients[2] * data_out[i - 1];
@    }
@
@    sum1 += coefficients[1] * data_out[i - 1];
@    output1 = coefficients[0] * data_in[i];
@    output1 -= sum1;
@    // Saturate and store the output.
@    output1 = WEBRTC_SPL_SAT(134215679, output1, -134217728);
@    data_out[i] = (int16_t)((output1 + 2048) >> 12);
@
@    sum2 += coefficients[1] * data_out[i];
@    output2 = coefficients[0] * data_in[i + 1];
@    output2 -= sum2;
@    // Saturate and store the output.
@    output2 = WEBRTC_SPL_SAT(134215679, output2, -134217728);
@    data_out[i + 1] = (int16_t)((output2 + 2048) >> 12);
@  }
@
@  if (i == data_length - 1) {
@    int32_t output1 = 0;
@    int32_t sum1 = 0;
@
@    for (j = coefficients_length - 1; j > 1; j -= 2) {
@      sum1 += coefficients[j]      * data_out[i - j];
@      sum1 += coefficients[j - 1]  * data_out[i - j + 1];
@    }
@
@    if (j == 1) {
@      sum1 += coefficients[1] * data_out[i - 1];
@    }
@
@    output1 = coefficients[0] * data_in[i];
@    output1 -= sum1;
@    // Saturate and store the output.
@    output1 = WEBRTC_SPL_SAT(134215679, output1, -134217728);
@    data_out[i] = (int16_t)((output1 + 2048) >> 12);
@  }
@}
