1 files changed, 551 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libopus/celt/arm/celt_pitch_xcorr_arm.s b/lib/rbcodec/codecs/libopus/celt/arm/celt_pitch_xcorr_arm.s
new file mode 100644
index 0000000000..6e873afc37
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/arm/celt_pitch_xcorr_arm.s
@@ -0,0 +1,551 @@
+; Copyright (c) 2007-2008 CSIRO
+; Copyright (c) 2007-2009 Xiph.Org Foundation
+; Copyright (c) 2013      Parrot
+; Written by Aurélien Zanelli
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+;
+; - Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+;
+; - Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in the
+; documentation and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+  AREA  |.text|, CODE, READONLY
+  GET    celt/arm/armopts.s
+IF OPUS_ARM_MAY_HAVE_EDSP
+  EXPORT celt_pitch_xcorr_edsp
+ENDIF
+IF OPUS_ARM_MAY_HAVE_NEON
+  EXPORT celt_pitch_xcorr_neon
+ENDIF
+IF OPUS_ARM_MAY_HAVE_NEON
+; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3 (added on top of the incoming q0)
+xcorr_kernel_neon PROC
+xcorr_kernel_neon_start
+  ; input:
+  ;   r3     = int         len
+  ;   r4     = opus_val16 *x
+  ;   r5     = opus_val16 *y
+  ;   q0     = opus_val32  sum[4]
+  ; output:
+  ;   q0     = opus_val32  sum[4]
+  ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
+  ; internal usage:
+  ;   r12 = int j (samples left, kept biased by the step size being tested)
+  ;   d3  = y_3|y_2|y_1|y_0
+  ;   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
+  ;   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
+  ;   q8  = scratch (d16, d17 hold the VEXT-shifted y windows)
+  ;
+  ; Load y[0...3]
+  ; This requires len>0 to always be valid (which we assert in the C code).
+  VLD1.16      {d5}, [r5]!
+  SUBS         r12, r3, #8         ; j = len-8
+  BLE xcorr_kernel_neon_process4   ; len <= 8: skip the 8-sample loop
+; Process 8 samples at a time.
+; This loop loads one y value more than we actually need. Therefore we have to
+; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
+; reading past the end of the array.
+xcorr_kernel_neon_process8
+  ; This loop has 19 total instructions (10 cycles to issue, minimum), with
+  ; - 2 cycles of ARM instructions,
+  ; - 10 cycles of load/store/byte permute instructions, and
+  ; - 9 cycles of data processing instructions.
+  ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
+  ; latter two categories, meaning the whole loop should run in 10 cycles per
+  ; iteration, barring cache misses.
+  ;
+  ; Load x[0...7]
+  VLD1.16      {d6, d7}, [r4]!
+  ; Unlike VMOV, VAND is a data processing instruction (and doesn't get
+  ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
+  VAND         d3, d5, d5          ; d3 = y[0...3] carried over in d5
+  SUBS         r12, r12, #8        ; j -= 8; flags are consumed by the BGT below
+  ; Load y[4...11]
+  VLD1.16      {d4, d5}, [r5]!
+  VMLAL.S16    q0, d3, d6[0]       ; sum[k] += x_0*y_(k)
+  VEXT.16      d16, d3, d4, #1     ; d16 = y[1...4]
+  VMLAL.S16    q0, d4, d7[0]       ; sum[k] += x_4*y_(k+4)
+  VEXT.16      d17, d4, d5, #1     ; d17 = y[5...8]
+  VMLAL.S16    q0, d16, d6[1]      ; sum[k] += x_1*y_(k+1)
+  VEXT.16      d16, d3, d4, #2     ; d16 = y[2...5]
+  VMLAL.S16    q0, d17, d7[1]      ; sum[k] += x_5*y_(k+5)
+  VEXT.16      d17, d4, d5, #2     ; d17 = y[6...9]
+  VMLAL.S16    q0, d16, d6[2]      ; sum[k] += x_2*y_(k+2)
+  VEXT.16      d16, d3, d4, #3     ; d16 = y[3...6]
+  VMLAL.S16    q0, d17, d7[2]      ; sum[k] += x_6*y_(k+6)
+  VEXT.16      d17, d4, d5, #3     ; d17 = y[7...10]
+  VMLAL.S16    q0, d16, d6[3]      ; sum[k] += x_3*y_(k+3)
+  VMLAL.S16    q0, d17, d7[3]      ; sum[k] += x_7*y_(k+7)
+  BGT xcorr_kernel_neon_process8   ; loop while more than 8 samples remain
+; Process 4 samples here if we have > 4 left (still reading one extra y value).
+xcorr_kernel_neon_process4
+  ADDS         r12, r12, #4        ; j > 0 iff more than 4 samples remain
+  BLE xcorr_kernel_neon_process2
+  ; Load x[0...3]
+  VLD1.16      d6, [r4]!
+  ; Use VAND since it's a data processing instruction again.
+  VAND         d4, d5, d5          ; d4 = y[0...3] carried over in d5
+  SUB          r12, r12, #4        ; account for the 4 samples consumed here
+  ; Load y[4...7]
+  VLD1.16      d5, [r5]!
+  VMLAL.S16    q0, d4, d6[0]       ; sum[k] += x_0*y_(k)
+  VEXT.16      d16, d4, d5, #1     ; d16 = y[1...4]
+  VMLAL.S16    q0, d16, d6[1]      ; sum[k] += x_1*y_(k+1)
+  VEXT.16      d16, d4, d5, #2     ; d16 = y[2...5]
+  VMLAL.S16    q0, d16, d6[2]      ; sum[k] += x_2*y_(k+2)
+  VEXT.16      d16, d4, d5, #3     ; d16 = y[3...6]
+  VMLAL.S16    q0, d16, d6[3]      ; sum[k] += x_3*y_(k+3)
+; Process 2 samples here if we have > 2 left (still reading one extra y value).
+xcorr_kernel_neon_process2
+  ADDS         r12, r12, #2        ; j > 0 iff more than 2 samples remain
+  BLE xcorr_kernel_neon_process1
+  ; Load x[0...1] (d6 = x_0 in every lane, d7 = x_1 in every lane)
+  VLD2.16      {d6[],d7[]}, [r4]!
+  ; Use VAND since it's a data processing instruction again.
+  VAND         d4, d5, d5          ; d4 = y[0...3] carried over in d5
+  SUB          r12, r12, #2        ; account for the 2 samples consumed here
+  ; Load y[4...5] (the 32-bit pair is replicated into both halves of d5)
+  VLD1.32      {d5[]}, [r5]!
+  VMLAL.S16    q0, d4, d6          ; sum[k] += x_0*y_(k)
+  VEXT.16      d16, d4, d5, #1     ; d16 = y[1...4]
+  ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
+  ; instead of VEXT, since it's a data-processing instruction.
+  VSRI.64      d5, d4, #32         ; d5 = y5|y4|y3|y2, the window for the next sample
+  VMLAL.S16    q0, d16, d7         ; sum[k] += x_1*y_(k+1)
+; Process 1 sample using the extra y value we loaded above.
+xcorr_kernel_neon_process1
+  ; Load next *x
+  VLD1.16      {d6[]}, [r4]!
+  ADDS         r12, r12, #1        ; j > 0 iff one more sample remains after this one
+  ; y[0...3] are left in d5 from prior iteration(s) (if any)
+  VMLAL.S16    q0, d5, d6
+  MOVLE        pc, lr              ; return if that was the last sample
+; Now process 1 last sample, not reading ahead.
+  ; Load last *y
+  VLD1.16      {d4[]}, [r5]!
+  VSRI.64      d4, d5, #16         ; d4 = new y on top, lanes 1...3 of d5 shifted in below it
+  ; Load last *x
+  VLD1.16      {d6[]}, [r4]!
+  VMLAL.S16    q0, d4, d6
+  MOV          pc, lr              ; return
+  ENDP
+; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
+;  opus_val32 *xcorr, int len, int max_pitch, int arch)
+celt_pitch_xcorr_neon PROC
+  ; Stores max_pitch correlation sums to xcorr[] and returns max(1, largest sum).
+  ; input:
+  ;   r0  = opus_val16 *_x
+  ;   r1  = opus_val16 *_y
+  ;   r2  = opus_val32 *xcorr
+  ;   r3  = int         len
+  ; output:
+  ;   r0  = int         maxcorr
+  ; internal usage:
+  ;   r4  = opus_val16 *x (for xcorr_kernel_neon())
+  ;   r5  = opus_val16 *y (for xcorr_kernel_neon())
+  ;   r6  = int         max_pitch
+  ;   r12 = int         j
+  ;   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
+  ; ignored:
+  ;         int         arch
+  STMFD        sp!, {r4-r6, lr}
+  LDR          r6, [sp, #16]       ; max_pitch: first stack argument, above the 16 bytes just pushed
+  VMOV.S32     q15, #1             ; maxcorr[0...3] = 1
+  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
+  SUBS         r6, r6, #4
+  BLT celt_pitch_xcorr_neon_process4_done
+celt_pitch_xcorr_neon_process4
+  ; xcorr_kernel_neon parameters:
+  ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
+  MOV          r4, r0
+  MOV          r5, r1
+  VEOR         q0, q0, q0
+  ; xcorr_kernel_neon only modifies r4, r5, r12, q0, d3, q2, q3, and q8.
+  ; So we don't save/restore any other registers.
+  BL xcorr_kernel_neon_start
+  SUBS         r6, r6, #4          ; max_pitch -= 4; flags are consumed by the BGE below
+  VST1.32      {q0}, [r2]!         ; store sum[0...3] to xcorr, xcorr += 4
+  ; _y += 4
+  ADD          r1, r1, #8
+  VMAX.S32     q15, q15, q0        ; maxcorr[k] = max(maxcorr[k], sum[k])
+  ; if (at least 4 sums remain) goto celt_pitch_xcorr_neon_process4
+  BGE celt_pitch_xcorr_neon_process4
+; We have less than 4 sums left to compute.
+celt_pitch_xcorr_neon_process4_done
+  ADDS         r6, r6, #4          ; undo the -4 bias: r6 = sums still to compute
+  ; Reduce maxcorr to a single value
+  VMAX.S32     d30, d30, d31
+  VPMAX.S32    d30, d30, d30       ; d30[0] = max of all four lanes
+  ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
+  BLE celt_pitch_xcorr_neon_done
+; Now compute each remaining sum one at a time.
+celt_pitch_xcorr_neon_process_remaining
+  MOV          r4, r0
+  MOV          r5, r1
+  VMOV.I32     q0, #0
+  SUBS         r12, r3, #8         ; j = len-8
+  BLT celt_pitch_xcorr_neon_process_remaining4
+; Sum terms 8 at a time.
+celt_pitch_xcorr_neon_process_remaining_loop8
+  ; Load x[0...7]
+  VLD1.16      {q1}, [r4]!
+  ; Load y[0...7]
+  VLD1.16      {q2}, [r5]!
+  SUBS         r12, r12, #8        ; j -= 8
+  VMLAL.S16    q0, d4, d2          ; four partial sums, one per lane
+  VMLAL.S16    q0, d5, d3
+  BGE celt_pitch_xcorr_neon_process_remaining_loop8
+; Sum terms 4 at a time.
+celt_pitch_xcorr_neon_process_remaining4
+  ADDS         r12, r12, #4        ; j >= 0 iff at least 4 terms remain
+  BLT celt_pitch_xcorr_neon_process_remaining4_done
+  ; Load x[0...3]
+  VLD1.16      {d2}, [r4]!
+  ; Load y[0...3]
+  VLD1.16      {d3}, [r5]!
+  SUB          r12, r12, #4        ; account for the 4 terms consumed here
+  VMLAL.S16    q0, d3, d2
+celt_pitch_xcorr_neon_process_remaining4_done
+  ; Reduce the sum to a single value.
+  VADD.S32     d0, d0, d1
+  VPADDL.S32   d0, d0              ; the low word d0[0] now holds the total
+  ADDS         r12, r12, #4        ; undo the bias: r12 = terms left (0...3)
+  BLE celt_pitch_xcorr_neon_process_remaining_loop_done
+; Sum terms 1 at a time.
+celt_pitch_xcorr_neon_process_remaining_loop1
+  VLD1.16      {d2[]}, [r4]!
+  VLD1.16      {d3[]}, [r5]!
+  SUBS         r12, r12, #1
+  VMLAL.S16    q0, d2, d3          ; every lane += x*y; only d0[0] is stored below
+  BGT celt_pitch_xcorr_neon_process_remaining_loop1
+celt_pitch_xcorr_neon_process_remaining_loop_done
+  VST1.32      {d0[0]}, [r2]!      ; *xcorr++ = sum
+  VMAX.S32     d30, d30, d0        ; maxcorr = max(maxcorr, sum) in lane 0 (only d30[0] is read later)
+  SUBS         r6, r6, #1
+  ; _y++
+  ADD          r1, r1, #2
+  ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
+  BGT celt_pitch_xcorr_neon_process_remaining
+celt_pitch_xcorr_neon_done
+  VMOV.32      r0, d30[0]          ; return maxcorr
+  LDMFD        sp!, {r4-r6, pc}    ; restore r4-r6 and return
+  ENDP
+ENDIF
+IF OPUS_ARM_MAY_HAVE_EDSP
+; This will get used on ARMv7 devices without NEON, so it has been optimized
+; to take advantage of dual-issuing where possible.
+xcorr_kernel_edsp PROC
+xcorr_kernel_edsp_start
+  ; Accumulates x[j]*y[j+k] over j into sum[k] (r6...r9), for k=0...3.
+  ; input:
+  ;   r3      = int         len
+  ;   r4      = opus_val16 *_x (must be 32-bit aligned)
+  ;   r5      = opus_val16 *_y (must be 32-bit aligned)
+  ;   r6...r9 = opus_val32  sum[4]
+  ; output:
+  ;   r6...r9 = opus_val32  sum[4]
+  ; preserved: r0-r5 (r2, r4 and r5 are modified here but saved/restored on the stack)
+  ; internal usage
+  ;   r2      = int         j
+  ;   r12,r14 = opus_val16  x[4]
+  ;   r10,r11 = opus_val16  y[4]
+  STMFD        sp!, {r2,r4,r5,lr}
+  LDR          r10, [r5], #4      ; Load y[0...1]
+  SUBS         r2, r3, #4         ; j = len-4
+  LDR          r11, [r5], #4      ; Load y[2...3]
+  BLE xcorr_kernel_edsp_process4_done ; len <= 4: go straight to the one-at-a-time tail
+  LDR          r12, [r4], #4      ; Load x[0...1]
+  ; Stall
+xcorr_kernel_edsp_process4
+  ; The multiplies must issue from pipeline 0, and can't dual-issue with each
+  ; other. Every other instruction here dual-issues with a multiply, and is
+  ; thus "free". There should be no stalls in the body of the loop.
+  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_0,y_0)
+  LDR          r14, [r4], #4      ; Load x[2...3]
+  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x_0,y_1)
+  SUBS         r2, r2, #4         ; j-=4 (flags feed the LDRGT and BGT below)
+  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_0,y_2)
+  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x_0,y_3)
+  SMLATT       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_1,y_1)
+  LDR          r10, [r5], #4      ; Load y[4...5]
+  SMLATB       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],x_1,y_2)
+  SMLATT       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_1,y_3)
+  SMLATB       r9, r12, r10, r9   ; sum[3] = MAC16_16(sum[3],x_1,y_4)
+  LDRGT        r12, [r4], #4      ; Load x[0...1] for the next iteration, only if there is one
+  SMLABB       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_2,y_2)
+  SMLABT       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x_2,y_3)
+  SMLABB       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_2,y_4)
+  SMLABT       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x_2,y_5)
+  SMLATT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_3,y_3)
+  LDR          r11, [r5], #4      ; Load y[6...7]
+  SMLATB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],x_3,y_4)
+  SMLATT       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_3,y_5)
+  SMLATB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],x_3,y_6)
+  BGT xcorr_kernel_edsp_process4
+xcorr_kernel_edsp_process4_done
+  ADDS         r2, r2, #4         ; undo the -4 bias: j = samples left, handled one at a time below
+  BLE xcorr_kernel_edsp_done
+  LDRH         r12, [r4], #2      ; r12 = *x++
+  SUBS         r2, r2, #1         ; j--
+  ; Stall
+  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_0)
+  LDRHGT       r14, [r4], #2      ; r14 = *x++
+  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x,y_1)
+  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_2)
+  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x,y_3)
+  BLE xcorr_kernel_edsp_done
+  SMLABT       r6, r14, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_1)
+  SUBS         r2, r2, #1         ; j--
+  SMLABB       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x,y_2)
+  LDRH         r10, [r5], #2      ; r10 = y_4 = *y++
+  SMLABT       r8, r14, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_3)
+  LDRHGT       r12, [r4], #2      ; r12 = *x++
+  SMLABB       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x,y_4)
+  BLE xcorr_kernel_edsp_done
+  SMLABB       r6, r12, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_2)
+  CMP          r2, #1             ; flags as for j--, without writing r2 (reused for y_5 below)
+  SMLABT       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_3)
+  LDRH         r2, [r5], #2       ; r2 = y_5 = *y++
+  SMLABB       r8, r12, r10, r8   ; sum[2] = MAC16_16(sum[2],tmp,y_4)
+  LDRHGT       r14, [r4]          ; r14 = *x
+  SMLABB       r9, r12, r2, r9    ; sum[3] = MAC16_16(sum[3],tmp,y_5)
+  BLE xcorr_kernel_edsp_done
+  SMLABT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_3)
+  LDRH         r11, [r5]          ; r11 = y_6 = *y
+  SMLABB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_4)
+  SMLABB       r8, r14, r2, r8    ; sum[2] = MAC16_16(sum[2],tmp,y_5)
+  SMLABB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],tmp,y_6)
+xcorr_kernel_edsp_done
+  LDMFD        sp!, {r2,r4,r5,pc} ; restore r2, r4, r5 and return
+  ENDP
+celt_pitch_xcorr_edsp PROC ; stores max_pitch correlation sums to xcorr[], returns max(1, largest sum)
+  ; input:
+  ;   r0  = opus_val16 *_x (must be 32-bit aligned)
+  ;   r1  = opus_val16 *_y (only needs to be 16-bit aligned)
+  ;   r2  = opus_val32 *xcorr
+  ;   r3  = int         len
+  ; output:
+  ;   r0  = maxcorr
+  ; internal usage
+  ;   r4  = opus_val16 *x
+  ;   r5  = opus_val16 *y
+  ;   r6  = opus_val32  sum0
+  ;   r7  = opus_val32  sum1
+  ;   r8  = opus_val32  sum2
+  ;   r9  = opus_val32  sum3
+  ;   r1  = int         max_pitch
+  ;   r12 = int         j
+  ; ignored:
+  ;         int         arch
+  STMFD        sp!, {r4-r11, lr}
+  MOV          r5, r1
+  LDR          r1, [sp, #36]        ; max_pitch: first stack argument, above the 36 bytes just pushed
+  MOV          r4, r0
+  TST          r5, #3               ; is _y 32-bit aligned?
+  ; maxcorr = 1
+  MOV          r0, #1
+  BEQ          celt_pitch_xcorr_edsp_process1u_done
+; Compute one sum at the start to make y 32-bit aligned.
+  SUBS         r12, r3, #4          ; j = len-4
+  ; r14 = sum = 0
+  MOV          r14, #0
+  LDRH         r8, [r5], #2         ; r8 = y_0; y is 32-bit aligned after this halfword
+  BLE celt_pitch_xcorr_edsp_process1u_loop4_done
+  LDR          r6, [r4], #4
+  MOV          r8, r8, LSL #16      ; put y_0 in the top half, where the loop expects the pending y
+celt_pitch_xcorr_edsp_process1u_loop4
+  LDR          r9, [r5], #4
+  SMLABT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
+  LDR          r7, [r4], #4
+  SMLATB       r14, r6, r9, r14     ; sum = MAC16_16(sum, x_1, y_1)
+  LDR          r8, [r5], #4
+  SMLABT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
+  SUBS         r12, r12, #4         ; j-=4
+  SMLATB       r14, r7, r8, r14     ; sum = MAC16_16(sum, x_3, y_3)
+  LDRGT        r6, [r4], #4
+  BGT celt_pitch_xcorr_edsp_process1u_loop4
+  MOV          r8, r8, LSR #16      ; bring the pending y down to the bottom half for the tail
+celt_pitch_xcorr_edsp_process1u_loop4_done
+  ADDS         r12, r12, #4         ; undo the -4 bias: j = terms left
+celt_pitch_xcorr_edsp_process1u_loop1
+  LDRHGE       r6, [r4], #2
+  ; Stall
+  SMLABBGE     r14, r6, r8, r14    ; sum = MAC16_16(sum, *x, *y)
+  SUBSGE       r12, r12, #1
+  LDRHGT       r8, [r5], #2        ; fetch the next y only if another term follows
+  BGT celt_pitch_xcorr_edsp_process1u_loop1
+  ; Restore _x
+  SUB          r4, r4, r3, LSL #1
+  ; Restore and advance _y
+  SUB          r5, r5, r3, LSL #1
+  ; maxcorr = max(maxcorr, sum)
+  CMP          r0, r14
+  ADD          r5, r5, #2
+  MOVLT        r0, r14
+  SUBS         r1, r1, #1           ; max_pitch--; flags are consumed by the BLE below
+  ; xcorr[i] = sum
+  STR          r14, [r2], #4
+  BLE celt_pitch_xcorr_edsp_done
+celt_pitch_xcorr_edsp_process1u_done
+  ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
+  SUBS         r1, r1, #4
+  BLT celt_pitch_xcorr_edsp_process2
+celt_pitch_xcorr_edsp_process4
+  ; xcorr_kernel_edsp parameters:
+  ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
+  MOV          r6, #0
+  MOV          r7, #0
+  MOV          r8, #0
+  MOV          r9, #0
+  BL xcorr_kernel_edsp_start  ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
+  ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
+  CMP          r0, r6
+  ; _y+=4
+  ADD          r5, r5, #8
+  MOVLT        r0, r6
+  CMP          r0, r7
+  MOVLT        r0, r7
+  CMP          r0, r8
+  MOVLT        r0, r8
+  CMP          r0, r9
+  MOVLT        r0, r9
+  STMIA        r2!, {r6-r9}         ; xcorr[i...i+3] = sum0...sum3, xcorr += 4
+  SUBS         r1, r1, #4           ; max_pitch -= 4
+  BGE celt_pitch_xcorr_edsp_process4
+celt_pitch_xcorr_edsp_process2
+  ADDS         r1, r1, #2           ; r1 >= 0 iff at least 2 sums remain
+  BLT celt_pitch_xcorr_edsp_process1a
+  SUBS         r12, r3, #4          ; j = len-4
+  ; {r10, r11} = {sum0, sum1} = {0, 0}
+  MOV          r10, #0
+  MOV          r11, #0
+  LDR          r8, [r5], #4
+  BLE celt_pitch_xcorr_edsp_process2_loop_done
+  LDR          r6, [r4], #4
+  LDR          r9, [r5], #4
+celt_pitch_xcorr_edsp_process2_loop4
+  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
+  LDR          r7, [r4], #4
+  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
+  SUBS         r12, r12, #4         ; j-=4
+  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
+  LDR          r8, [r5], #4
+  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
+  LDRGT        r6, [r4], #4
+  SMLABB       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_2, y_2)
+  SMLABT       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_2, y_3)
+  SMLATT       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_3, y_3)
+  LDRGT        r9, [r5], #4
+  SMLATB       r11, r7, r8, r11     ; sum1 = MAC16_16(sum1, x_3, y_4)
+  BGT celt_pitch_xcorr_edsp_process2_loop4
+celt_pitch_xcorr_edsp_process2_loop_done
+  ADDS         r12, r12, #2         ; j > 0 iff more than 2 terms remain
+  BLE  celt_pitch_xcorr_edsp_process2_1
+  LDR          r6, [r4], #4
+  ; Stall
+  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
+  LDR          r9, [r5], #4
+  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
+  SUB          r12, r12, #2         ; account for the 2 terms consumed here
+  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
+  MOV          r8, r9               ; the newest y pair becomes the current one for the tail
+  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
+celt_pitch_xcorr_edsp_process2_1
+  LDRH         r6, [r4], #2
+  ADDS         r12, r12, #1         ; j > 0 iff one more term follows this one
+  ; Stall
+  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
+  LDRHGT       r7, [r4], #2
+  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
+  BLE celt_pitch_xcorr_edsp_process2_done
+  LDRH         r9, [r5], #2
+  SMLABT       r10, r7, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
+  SMLABB       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
+celt_pitch_xcorr_edsp_process2_done
+  ; Restore _x
+  SUB          r4, r4, r3, LSL #1
+  ; Restore and advance _y
+  SUB          r5, r5, r3, LSL #1
+  ; maxcorr = max(maxcorr, sum0)
+  CMP          r0, r10
+  ADD          r5, r5, #2
+  MOVLT        r0, r10
+  SUB          r1, r1, #2           ; max_pitch -= 2
+  ; maxcorr = max(maxcorr, sum1)
+  CMP          r0, r11
+  ; xcorr[i] = sum
+  STR          r10, [r2], #4
+  MOVLT        r0, r11
+  STR          r11, [r2], #4
+celt_pitch_xcorr_edsp_process1a
+  ADDS         r1, r1, #1           ; r1 >= 0 iff one last sum remains
+  BLT celt_pitch_xcorr_edsp_done
+  SUBS         r12, r3, #4          ; j = len-4
+  ; r14 = sum = 0
+  MOV          r14, #0
+  BLT celt_pitch_xcorr_edsp_process1a_loop_done
+  LDR          r6, [r4], #4
+  LDR          r8, [r5], #4
+  LDR          r7, [r4], #4
+  LDR          r9, [r5], #4
+celt_pitch_xcorr_edsp_process1a_loop4
+  SMLABB       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
+  SUBS         r12, r12, #4         ; j-=4
+  SMLATT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
+  LDRGE        r6, [r4], #4
+  SMLABB       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
+  LDRGE        r8, [r5], #4
+  SMLATT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_3, y_3)
+  LDRGE        r7, [r4], #4
+  LDRGE        r9, [r5], #4
+  BGE celt_pitch_xcorr_edsp_process1a_loop4
+celt_pitch_xcorr_edsp_process1a_loop_done
+  ADDS         r12, r12, #2         ; GE iff at least 2 terms remain
+  LDRGE        r6, [r4], #4
+  LDRGE        r8, [r5], #4
+  ; Stall
+  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
+  SUBGE        r12, r12, #2         ; account for the 2 terms consumed here
+  SMLATTGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
+  ADDS         r12, r12, #1         ; GE iff one final term remains
+  LDRHGE       r6, [r4], #2
+  LDRHGE       r8, [r5], #2
+  ; Stall
+  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, *x, *y)
+  ; maxcorr = max(maxcorr, sum)
+  CMP          r0, r14
+  ; xcorr[i] = sum
+  STR          r14, [r2], #4
+  MOVLT        r0, r14
+celt_pitch_xcorr_edsp_done
+  LDMFD        sp!, {r4-r11, pc}    ; restore r4-r11 and return maxcorr in r0
+  ENDP
+ENDIF
+END

diff --git a/lib/rbcodec/codecs/libopus/celt/arm/celt_pitch_xcorr_arm.s b/lib/rbcodec/codecs/libopus/celt/arm/celt_pitch_xcorr_arm.s new file mode 100644 index 0000000000..6e873afc37 --- /dev/null +++ b/lib/rbcodec/codecs/libopus/celt/arm/celt_pitch_xcorr_arm.s
@@ -0,0 +1,551 @@
	1	; Copyright (c) 2007-2008 CSIRO
	2	; Copyright (c) 2007-2009 Xiph.Org Foundation
	3	; Copyright (c) 2013 Parrot
	4	; Written by Aurélien Zanelli
	5	;
	6	; Redistribution and use in source and binary forms, with or without
	7	; modification, are permitted provided that the following conditions
	8	; are met:
	9	;
	10	; - Redistributions of source code must retain the above copyright
	11	; notice, this list of conditions and the following disclaimer.
	12	;
	13	; - Redistributions in binary form must reproduce the above copyright
	14	; notice, this list of conditions and the following disclaimer in the
	15	; documentation and/or other materials provided with the distribution.
	16	;
	17	; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	18	; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	19	; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	20	; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
	21	; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
	22	; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
	23	; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	24	; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
	25	; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
	26	; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	27	; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	28
	29	AREA \|.text\|, CODE, READONLY
	30
	31	GET celt/arm/armopts.s
	32
	33	IF OPUS_ARM_MAY_HAVE_EDSP
	34	EXPORT celt_pitch_xcorr_edsp
	35	ENDIF
	36
	37	IF OPUS_ARM_MAY_HAVE_NEON
	38	EXPORT celt_pitch_xcorr_neon
	39	ENDIF
	40
	41	IF OPUS_ARM_MAY_HAVE_NEON
	42
	43	; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3 (added on top of the incoming q0)
	44	xcorr_kernel_neon PROC
	45	xcorr_kernel_neon_start
	46	; input:
	47	; r3 = int len
	48	; r4 = opus_val16 *x
	49	; r5 = opus_val16 *y
	50	; q0 = opus_val32 sum[4]
	51	; output:
	52	; q0 = opus_val32 sum[4]
	53	; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
	54	; internal usage:
	55	; r12 = int j (samples left, kept biased by the step size being tested)
	56	; d3 = y_3|y_2|y_1|y_0
	57	; q2 = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
	58	; q3 = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
	59	; q8 = scratch (d16, d17 hold the VEXT-shifted y windows)
	60	;
	61	; Load y[0...3]
	62	; This requires len>0 to always be valid (which we assert in the C code).
	63	VLD1.16 {d5}, [r5]!
	64	SUBS r12, r3, #8 ; j = len-8
	65	BLE xcorr_kernel_neon_process4 ; len <= 8: skip the 8-sample loop
	66	; Process 8 samples at a time.
	67	; This loop loads one y value more than we actually need. Therefore we have to
	68	; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
	69	; reading past the end of the array.
	70	xcorr_kernel_neon_process8
	71	; This loop has 19 total instructions (10 cycles to issue, minimum), with
	72	; - 2 cycles of ARM instructions,
	73	; - 10 cycles of load/store/byte permute instructions, and
	74	; - 9 cycles of data processing instructions.
	75	; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
	76	; latter two categories, meaning the whole loop should run in 10 cycles per
	77	; iteration, barring cache misses.
	78	;
	79	; Load x[0...7]
	80	VLD1.16 {d6, d7}, [r4]!
	81	; Unlike VMOV, VAND is a data processing instruction (and doesn't get
	82	; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
	83	VAND d3, d5, d5 ; d3 = y[0...3] carried over in d5
	84	SUBS r12, r12, #8 ; j -= 8; flags are consumed by the BGT below
	85	; Load y[4...11]
	86	VLD1.16 {d4, d5}, [r5]!
	87	VMLAL.S16 q0, d3, d6[0] ; sum[k] += x_0*y_(k)
	88	VEXT.16 d16, d3, d4, #1 ; d16 = y[1...4]
	89	VMLAL.S16 q0, d4, d7[0] ; sum[k] += x_4*y_(k+4)
	90	VEXT.16 d17, d4, d5, #1 ; d17 = y[5...8]
	91	VMLAL.S16 q0, d16, d6[1] ; sum[k] += x_1*y_(k+1)
	92	VEXT.16 d16, d3, d4, #2 ; d16 = y[2...5]
	93	VMLAL.S16 q0, d17, d7[1] ; sum[k] += x_5*y_(k+5)
	94	VEXT.16 d17, d4, d5, #2 ; d17 = y[6...9]
	95	VMLAL.S16 q0, d16, d6[2] ; sum[k] += x_2*y_(k+2)
	96	VEXT.16 d16, d3, d4, #3 ; d16 = y[3...6]
	97	VMLAL.S16 q0, d17, d7[2] ; sum[k] += x_6*y_(k+6)
	98	VEXT.16 d17, d4, d5, #3 ; d17 = y[7...10]
	99	VMLAL.S16 q0, d16, d6[3] ; sum[k] += x_3*y_(k+3)
	100	VMLAL.S16 q0, d17, d7[3] ; sum[k] += x_7*y_(k+7)
	101	BGT xcorr_kernel_neon_process8 ; loop while more than 8 samples remain
	102	; Process 4 samples here if we have > 4 left (still reading one extra y value).
	103	xcorr_kernel_neon_process4
	104	ADDS r12, r12, #4 ; j > 0 iff more than 4 samples remain
	105	BLE xcorr_kernel_neon_process2
	106	; Load x[0...3]
	107	VLD1.16 d6, [r4]!
	108	; Use VAND since it's a data processing instruction again.
	109	VAND d4, d5, d5 ; d4 = y[0...3] carried over in d5
	110	SUB r12, r12, #4 ; account for the 4 samples consumed here
	111	; Load y[4...7]
	112	VLD1.16 d5, [r5]!
	113	VMLAL.S16 q0, d4, d6[0] ; sum[k] += x_0*y_(k)
	114	VEXT.16 d16, d4, d5, #1 ; d16 = y[1...4]
	115	VMLAL.S16 q0, d16, d6[1] ; sum[k] += x_1*y_(k+1)
	116	VEXT.16 d16, d4, d5, #2 ; d16 = y[2...5]
	117	VMLAL.S16 q0, d16, d6[2] ; sum[k] += x_2*y_(k+2)
	118	VEXT.16 d16, d4, d5, #3 ; d16 = y[3...6]
	119	VMLAL.S16 q0, d16, d6[3] ; sum[k] += x_3*y_(k+3)
	120	; Process 2 samples here if we have > 2 left (still reading one extra y value).
	121	xcorr_kernel_neon_process2
	122	ADDS r12, r12, #2 ; j > 0 iff more than 2 samples remain
	123	BLE xcorr_kernel_neon_process1
	124	; Load x[0...1] (d6 = x_0 in every lane, d7 = x_1 in every lane)
	125	VLD2.16 {d6[],d7[]}, [r4]!
	126	; Use VAND since it's a data processing instruction again.
	127	VAND d4, d5, d5 ; d4 = y[0...3] carried over in d5
	128	SUB r12, r12, #2 ; account for the 2 samples consumed here
	129	; Load y[4...5] (the 32-bit pair is replicated into both halves of d5)
	130	VLD1.32 {d5[]}, [r5]!
	131	VMLAL.S16 q0, d4, d6 ; sum[k] += x_0*y_(k)
	132	VEXT.16 d16, d4, d5, #1 ; d16 = y[1...4]
	133	; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
	134	; instead of VEXT, since it's a data-processing instruction.
	135	VSRI.64 d5, d4, #32 ; d5 = y5|y4|y3|y2, the window for the next sample
	136	VMLAL.S16 q0, d16, d7 ; sum[k] += x_1*y_(k+1)
	137	; Process 1 sample using the extra y value we loaded above.
	138	xcorr_kernel_neon_process1
	139	; Load next *x
	140	VLD1.16 {d6[]}, [r4]!
	141	ADDS r12, r12, #1 ; j > 0 iff one more sample remains after this one
	142	; y[0...3] are left in d5 from prior iteration(s) (if any)
	143	VMLAL.S16 q0, d5, d6
	144	MOVLE pc, lr ; return if that was the last sample
	145	; Now process 1 last sample, not reading ahead.
	146	; Load last *y
	147	VLD1.16 {d4[]}, [r5]!
	148	VSRI.64 d4, d5, #16 ; d4 = new y on top, lanes 1...3 of d5 shifted in below it
	149	; Load last *x
	150	VLD1.16 {d6[]}, [r4]!
	151	VMLAL.S16 q0, d4, d6
	152	MOV pc, lr ; return
	153	ENDP
	154
	155	; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
	156	; opus_val32 *xcorr, int len, int max_pitch, int arch)
	157	celt_pitch_xcorr_neon PROC
	158	; Stores max_pitch correlation sums to xcorr[] and returns max(1, largest sum).
	159	; r0 = opus_val16 *_x
	160	; r1 = opus_val16 *_y
	161	; r2 = opus_val32 *xcorr
	162	; r3 = int len
	163	; output:
	164	; r0 = int maxcorr
	165	; internal usage:
	166	; r4 = opus_val16 *x (for xcorr_kernel_neon())
	167	; r5 = opus_val16 *y (for xcorr_kernel_neon())
	168	; r6 = int max_pitch
	169	; r12 = int j
	170	; q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon())
	171	; ignored:
	172	; int arch
	173	STMFD sp!, {r4-r6, lr}
	174	LDR r6, [sp, #16] ; max_pitch: first stack argument, above the 16 bytes just pushed
	175	VMOV.S32 q15, #1 ; maxcorr[0...3] = 1
	176	; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
	177	SUBS r6, r6, #4
	178	BLT celt_pitch_xcorr_neon_process4_done
	179	celt_pitch_xcorr_neon_process4
	180	; xcorr_kernel_neon parameters:
	181	; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
	182	MOV r4, r0
	183	MOV r5, r1
	184	VEOR q0, q0, q0
	185	; xcorr_kernel_neon only modifies r4, r5, r12, q0, d3, q2, q3, and q8.
	186	; So we don't save/restore any other registers.
	187	BL xcorr_kernel_neon_start
	188	SUBS r6, r6, #4 ; max_pitch -= 4; flags are consumed by the BGE below
	189	VST1.32 {q0}, [r2]! ; store sum[0...3] to xcorr, xcorr += 4
	190	; _y += 4
	191	ADD r1, r1, #8
	192	VMAX.S32 q15, q15, q0 ; maxcorr[k] = max(maxcorr[k], sum[k])
	193	; if (at least 4 sums remain) goto celt_pitch_xcorr_neon_process4
	194	BGE celt_pitch_xcorr_neon_process4
	195	; We have less than 4 sums left to compute.
	196	celt_pitch_xcorr_neon_process4_done
	197	ADDS r6, r6, #4 ; undo the -4 bias: r6 = sums still to compute
	198	; Reduce maxcorr to a single value
	199	VMAX.S32 d30, d30, d31
	200	VPMAX.S32 d30, d30, d30 ; d30[0] = max of all four lanes
	201	; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
	202	BLE celt_pitch_xcorr_neon_done
	203	; Now compute each remaining sum one at a time.
	204	celt_pitch_xcorr_neon_process_remaining
	205	MOV r4, r0
	206	MOV r5, r1
	207	VMOV.I32 q0, #0
	208	SUBS r12, r3, #8 ; j = len-8
	209	BLT celt_pitch_xcorr_neon_process_remaining4
	210	; Sum terms 8 at a time.
	211	celt_pitch_xcorr_neon_process_remaining_loop8
	212	; Load x[0...7]
	213	VLD1.16 {q1}, [r4]!
	214	; Load y[0...7]
	215	VLD1.16 {q2}, [r5]!
	216	SUBS r12, r12, #8 ; j -= 8
	217	VMLAL.S16 q0, d4, d2 ; four partial sums, one per lane
	218	VMLAL.S16 q0, d5, d3
	219	BGE celt_pitch_xcorr_neon_process_remaining_loop8
	220	; Sum terms 4 at a time.
	221	celt_pitch_xcorr_neon_process_remaining4
	222	ADDS r12, r12, #4 ; j >= 0 iff at least 4 terms remain
	223	BLT celt_pitch_xcorr_neon_process_remaining4_done
	224	; Load x[0...3]
	225	VLD1.16 {d2}, [r4]!
	226	; Load y[0...3]
	227	VLD1.16 {d3}, [r5]!
	228	SUB r12, r12, #4 ; account for the 4 terms consumed here
	229	VMLAL.S16 q0, d3, d2
	230	celt_pitch_xcorr_neon_process_remaining4_done
	231	; Reduce the sum to a single value.
	232	VADD.S32 d0, d0, d1
	233	VPADDL.S32 d0, d0 ; the low word d0[0] now holds the total
	234	ADDS r12, r12, #4 ; undo the bias: r12 = terms left (0...3)
	235	BLE celt_pitch_xcorr_neon_process_remaining_loop_done
	236	; Sum terms 1 at a time.
	237	celt_pitch_xcorr_neon_process_remaining_loop1
	238	VLD1.16 {d2[]}, [r4]!
	239	VLD1.16 {d3[]}, [r5]!
	240	SUBS r12, r12, #1
	241	VMLAL.S16 q0, d2, d3 ; every lane += x*y; only d0[0] is stored below
	242	BGT celt_pitch_xcorr_neon_process_remaining_loop1
	243	celt_pitch_xcorr_neon_process_remaining_loop_done
	244	VST1.32 {d0[0]}, [r2]! ; *xcorr++ = sum
	245	VMAX.S32 d30, d30, d0 ; maxcorr = max(maxcorr, sum) in lane 0 (only d30[0] is read later)
	246	SUBS r6, r6, #1
	247	; _y++
	248	ADD r1, r1, #2
	249	; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
	250	BGT celt_pitch_xcorr_neon_process_remaining
	251	celt_pitch_xcorr_neon_done
	252	VMOV.32 r0, d30[0] ; return maxcorr
	253	LDMFD sp!, {r4-r6, pc} ; restore r4-r6 and return
	254	ENDP
	255
	256	ENDIF
	257
	258	IF OPUS_ARM_MAY_HAVE_EDSP
	259
	260	; This will get used on ARMv7 devices without NEON, so it has been optimized
	261	; to take advantage of dual-issuing where possible.
	262	xcorr_kernel_edsp PROC
	263	xcorr_kernel_edsp_start
		; Accumulates four correlation sums at consecutive lags into r6...r9:
		;   sum[k] += x[j]*y[j+k], j=0...len-1, k=0...3
		; Two 16-bit samples are packed per 32-bit register and multiplied with
		; the SMLA<x><y> halfword multiply-accumulates (B = bottom, T = top).
		; Arguments are passed in the private register layout listed below (the
		; sums live in r6...r9 on entry and exit); callers BL to
		; xcorr_kernel_edsp_start.
	264	; input:
	265	; r3 = int len
	266	; r4 = opus_val16 *_x (must be 32-bit aligned)
	267	; r5 = opus_val16 *_y (must be 32-bit aligned)
	268	; r6...r9 = opus_val32 sum[4]
	269	; output:
	270	; r6...r9 = opus_val32 sum[4]
	271	; preserved: r0-r5 (r2, r4 and r5 are saved on the stack and restored)
		; clobbered: r10, r11, r12, r14 and the condition flags
	272	; internal usage
	273	; r2 = int j
	274	; r12,r14 = opus_val16 x[4]
	275	; r10,r11 = opus_val16 y[4]
	276	STMFD sp!, {r2,r4,r5,lr}
	277	LDR r10, [r5], #4 ; Load y[0...1]
	278	SUBS r2, r3, #4 ; j = len-4
	279	LDR r11, [r5], #4 ; Load y[2...3]
	280	BLE xcorr_kernel_edsp_process4_done ; len <= 4: only the tail code runs
	281	LDR r12, [r4], #4 ; Load x[0...1]
	282	; Stall
	283	xcorr_kernel_edsp_process4
	284	; The multiplies must issue from pipeline 0, and can't dual-issue with each
	285	; other. Every other instruction here dual-issues with a multiply, and is
	286	; thus "free". There should be no stalls in the body of the loop.
		; Each iteration consumes x[0...3] and slides the y window by four
		; samples: r10/r11 hold y[0...3] on entry and y[4...7] on exit.
		; The flags set by "SUBS r2, r2, #4" stay live until the BGT at the
		; bottom (none of the instructions in between update them).
	287	SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_0,y_0)
	288	LDR r14, [r4], #4 ; Load x[2...3]
	289	SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x_0,y_1)
	290	SUBS r2, r2, #4 ; j-=4
	291	SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_0,y_2)
	292	SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x_0,y_3)
	293	SMLATT r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_1,y_1)
	294	LDR r10, [r5], #4 ; Load y[4...5]
	295	SMLATB r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],x_1,y_2)
	296	SMLATT r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_1,y_3)
	297	SMLATB r9, r12, r10, r9 ; sum[3] = MAC16_16(sum[3],x_1,y_4)
	298	LDRGT r12, [r4], #4 ; Load x[0...1] of the next iteration (only if j > 0)
	299	SMLABB r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_2,y_2)
	300	SMLABT r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x_2,y_3)
	301	SMLABB r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_2,y_4)
	302	SMLABT r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x_2,y_5)
	303	SMLATT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_3,y_3)
	304	LDR r11, [r5], #4 ; Load y[6...7]
	305	SMLATB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],x_3,y_4)
	306	SMLATT r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_3,y_5)
	307	SMLATB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],x_3,y_6)
	308	BGT xcorr_kernel_edsp_process4
	309	xcorr_kernel_edsp_process4_done
		; Tail: r2+4 samples of x remain; the code below handles up to four of
		; them, one halfword at a time. r10/r11 hold the current y[0...3].
	310	ADDS r2, r2, #4 ; j = number of remaining samples
	311	BLE xcorr_kernel_edsp_done
	312	LDRH r12, [r4], #2 ; r12 = *x++
	313	SUBS r2, r2, #1 ; j--
	314	; Stall
	315	SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_0)
	316	LDRHGT r14, [r4], #2 ; r14 = *x++
	317	SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x,y_1)
	318	SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_2)
	319	SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x,y_3)
	320	BLE xcorr_kernel_edsp_done
	321	SMLABT r6, r14, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_1)
	322	SUBS r2, r2, #1 ; j--
	323	SMLABB r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x,y_2)
	324	LDRH r10, [r5], #2 ; r10 = y_4 = *y++
	325	SMLABT r8, r14, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_3)
	326	LDRHGT r12, [r4], #2 ; r12 = *x++
	327	SMLABB r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x,y_4)
	328	BLE xcorr_kernel_edsp_done
	329	SMLABB r6, r12, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_2)
	330	CMP r2, #1 ; test (j-1 > 0) without writing r2, which is reused for y_5 below
	331	SMLABT r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_3)
	332	LDRH r2, [r5], #2 ; r2 = y_5 = *y++
	333	SMLABB r8, r12, r10, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_4)
	334	LDRHGT r14, [r4] ; r14 = *x (no writeback; r4 is reloaded from the stack on exit)
	335	SMLABB r9, r12, r2, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_5)
	336	BLE xcorr_kernel_edsp_done
	337	SMLABT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_3)
	338	LDRH r11, [r5] ; r11 = y_6 = *y (no writeback; r5 is reloaded from the stack on exit)
	339	SMLABB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_4)
	340	SMLABB r8, r14, r2, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_5)
	341	SMLABB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_6)
	342	xcorr_kernel_edsp_done
	343	LDMFD sp!, {r2,r4,r5,pc} ; restore the caller's r2, _x and _y, and return
	344	ENDP
	345
	346	celt_pitch_xcorr_edsp PROC
		; Computes xcorr[i] = sum(x[j]*y[j+i], j=0...len-1) for i=0...max_pitch-1,
		; stores each sum to xcorr[i], and returns the largest sum seen, with a
		; floor of 1 (maxcorr starts at 1).
		; The lags are processed in up to four phases:
		;   process1u: one lag, only if _y is not 32-bit aligned on entry
		;   process4:  four lags per call to xcorr_kernel_edsp
		;   process2:  two lags, if at least two remain
		;   process1a: one final lag, if one remains
		; NOTE(review): the code appears to assume len >= 1 and max_pitch >= 1
		; (process1u stores xcorr[0] before max_pitch is tested) -- confirm
		; against the callers.
	347	; input:
	348	; r0 = opus_val16 *_x (must be 32-bit aligned)
	349	; r1 = opus_val16 *_y (only needs to be 16-bit aligned)
	350	; r2 = opus_val32 *xcorr
	351	; r3 = int len
		; [sp] = int max_pitch (first stack word on entry)
	352	; output:
	353	; r0 = maxcorr
	354	; internal usage
	355	; r4 = opus_val16 *x
	356	; r5 = opus_val16 *y
	357	; r6 = opus_val32 sum0
	358	; r7 = opus_val32 sum1
	359	; r8 = opus_val32 sum2
	360	; r9 = opus_val32 sum3
	361	; r1 = int max_pitch
	362	; r12 = int j
	363	; ignored:
	364	; int arch
	365	STMFD sp!, {r4-r11, lr} ; 9 words pushed, so the entry [sp] is now at [sp, #36]
	366	MOV r5, r1
	367	LDR r1, [sp, #36] ; r1 = max_pitch
	368	MOV r4, r0
	369	TST r5, #3 ; is _y 32-bit aligned?
	370	; maxcorr = 1
	371	MOV r0, #1
	372	BEQ celt_pitch_xcorr_edsp_process1u_done
	373	; Compute one sum at the start to make y 32-bit aligned.
		; After the first halfword load below, r5 is 32-bit aligned, so the
		; loop can use word loads; each y word then straddles two x samples
		; (current y in the top half of r8, next y in the bottom half of r9).
	374	SUBS r12, r3, #4 ; j = len-4
	375	; r14 = sum = 0
	376	MOV r14, #0
	377	LDRH r8, [r5], #2 ; r8 = y_0
	378	BLE celt_pitch_xcorr_edsp_process1u_loop4_done
	379	LDR r6, [r4], #4 ; Load x[0...1]
	380	MOV r8, r8, LSL #16 ; put y_0 in the top half, where the loop's SMLABT reads it
	381	celt_pitch_xcorr_edsp_process1u_loop4
	382	LDR r9, [r5], #4 ; Load y[1...2]
	383	SMLABT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0)
	384	LDR r7, [r4], #4 ; Load x[2...3]
	385	SMLATB r14, r6, r9, r14 ; sum = MAC16_16(sum, x_1, y_1)
	386	LDR r8, [r5], #4 ; Load y[3...4]
	387	SMLABT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2)
	388	SUBS r12, r12, #4 ; j-=4
	389	SMLATB r14, r7, r8, r14 ; sum = MAC16_16(sum, x_3, y_3)
	390	LDRGT r6, [r4], #4 ; Load x[0...1] of the next iteration (only if j > 0)
	391	BGT celt_pitch_xcorr_edsp_process1u_loop4
	392	MOV r8, r8, LSR #16 ; move the pending y (top half of r8) down for SMLABB
	393	celt_pitch_xcorr_edsp_process1u_loop4_done
	394	ADDS r12, r12, #4 ; j = number of remaining samples
	395	celt_pitch_xcorr_edsp_process1u_loop1
		; One sample per pass; r8 holds the y for the current x. The next y
		; is only loaded when another pass follows, so x and y have both
		; advanced by exactly len samples when the loop exits.
	396	LDRHGE r6, [r4], #2
	397	; Stall
	398	SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x, y)
	399	SUBSGE r12, r12, #1
	400	LDRHGT r8, [r5], #2
	401	BGT celt_pitch_xcorr_edsp_process1u_loop1
	402	; Restore _x
	403	SUB r4, r4, r3, LSL #1
	404	; Restore and advance _y
	405	SUB r5, r5, r3, LSL #1
	406	; maxcorr = max(maxcorr, sum)
	407	CMP r0, r14
	408	ADD r5, r5, #2 ; _y is now one sample past its entry value, i.e. 32-bit aligned
	409	MOVLT r0, r14
	410	SUBS r1, r1, #1 ; max_pitch--
	411	; xcorr[i] = sum
	412	STR r14, [r2], #4
	413	BLE celt_pitch_xcorr_edsp_done
	414	celt_pitch_xcorr_edsp_process1u_done
	415	; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
	416	SUBS r1, r1, #4 ; from here on r1 is biased: r1 = (lags remaining) - 4
	417	BLT celt_pitch_xcorr_edsp_process2
	418	celt_pitch_xcorr_edsp_process4
	419	; xcorr_kernel_edsp parameters:
	420	; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
	421	MOV r6, #0
	422	MOV r7, #0
	423	MOV r8, #0
	424	MOV r9, #0
	425	BL xcorr_kernel_edsp_start ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
		; The kernel returns with r4 (_x) and r5 (_y) unchanged.
	426	; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
	427	CMP r0, r6
	428	; _y+=4 (four 16-bit samples = 8 bytes)
	429	ADD r5, r5, #8
	430	MOVLT r0, r6
	431	CMP r0, r7
	432	MOVLT r0, r7
	433	CMP r0, r8
	434	MOVLT r0, r8
	435	CMP r0, r9
	436	MOVLT r0, r9
	437	STMIA r2!, {r6-r9} ; xcorr[i...i+3] = sum0...sum3
	438	SUBS r1, r1, #4 ; max_pitch-=4
	439	BGE celt_pitch_xcorr_edsp_process4
	440	celt_pitch_xcorr_edsp_process2
		; Two lags at once; skipped when fewer than two lags remain.
	441	ADDS r1, r1, #2 ; r1 = (lags remaining) - 2
	442	BLT celt_pitch_xcorr_edsp_process1a
	443	SUBS r12, r3, #4 ; j = len-4
	444	; {r10, r11} = {sum0, sum1} = {0, 0}
	445	MOV r10, #0
	446	MOV r11, #0
	447	LDR r8, [r5], #4 ; Load y[0...1]
	448	BLE celt_pitch_xcorr_edsp_process2_loop_done
	449	LDR r6, [r4], #4 ; Load x[0...1]
	450	LDR r9, [r5], #4 ; Load y[2...3]
	451	celt_pitch_xcorr_edsp_process2_loop4
	452	SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0)
	453	LDR r7, [r4], #4 ; Load x[2...3]
	454	SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1)
	455	SUBS r12, r12, #4 ; j-=4
	456	SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1)
	457	LDR r8, [r5], #4 ; Load y[4...5]
	458	SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2)
	459	LDRGT r6, [r4], #4 ; Load x[0...1] of the next iteration (only if j > 0)
	460	SMLABB r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_2, y_2)
	461	SMLABT r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_2, y_3)
	462	SMLATT r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_3, y_3)
	463	LDRGT r9, [r5], #4 ; Load y[2...3] of the next iteration (only if j > 0)
	464	SMLATB r11, r7, r8, r11 ; sum1 = MAC16_16(sum1, x_3, y_4)
	465	BGT celt_pitch_xcorr_edsp_process2_loop4
	466	celt_pitch_xcorr_edsp_process2_loop_done
		; 1...4 samples remain (for len >= 1); r8 holds the next two y values.
	467	ADDS r12, r12, #2 ; j = (samples remaining) - 2
	468	BLE celt_pitch_xcorr_edsp_process2_1
		; 3 or 4 samples remain: consume two of them here.
	469	LDR r6, [r4], #4
	470	; Stall
	471	SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0)
	472	LDR r9, [r5], #4
	473	SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1)
	474	SUB r12, r12, #2
	475	SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1)
	476	MOV r8, r9 ; the newly loaded pair becomes the current y pair
	477	SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2)
	478	celt_pitch_xcorr_edsp_process2_1
		; 1 or 2 samples remain.
	479	LDRH r6, [r4], #2
	480	ADDS r12, r12, #1 ; > 0 only if a second sample remains
	481	; Stall
	482	SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0)
	483	LDRHGT r7, [r4], #2
	484	SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1)
	485	BLE celt_pitch_xcorr_edsp_process2_done
	486	LDRH r9, [r5], #2
	487	SMLABT r10, r7, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_1)
	488	SMLABB r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_0, y_2)
	489	celt_pitch_xcorr_edsp_process2_done
		; On every path into this label x has advanced by len samples and y
		; by len+1 samples, so the fix-up below leaves _x at its start and _y
		; two samples further on (still 32-bit aligned).
	490	; Restore _x
	491	SUB r4, r4, r3, LSL #1
	492	; Restore and advance _y
	493	SUB r5, r5, r3, LSL #1
	494	; maxcorr = max(maxcorr, sum0)
	495	CMP r0, r10
	496	ADD r5, r5, #2
	497	MOVLT r0, r10
	498	SUB r1, r1, #2 ; max_pitch-=2
	499	; maxcorr = max(maxcorr, sum1)
	500	CMP r0, r11
	501	; xcorr[i] = sum
	502	STR r10, [r2], #4
	503	MOVLT r0, r11
	504	STR r11, [r2], #4
	505	celt_pitch_xcorr_edsp_process1a
		; One final lag with both _x and _y 32-bit aligned. The pointers are
		; not restored afterwards because nothing follows.
	506	ADDS r1, r1, #1 ; >= 0 only if one lag remains
	507	BLT celt_pitch_xcorr_edsp_done
	508	SUBS r12, r3, #4 ; j = len-4
	509	; r14 = sum = 0
	510	MOV r14, #0
	511	BLT celt_pitch_xcorr_edsp_process1a_loop_done
	512	LDR r6, [r4], #4
	513	LDR r8, [r5], #4
	514	LDR r7, [r4], #4
	515	LDR r9, [r5], #4
	516	celt_pitch_xcorr_edsp_process1a_loop4
	517	SMLABB r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0)
	518	SUBS r12, r12, #4 ; j-=4
	519	SMLATT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1)
	520	LDRGE r6, [r4], #4
	521	SMLABB r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2)
	522	LDRGE r8, [r5], #4
	523	SMLATT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_3, y_3)
	524	LDRGE r7, [r4], #4
	525	LDRGE r9, [r5], #4
	526	BGE celt_pitch_xcorr_edsp_process1a_loop4
	527	celt_pitch_xcorr_edsp_process1a_loop_done
		; 0...3 samples remain. The flags from each ADDS below stay valid
		; across the conditional instructions that follow it.
	528	ADDS r12, r12, #2 ; >= 0 if at least two samples remain
	529	LDRGE r6, [r4], #4
	530	LDRGE r8, [r5], #4
	531	; Stall
	532	SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0)
	533	SUBGE r12, r12, #2
	534	SMLATTGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1)
	535	ADDS r12, r12, #1 ; >= 0 if one last sample remains
	536	LDRHGE r6, [r4], #2
	537	LDRHGE r8, [r5], #2
	538	; Stall
	539	SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x, y)
	540	; maxcorr = max(maxcorr, sum)
	541	CMP r0, r14
	542	; xcorr[i] = sum
	543	STR r14, [r2], #4
	544	MOVLT r0, r14
	545	celt_pitch_xcorr_edsp_done
	546	LDMFD sp!, {r4-r11, pc} ; restore callee-saved registers and return maxcorr in r0
	547	ENDP
	548
	549	ENDIF
	550
	551	END