summary refs log tree commit diff
path: root/lib/rbcodec/codecs/libopus/celt/arm/celt_pitch_xcorr_arm.s
diff options
context:
space:
mode:
Diffstat (limited to 'lib/rbcodec/codecs/libopus/celt/arm/celt_pitch_xcorr_arm.s')
-rw-r--r-- lib/rbcodec/codecs/libopus/celt/arm/celt_pitch_xcorr_arm.s | 551
1 files changed, 551 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libopus/celt/arm/celt_pitch_xcorr_arm.s b/lib/rbcodec/codecs/libopus/celt/arm/celt_pitch_xcorr_arm.s
new file mode 100644
index 0000000000..6e873afc37
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/arm/celt_pitch_xcorr_arm.s
@@ -0,0 +1,551 @@
1; Copyright (c) 2007-2008 CSIRO
2; Copyright (c) 2007-2009 Xiph.Org Foundation
3; Copyright (c) 2013 Parrot
4; Written by Aurélien Zanelli
5;
6; Redistribution and use in source and binary forms, with or without
7; modification, are permitted provided that the following conditions
8; are met:
9;
10; - Redistributions of source code must retain the above copyright
11; notice, this list of conditions and the following disclaimer.
12;
13; - Redistributions in binary form must reproduce the above copyright
14; notice, this list of conditions and the following disclaimer in the
15; documentation and/or other materials provided with the distribution.
16;
17; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
21; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
25; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29 AREA |.text|, CODE, READONLY
30
31 GET celt/arm/armopts.s
32
33IF OPUS_ARM_MAY_HAVE_EDSP
34 EXPORT celt_pitch_xcorr_edsp
35ENDIF
36
37IF OPUS_ARM_MAY_HAVE_NEON
38 EXPORT celt_pitch_xcorr_neon
39ENDIF
40
41IF OPUS_ARM_MAY_HAVE_NEON
42
43; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
44xcorr_kernel_neon PROC
45xcorr_kernel_neon_start
46 ; input:
47 ; r3 = int len
48 ; r4 = opus_val16 *x
49 ; r5 = opus_val16 *y
50 ; q0 = opus_val32 sum[4]
51 ; output:
52 ; q0 = opus_val32 sum[4]
53 ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
54 ; internal usage:
55 ; r12 = int j
56 ; d3 = y_3|y_2|y_1|y_0
57 ; q2 = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
58 ; q3 = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
59 ; q8 = scratch
60 ;
61 ; Load y[0...3]
62 ; This requires len>0 to always be valid (which we assert in the C code).
63 VLD1.16 {d5}, [r5]!
64 SUBS r12, r3, #8
65 BLE xcorr_kernel_neon_process4
66; Process 8 samples at a time.
67; This loop loads one y value more than we actually need. Therefore we have to
68; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
69; reading past the end of the array.
70xcorr_kernel_neon_process8
71 ; This loop has 19 total instructions (10 cycles to issue, minimum), with
72 ; - 2 cycles of ARM instructions,
73 ; - 10 cycles of load/store/byte permute instructions, and
74 ; - 9 cycles of data processing instructions.
75 ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
76 ; latter two categories, meaning the whole loop should run in 10 cycles per
77 ; iteration, barring cache misses.
78 ;
79 ; Load x[0...7]
80 VLD1.16 {d6, d7}, [r4]!
81 ; Unlike VMOV, VAND is a data processing instruction (and doesn't get
82 ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
83 VAND d3, d5, d5
84 SUBS r12, r12, #8
85 ; Load y[4...11]
86 VLD1.16 {d4, d5}, [r5]!
87 VMLAL.S16 q0, d3, d6[0]
88 VEXT.16 d16, d3, d4, #1
89 VMLAL.S16 q0, d4, d7[0]
90 VEXT.16 d17, d4, d5, #1
91 VMLAL.S16 q0, d16, d6[1]
92 VEXT.16 d16, d3, d4, #2
93 VMLAL.S16 q0, d17, d7[1]
94 VEXT.16 d17, d4, d5, #2
95 VMLAL.S16 q0, d16, d6[2]
96 VEXT.16 d16, d3, d4, #3
97 VMLAL.S16 q0, d17, d7[2]
98 VEXT.16 d17, d4, d5, #3
99 VMLAL.S16 q0, d16, d6[3]
100 VMLAL.S16 q0, d17, d7[3]
101 BGT xcorr_kernel_neon_process8
102; Process 4 samples here if we have > 4 left (still reading one extra y value).
103xcorr_kernel_neon_process4
104 ADDS r12, r12, #4
105 BLE xcorr_kernel_neon_process2
106 ; Load x[0...3]
107 VLD1.16 d6, [r4]!
108 ; Use VAND since it's a data processing instruction again.
109 VAND d4, d5, d5
110 SUB r12, r12, #4
111 ; Load y[4...7]
112 VLD1.16 d5, [r5]!
113 VMLAL.S16 q0, d4, d6[0]
114 VEXT.16 d16, d4, d5, #1
115 VMLAL.S16 q0, d16, d6[1]
116 VEXT.16 d16, d4, d5, #2
117 VMLAL.S16 q0, d16, d6[2]
118 VEXT.16 d16, d4, d5, #3
119 VMLAL.S16 q0, d16, d6[3]
120; Process 2 samples here if we have > 2 left (still reading one extra y value).
121xcorr_kernel_neon_process2
122 ADDS r12, r12, #2
123 BLE xcorr_kernel_neon_process1
124 ; Load x[0...1]
125 VLD2.16 {d6[],d7[]}, [r4]!
126 ; Use VAND since it's a data processing instruction again.
127 VAND d4, d5, d5
128 SUB r12, r12, #2
129 ; Load y[4...5]
130 VLD1.32 {d5[]}, [r5]!
131 VMLAL.S16 q0, d4, d6
132 VEXT.16 d16, d4, d5, #1
133 ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
134 ; instead of VEXT, since it's a data-processing instruction.
135 VSRI.64 d5, d4, #32
136 VMLAL.S16 q0, d16, d7
137; Process 1 sample using the extra y value we loaded above.
138xcorr_kernel_neon_process1
139 ; Load next *x
140 VLD1.16 {d6[]}, [r4]!
141 ADDS r12, r12, #1
142 ; y[0...3] are left in d5 from prior iteration(s) (if any)
143 VMLAL.S16 q0, d5, d6
144 MOVLE pc, lr ; Return if no final odd sample remains (r12 <= 0).
145; Now process 1 last sample, not reading ahead.
146 ; Load last *y
147 VLD1.16 {d4[]}, [r5]!
148 VSRI.64 d4, d5, #16
149 ; Load last *x
150 VLD1.16 {d6[]}, [r4]!
151 VMLAL.S16 q0, d4, d6
152 MOV pc, lr ; Return.
153 ENDP
154
155; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
156; opus_val32 *xcorr, int len, int max_pitch, int arch)
157celt_pitch_xcorr_neon PROC
158 ; input:
159 ; r0 = opus_val16 *_x
160 ; r1 = opus_val16 *_y
161 ; r2 = opus_val32 *xcorr
162 ; r3 = int len
163 ; output:
164 ; r0 = int maxcorr
165 ; internal usage:
166 ; r4 = opus_val16 *x (for xcorr_kernel_neon())
167 ; r5 = opus_val16 *y (for xcorr_kernel_neon())
168 ; r6 = int max_pitch
169 ; r12 = int j
170 ; q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon())
171 ; ignored:
172 ; int arch
173 STMFD sp!, {r4-r6, lr}
174 LDR r6, [sp, #16] ; max_pitch: first stack arg, above the 4 saved registers.
175 VMOV.S32 q15, #1 ; maxcorr[0..3] = 1
176 ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
177 SUBS r6, r6, #4
178 BLT celt_pitch_xcorr_neon_process4_done
179celt_pitch_xcorr_neon_process4
180 ; xcorr_kernel_neon parameters:
181 ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
182 MOV r4, r0
183 MOV r5, r1
184 VEOR q0, q0, q0
185 ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
186 ; So we don't save/restore any other registers.
187 BL xcorr_kernel_neon_start
188 SUBS r6, r6, #4
189 VST1.32 {q0}, [r2]!
190 ; _y += 4
191 ADD r1, r1, #8
192 VMAX.S32 q15, q15, q0
193 ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
194 BGE celt_pitch_xcorr_neon_process4
195; We have less than 4 sums left to compute.
196celt_pitch_xcorr_neon_process4_done
197 ADDS r6, r6, #4
198 ; Reduce maxcorr to a single value
199 VMAX.S32 d30, d30, d31
200 VPMAX.S32 d30, d30, d30
201 ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
202 BLE celt_pitch_xcorr_neon_done
203; Now compute each remaining sum one at a time.
204celt_pitch_xcorr_neon_process_remaining
205 MOV r4, r0
206 MOV r5, r1
207 VMOV.I32 q0, #0
208 SUBS r12, r3, #8
209 BLT celt_pitch_xcorr_neon_process_remaining4
210; Sum terms 8 at a time.
211celt_pitch_xcorr_neon_process_remaining_loop8
212 ; Load x[0...7]
213 VLD1.16 {q1}, [r4]!
214 ; Load y[0...7]
215 VLD1.16 {q2}, [r5]!
216 SUBS r12, r12, #8
217 VMLAL.S16 q0, d4, d2
218 VMLAL.S16 q0, d5, d3
219 BGE celt_pitch_xcorr_neon_process_remaining_loop8
220; Sum terms 4 at a time.
221celt_pitch_xcorr_neon_process_remaining4
222 ADDS r12, r12, #4
223 BLT celt_pitch_xcorr_neon_process_remaining4_done
224 ; Load x[0...3]
225 VLD1.16 {d2}, [r4]!
226 ; Load y[0...3]
227 VLD1.16 {d3}, [r5]!
228 SUB r12, r12, #4
229 VMLAL.S16 q0, d3, d2
230celt_pitch_xcorr_neon_process_remaining4_done
231 ; Reduce the sum to a single value.
232 VADD.S32 d0, d0, d1
233 VPADDL.S32 d0, d0
234 ADDS r12, r12, #4
235 BLE celt_pitch_xcorr_neon_process_remaining_loop_done
236; Sum terms 1 at a time.
237celt_pitch_xcorr_neon_process_remaining_loop1
238 VLD1.16 {d2[]}, [r4]!
239 VLD1.16 {d3[]}, [r5]!
240 SUBS r12, r12, #1
241 VMLAL.S16 q0, d2, d3
242 BGT celt_pitch_xcorr_neon_process_remaining_loop1
243celt_pitch_xcorr_neon_process_remaining_loop_done
244 VST1.32 {d0[0]}, [r2]!
245 VMAX.S32 d30, d30, d0
246 SUBS r6, r6, #1
247 ; _y++
248 ADD r1, r1, #2
249 ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
250 BGT celt_pitch_xcorr_neon_process_remaining
251celt_pitch_xcorr_neon_done
252 VMOV.32 r0, d30[0] ; Return the reduced maxcorr in r0.
253 LDMFD sp!, {r4-r6, pc}
254 ENDP
255
256ENDIF
257
258IF OPUS_ARM_MAY_HAVE_EDSP
259
260; This will get used on ARMv7 devices without NEON, so it has been optimized
261; to take advantage of dual-issuing where possible.
262xcorr_kernel_edsp PROC
263xcorr_kernel_edsp_start
264 ; input:
265 ; r3 = int len
266 ; r4 = opus_val16 *_x (must be 32-bit aligned)
267 ; r5 = opus_val16 *_y (must be 32-bit aligned)
268 ; r6...r9 = opus_val32 sum[4]
269 ; output:
270 ; r6...r9 = opus_val32 sum[4]
271 ; preserved: r0-r5
272 ; internal usage
273 ; r2 = int j
274 ; r12,r14 = opus_val16 x[4]
275 ; r10,r11 = opus_val16 y[4]
276 STMFD sp!, {r2,r4,r5,lr}
277 LDR r10, [r5], #4 ; Load y[0...1]
278 SUBS r2, r3, #4 ; j = len-4
279 LDR r11, [r5], #4 ; Load y[2...3]
280 BLE xcorr_kernel_edsp_process4_done
281 LDR r12, [r4], #4 ; Load x[0...1]
282 ; Stall
283xcorr_kernel_edsp_process4
284 ; The multiplies must issue from pipeline 0, and can't dual-issue with each
285 ; other. Every other instruction here dual-issues with a multiply, and is
286 ; thus "free". There should be no stalls in the body of the loop.
287 SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_0,y_0)
288 LDR r14, [r4], #4 ; Load x[2...3]
289 SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x_0,y_1)
290 SUBS r2, r2, #4 ; j-=4
291 SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_0,y_2)
292 SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x_0,y_3)
293 SMLATT r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_1,y_1)
294 LDR r10, [r5], #4 ; Load y[4...5]
295 SMLATB r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],x_1,y_2)
296 SMLATT r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_1,y_3)
297 SMLATB r9, r12, r10, r9 ; sum[3] = MAC16_16(sum[3],x_1,y_4)
298 LDRGT r12, [r4], #4 ; Load x[0...1]
299 SMLABB r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_2,y_2)
300 SMLABT r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x_2,y_3)
301 SMLABB r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_2,y_4)
302 SMLABT r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x_2,y_5)
303 SMLATT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_3,y_3)
304 LDR r11, [r5], #4 ; Load y[6...7]
305 SMLATB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],x_3,y_4)
306 SMLATT r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_3,y_5)
307 SMLATB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],x_3,y_6)
308 BGT xcorr_kernel_edsp_process4
309xcorr_kernel_edsp_process4_done
310 ADDS r2, r2, #4 ; 0 to 3 samples remain.
311 BLE xcorr_kernel_edsp_done
312 LDRH r12, [r4], #2 ; r12 = *x++
313 SUBS r2, r2, #1 ; j--
314 ; Stall
315 SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_0)
316 LDRHGT r14, [r4], #2 ; r14 = *x++
317 SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x,y_1)
318 SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_2)
319 SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x,y_3)
320 BLE xcorr_kernel_edsp_done
321 SMLABT r6, r14, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_1)
322 SUBS r2, r2, #1 ; j--
323 SMLABB r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x,y_2)
324 LDRH r10, [r5], #2 ; r10 = y_4 = *y++
325 SMLABT r8, r14, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_3)
326 LDRHGT r12, [r4], #2 ; r12 = *x++
327 SMLABB r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x,y_4)
328 BLE xcorr_kernel_edsp_done
329 SMLABB r6, r12, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_2)
330 CMP r2, #1 ; j--
331 SMLABT r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_3)
332 LDRH r2, [r5], #2 ; r2 = y_5 = *y++ (j is no longer needed)
333 SMLABB r8, r12, r10, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_4)
334 LDRHGT r14, [r4] ; r14 = *x
335 SMLABB r9, r12, r2, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_5)
336 BLE xcorr_kernel_edsp_done
337 SMLABT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_3)
338 LDRH r11, [r5] ; r11 = y_6 = *y
339 SMLABB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_4)
340 SMLABB r8, r14, r2, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_5)
341 SMLABB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_6)
342xcorr_kernel_edsp_done
343 LDMFD sp!, {r2,r4,r5,pc}
344 ENDP
345
346celt_pitch_xcorr_edsp PROC
347 ; input:
348 ; r0 = opus_val16 *_x (must be 32-bit aligned)
349 ; r1 = opus_val16 *_y (only needs to be 16-bit aligned)
350 ; r2 = opus_val32 *xcorr
351 ; r3 = int len
352 ; output:
353 ; r0 = maxcorr
354 ; internal usage
355 ; r4 = opus_val16 *x
356 ; r5 = opus_val16 *y
357 ; r6 = opus_val32 sum0
358 ; r7 = opus_val32 sum1
359 ; r8 = opus_val32 sum2
360 ; r9 = opus_val32 sum3
361 ; r1 = int max_pitch
362 ; r12 = int j
363 ; ignored:
364 ; int arch
365 STMFD sp!, {r4-r11, lr}
366 MOV r5, r1
367 LDR r1, [sp, #36] ; max_pitch: first stack arg, above the 9 saved registers.
368 MOV r4, r0
369 TST r5, #3 ; Is _y already 32-bit aligned?
370 ; maxcorr = 1
371 MOV r0, #1
372 BEQ celt_pitch_xcorr_edsp_process1u_done
373; Compute one sum at the start to make y 32-bit aligned.
374 SUBS r12, r3, #4
375 ; r14 = sum = 0
376 MOV r14, #0
377 LDRH r8, [r5], #2
378 BLE celt_pitch_xcorr_edsp_process1u_loop4_done
379 LDR r6, [r4], #4
380 MOV r8, r8, LSL #16 ; Keep the unaligned y value in the top half of r8.
381celt_pitch_xcorr_edsp_process1u_loop4
382 LDR r9, [r5], #4
383 SMLABT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0)
384 LDR r7, [r4], #4
385 SMLATB r14, r6, r9, r14 ; sum = MAC16_16(sum, x_1, y_1)
386 LDR r8, [r5], #4
387 SMLABT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2)
388 SUBS r12, r12, #4 ; j-=4
389 SMLATB r14, r7, r8, r14 ; sum = MAC16_16(sum, x_3, y_3)
390 LDRGT r6, [r4], #4
391 BGT celt_pitch_xcorr_edsp_process1u_loop4
392 MOV r8, r8, LSR #16 ; Move the pending y value back to the bottom half.
393celt_pitch_xcorr_edsp_process1u_loop4_done
394 ADDS r12, r12, #4
395celt_pitch_xcorr_edsp_process1u_loop1
396 LDRHGE r6, [r4], #2
397 ; Stall
398 SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y)
399 SUBSGE r12, r12, #1
400 LDRHGT r8, [r5], #2
401 BGT celt_pitch_xcorr_edsp_process1u_loop1
402 ; Restore _x
403 SUB r4, r4, r3, LSL #1
404 ; Restore and advance _y
405 SUB r5, r5, r3, LSL #1
406 ; maxcorr = max(maxcorr, sum)
407 CMP r0, r14
408 ADD r5, r5, #2
409 MOVLT r0, r14
410 SUBS r1, r1, #1
411 ; xcorr[i] = sum
412 STR r14, [r2], #4
413 BLE celt_pitch_xcorr_edsp_done
414celt_pitch_xcorr_edsp_process1u_done
415 ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
416 SUBS r1, r1, #4
417 BLT celt_pitch_xcorr_edsp_process2
418celt_pitch_xcorr_edsp_process4
419 ; xcorr_kernel_edsp parameters:
420 ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
421 MOV r6, #0
422 MOV r7, #0
423 MOV r8, #0
424 MOV r9, #0
425 BL xcorr_kernel_edsp_start ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
426 ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
427 CMP r0, r6
428 ; _y+=4
429 ADD r5, r5, #8
430 MOVLT r0, r6
431 CMP r0, r7
432 MOVLT r0, r7
433 CMP r0, r8
434 MOVLT r0, r8
435 CMP r0, r9
436 MOVLT r0, r9
437 STMIA r2!, {r6-r9} ; xcorr[i..i+3] = sum[0..3]
438 SUBS r1, r1, #4
439 BGE celt_pitch_xcorr_edsp_process4
440celt_pitch_xcorr_edsp_process2
441 ADDS r1, r1, #2
442 BLT celt_pitch_xcorr_edsp_process1a
443 SUBS r12, r3, #4
444 ; {r10, r11} = {sum0, sum1} = {0, 0}
445 MOV r10, #0
446 MOV r11, #0
447 LDR r8, [r5], #4
448 BLE celt_pitch_xcorr_edsp_process2_loop_done
449 LDR r6, [r4], #4
450 LDR r9, [r5], #4
451celt_pitch_xcorr_edsp_process2_loop4
452 SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0)
453 LDR r7, [r4], #4
454 SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1)
455 SUBS r12, r12, #4 ; j-=4
456 SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1)
457 LDR r8, [r5], #4
458 SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2)
459 LDRGT r6, [r4], #4
460 SMLABB r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_2, y_2)
461 SMLABT r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_2, y_3)
462 SMLATT r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_3, y_3)
463 LDRGT r9, [r5], #4
464 SMLATB r11, r7, r8, r11 ; sum1 = MAC16_16(sum1, x_3, y_4)
465 BGT celt_pitch_xcorr_edsp_process2_loop4
466celt_pitch_xcorr_edsp_process2_loop_done
467 ADDS r12, r12, #2
468 BLE celt_pitch_xcorr_edsp_process2_1
469 LDR r6, [r4], #4
470 ; Stall
471 SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0)
472 LDR r9, [r5], #4
473 SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1)
474 SUB r12, r12, #2
475 SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1)
476 MOV r8, r9
477 SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2)
478celt_pitch_xcorr_edsp_process2_1
479 LDRH r6, [r4], #2
480 ADDS r12, r12, #1
481 ; Stall
482 SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0)
483 LDRHGT r7, [r4], #2
484 SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1)
485 BLE celt_pitch_xcorr_edsp_process2_done
486 LDRH r9, [r5], #2
487 SMLABT r10, r7, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_1)
488 SMLABB r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_0, y_2)
489celt_pitch_xcorr_edsp_process2_done
490 ; Restore _x
491 SUB r4, r4, r3, LSL #1
492 ; Restore and advance _y
493 SUB r5, r5, r3, LSL #1
494 ; maxcorr = max(maxcorr, sum0)
495 CMP r0, r10
496 ADD r5, r5, #2
497 MOVLT r0, r10
498 SUB r1, r1, #2
499 ; maxcorr = max(maxcorr, sum1)
500 CMP r0, r11
501 ; xcorr[i] = sum
502 STR r10, [r2], #4
503 MOVLT r0, r11
504 STR r11, [r2], #4
505celt_pitch_xcorr_edsp_process1a
506 ADDS r1, r1, #1
507 BLT celt_pitch_xcorr_edsp_done
508 SUBS r12, r3, #4
509 ; r14 = sum = 0
510 MOV r14, #0
511 BLT celt_pitch_xcorr_edsp_process1a_loop_done
512 LDR r6, [r4], #4
513 LDR r8, [r5], #4
514 LDR r7, [r4], #4
515 LDR r9, [r5], #4
516celt_pitch_xcorr_edsp_process1a_loop4
517 SMLABB r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0)
518 SUBS r12, r12, #4 ; j-=4
519 SMLATT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1)
520 LDRGE r6, [r4], #4
521 SMLABB r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2)
522 LDRGE r8, [r5], #4
523 SMLATT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_3, y_3)
524 LDRGE r7, [r4], #4
525 LDRGE r9, [r5], #4
526 BGE celt_pitch_xcorr_edsp_process1a_loop4
527celt_pitch_xcorr_edsp_process1a_loop_done
528 ADDS r12, r12, #2
529 LDRGE r6, [r4], #4
530 LDRGE r8, [r5], #4
531 ; Stall
532 SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0)
533 SUBGE r12, r12, #2
534 SMLATTGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1)
535 ADDS r12, r12, #1
536 LDRHGE r6, [r4], #2
537 LDRHGE r8, [r5], #2
538 ; Stall
539 SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y)
540 ; maxcorr = max(maxcorr, sum)
541 CMP r0, r14
542 ; xcorr[i] = sum
543 STR r14, [r2], #4
544 MOVLT r0, r14
545celt_pitch_xcorr_edsp_done
546 LDMFD sp!, {r4-r11, pc}
547 ENDP
548
549ENDIF
550
551END