diff options
Diffstat (limited to 'lib/rbcodec/codecs/libopus/celt/arm/celt_pitch_xcorr_arm.s')
-rw-r--r-- | lib/rbcodec/codecs/libopus/celt/arm/celt_pitch_xcorr_arm.s | 551 |
1 files changed, 551 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libopus/celt/arm/celt_pitch_xcorr_arm.s b/lib/rbcodec/codecs/libopus/celt/arm/celt_pitch_xcorr_arm.s new file mode 100644 index 0000000000..6e873afc37 --- /dev/null +++ b/lib/rbcodec/codecs/libopus/celt/arm/celt_pitch_xcorr_arm.s | |||
@@ -0,0 +1,551 @@ | |||
1 | ; Copyright (c) 2007-2008 CSIRO | ||
2 | ; Copyright (c) 2007-2009 Xiph.Org Foundation | ||
3 | ; Copyright (c) 2013 Parrot | ||
4 | ; Written by Aurélien Zanelli | ||
5 | ; | ||
6 | ; Redistribution and use in source and binary forms, with or without | ||
7 | ; modification, are permitted provided that the following conditions | ||
8 | ; are met: | ||
9 | ; | ||
10 | ; - Redistributions of source code must retain the above copyright | ||
11 | ; notice, this list of conditions and the following disclaimer. | ||
12 | ; | ||
13 | ; - Redistributions in binary form must reproduce the above copyright | ||
14 | ; notice, this list of conditions and the following disclaimer in the | ||
15 | ; documentation and/or other materials provided with the distribution. | ||
16 | ; | ||
17 | ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
18 | ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
19 | ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
20 | ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | ||
21 | ; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
22 | ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
23 | ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
24 | ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
25 | ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
26 | ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
27 | ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
28 | |||
29 | AREA |.text|, CODE, READONLY | ||
30 | |||
31 | GET celt/arm/armopts.s | ||
32 | |||
33 | IF OPUS_ARM_MAY_HAVE_EDSP | ||
34 | EXPORT celt_pitch_xcorr_edsp | ||
35 | ENDIF | ||
36 | |||
37 | IF OPUS_ARM_MAY_HAVE_NEON | ||
38 | EXPORT celt_pitch_xcorr_neon | ||
39 | ENDIF | ||
40 | |||
41 | IF OPUS_ARM_MAY_HAVE_NEON | ||
42 | |||
43 | ; Compute sum[k]+=sum(x[j]*y[j+k],j=0...len-1), k=0...3 (accumulated into q0). | ||
44 | xcorr_kernel_neon PROC | ||
45 | xcorr_kernel_neon_start | ||
46 | ; input: | ||
47 | ; r3 = int len | ||
48 | ; r4 = opus_val16 *x (post-incremented by every load) | ||
49 | ; r5 = opus_val16 *y (post-incremented by every load) | ||
50 | ; q0 = opus_val32 sum[4] | ||
51 | ; output: | ||
52 | ; q0 = opus_val32 sum[4] | ||
53 | ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15 | ||
54 | ; internal usage: | ||
55 | ; r12 = int j (countdown of samples left, derived from len) | ||
56 | ; d3 = y_3|y_2|y_1|y_0 | ||
57 | ; q2 = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4 | ||
58 | ; q3 = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0 | ||
59 | ; q8 = scratch (d16/d17 receive the VEXT-shifted y vectors) | ||
60 | ; | ||
61 | ; Load y[0...3] into d5. | ||
62 | ; This requires len>0 to always be valid (which we assert in the C code). | ||
63 | VLD1.16 {d5}, [r5]! | ||
64 | SUBS r12, r3, #8 | ||
65 | BLE xcorr_kernel_neon_process4 | ||
66 | ; Process 8 samples at a time. | ||
67 | ; This loop loads one y value more than we actually need. Therefore we have to | ||
68 | ; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid | ||
69 | ; reading past the end of the array. | ||
70 | xcorr_kernel_neon_process8 | ||
71 | ; This loop has 19 total instructions (10 cycles to issue, minimum), with | ||
72 | ; - 2 cycles of ARM instructions, | ||
73 | ; - 10 cycles of load/store/byte permute instructions, and | ||
74 | ; - 9 cycles of data processing instructions. | ||
75 | ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the | ||
76 | ; latter two categories, meaning the whole loop should run in 10 cycles per | ||
77 | ; iteration, barring cache misses. | ||
78 | ; | ||
79 | ; Load x[0...7] | ||
80 | VLD1.16 {d6, d7}, [r4]! | ||
81 | ; Unlike VMOV, VAND is a data processing instruction (and doesn't get | ||
82 | ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1. | ||
83 | VAND d3, d5, d5 | ||
84 | SUBS r12, r12, #8 | ||
85 | ; Load y[4...11] | ||
86 | VLD1.16 {d4, d5}, [r5]! | ||
87 | VMLAL.S16 q0, d3, d6[0] | ||
88 | VEXT.16 d16, d3, d4, #1 | ||
89 | VMLAL.S16 q0, d4, d7[0] | ||
90 | VEXT.16 d17, d4, d5, #1 | ||
91 | VMLAL.S16 q0, d16, d6[1] | ||
92 | VEXT.16 d16, d3, d4, #2 | ||
93 | VMLAL.S16 q0, d17, d7[1] | ||
94 | VEXT.16 d17, d4, d5, #2 | ||
95 | VMLAL.S16 q0, d16, d6[2] | ||
96 | VEXT.16 d16, d3, d4, #3 | ||
97 | VMLAL.S16 q0, d17, d7[2] | ||
98 | VEXT.16 d17, d4, d5, #3 | ||
99 | VMLAL.S16 q0, d16, d6[3] | ||
100 | VMLAL.S16 q0, d17, d7[3] | ||
101 | BGT xcorr_kernel_neon_process8 | ||
102 | ; Process 4 samples here if we have > 4 left (still reading one extra y value). | ||
103 | xcorr_kernel_neon_process4 | ||
104 | ADDS r12, r12, #4 | ||
105 | BLE xcorr_kernel_neon_process2 | ||
106 | ; Load x[0...3] | ||
107 | VLD1.16 d6, [r4]! | ||
108 | ; Use VAND since it's a data processing instruction again. | ||
109 | VAND d4, d5, d5 | ||
110 | SUB r12, r12, #4 | ||
111 | ; Load y[4...7] | ||
112 | VLD1.16 d5, [r5]! | ||
113 | VMLAL.S16 q0, d4, d6[0] | ||
114 | VEXT.16 d16, d4, d5, #1 | ||
115 | VMLAL.S16 q0, d16, d6[1] | ||
116 | VEXT.16 d16, d4, d5, #2 | ||
117 | VMLAL.S16 q0, d16, d6[2] | ||
118 | VEXT.16 d16, d4, d5, #3 | ||
119 | VMLAL.S16 q0, d16, d6[3] | ||
120 | ; Process 2 samples here if we have > 2 left (still reading one extra y value). | ||
121 | xcorr_kernel_neon_process2 | ||
122 | ADDS r12, r12, #2 | ||
123 | BLE xcorr_kernel_neon_process1 | ||
124 | ; Load x[0...1], each broadcast to all lanes (x_0 in d6, x_1 in d7) | ||
125 | VLD2.16 {d6[],d7[]}, [r4]! | ||
126 | ; Use VAND since it's a data processing instruction again. | ||
127 | VAND d4, d5, d5 | ||
128 | SUB r12, r12, #2 | ||
129 | ; Load y[4...5] | ||
130 | VLD1.32 {d5[]}, [r5]! | ||
131 | VMLAL.S16 q0, d4, d6 | ||
132 | VEXT.16 d16, d4, d5, #1 | ||
133 | ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI | ||
134 | ; instead of VEXT, since it's a data-processing instruction. | ||
135 | VSRI.64 d5, d4, #32 | ||
136 | VMLAL.S16 q0, d16, d7 | ||
137 | ; Process 1 sample using the extra y value we loaded above. | ||
138 | xcorr_kernel_neon_process1 | ||
139 | ; Load next *x | ||
140 | VLD1.16 {d6[]}, [r4]! | ||
141 | ADDS r12, r12, #1 | ||
142 | ; y[0...3] are left in d5 from prior iteration(s) (if any) | ||
143 | VMLAL.S16 q0, d5, d6 | ||
144 | MOVLE pc, lr | ||
145 | ; Now process 1 last sample, not reading ahead. | ||
146 | ; Load last *y | ||
147 | VLD1.16 {d4[]}, [r5]! | ||
148 | VSRI.64 d4, d5, #16 | ||
149 | ; Load last *x | ||
150 | VLD1.16 {d6[]}, [r4]! | ||
151 | VMLAL.S16 q0, d4, d6 | ||
152 | MOV pc, lr | ||
153 | ENDP | ||
154 | |||
155 | ; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y, | ||
156 | ; opus_val32 *xcorr, int len, int max_pitch, int arch) | ||
157 | celt_pitch_xcorr_neon PROC | ||
158 | ; input: | ||
159 | ; r0 = opus_val16 *_x | ||
160 | ; r1 = opus_val16 *_y | ||
161 | ; r2 = opus_val32 *xcorr | ||
162 | ; r3 = int len | ||
163 | ; output: | ||
164 | ; r0 = int maxcorr | ||
165 | ; internal usage: | ||
166 | ; r4 = opus_val16 *x (for xcorr_kernel_neon()) | ||
167 | ; r5 = opus_val16 *y (for xcorr_kernel_neon()) | ||
168 | ; r6 = int max_pitch (fifth argument, loaded from the stack) | ||
169 | ; r12 = int j | ||
170 | ; q15 = int maxcorr[4], every lane starts at 1 (q15 is not used by xcorr_kernel_neon()) | ||
171 | ; ignored: | ||
172 | ; int arch | ||
173 | STMFD sp!, {r4-r6, lr} | ||
174 | LDR r6, [sp, #16] | ||
175 | VMOV.S32 q15, #1 | ||
176 | ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done | ||
177 | SUBS r6, r6, #4 | ||
178 | BLT celt_pitch_xcorr_neon_process4_done | ||
179 | celt_pitch_xcorr_neon_process4 | ||
180 | ; xcorr_kernel_neon parameters: | ||
181 | ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0} | ||
182 | MOV r4, r0 | ||
183 | MOV r5, r1 | ||
184 | VEOR q0, q0, q0 | ||
185 | ; xcorr_kernel_neon only modifies r4, r5, r12, q0...q3, and q8 (d16/d17). | ||
186 | ; None of those hold live values here, so we don't save/restore any registers. | ||
187 | BL xcorr_kernel_neon_start | ||
188 | SUBS r6, r6, #4 | ||
189 | VST1.32 {q0}, [r2]! | ||
190 | ; _y += 4 | ||
191 | ADD r1, r1, #8 | ||
192 | VMAX.S32 q15, q15, q0 | ||
193 | ; if (max_pitch >= 4) goto celt_pitch_xcorr_neon_process4 (flags from the SUBS above) | ||
194 | BGE celt_pitch_xcorr_neon_process4 | ||
195 | ; We have fewer than 4 sums left to compute. | ||
196 | celt_pitch_xcorr_neon_process4_done | ||
197 | ADDS r6, r6, #4 | ||
198 | ; Reduce maxcorr to a single value | ||
199 | VMAX.S32 d30, d30, d31 | ||
200 | VPMAX.S32 d30, d30, d30 | ||
201 | ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done | ||
202 | BLE celt_pitch_xcorr_neon_done | ||
203 | ; Now compute each remaining sum one at a time. | ||
204 | celt_pitch_xcorr_neon_process_remaining | ||
205 | MOV r4, r0 | ||
206 | MOV r5, r1 | ||
207 | VMOV.I32 q0, #0 | ||
208 | SUBS r12, r3, #8 | ||
209 | BLT celt_pitch_xcorr_neon_process_remaining4 | ||
210 | ; Sum terms 8 at a time. | ||
211 | celt_pitch_xcorr_neon_process_remaining_loop8 | ||
212 | ; Load x[0...7] | ||
213 | VLD1.16 {q1}, [r4]! | ||
214 | ; Load y[0...7] | ||
215 | VLD1.16 {q2}, [r5]! | ||
216 | SUBS r12, r12, #8 | ||
217 | VMLAL.S16 q0, d4, d2 | ||
218 | VMLAL.S16 q0, d5, d3 | ||
219 | BGE celt_pitch_xcorr_neon_process_remaining_loop8 | ||
220 | ; Sum terms 4 at a time. | ||
221 | celt_pitch_xcorr_neon_process_remaining4 | ||
222 | ADDS r12, r12, #4 | ||
223 | BLT celt_pitch_xcorr_neon_process_remaining4_done | ||
224 | ; Load x[0...3] | ||
225 | VLD1.16 {d2}, [r4]! | ||
226 | ; Load y[0...3] | ||
227 | VLD1.16 {d3}, [r5]! | ||
228 | SUB r12, r12, #4 | ||
229 | VMLAL.S16 q0, d3, d2 | ||
230 | celt_pitch_xcorr_neon_process_remaining4_done | ||
231 | ; Reduce the sum to a single value (only the low word, d0[0], is stored below). | ||
232 | VADD.S32 d0, d0, d1 | ||
233 | VPADDL.S32 d0, d0 | ||
234 | ADDS r12, r12, #4 | ||
235 | BLE celt_pitch_xcorr_neon_process_remaining_loop_done | ||
236 | ; Sum terms 1 at a time. | ||
237 | celt_pitch_xcorr_neon_process_remaining_loop1 | ||
238 | VLD1.16 {d2[]}, [r4]! | ||
239 | VLD1.16 {d3[]}, [r5]! | ||
240 | SUBS r12, r12, #1 | ||
241 | VMLAL.S16 q0, d2, d3 | ||
242 | BGT celt_pitch_xcorr_neon_process_remaining_loop1 | ||
243 | celt_pitch_xcorr_neon_process_remaining_loop_done | ||
244 | VST1.32 {d0[0]}, [r2]! | ||
245 | VMAX.S32 d30, d30, d0 | ||
246 | SUBS r6, r6, #1 | ||
247 | ; _y++ | ||
248 | ADD r1, r1, #2 | ||
249 | ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining | ||
250 | BGT celt_pitch_xcorr_neon_process_remaining | ||
251 | celt_pitch_xcorr_neon_done | ||
252 | VMOV.32 r0, d30[0] | ||
253 | LDMFD sp!, {r4-r6, pc} | ||
254 | ENDP | ||
255 | |||
256 | ENDIF | ||
257 | |||
258 | IF OPUS_ARM_MAY_HAVE_EDSP | ||
259 | |||
260 | ; This will get used on ARMv7 devices without NEON, so it has been optimized | ||
261 | ; to take advantage of dual-issuing where possible. | ||
262 | xcorr_kernel_edsp PROC | ||
263 | xcorr_kernel_edsp_start | ||
264 | ; input: | ||
265 | ; r3 = int len | ||
266 | ; r4 = opus_val16 *_x (must be 32-bit aligned) | ||
267 | ; r5 = opus_val16 *_y (must be 32-bit aligned) | ||
268 | ; r6...r9 = opus_val32 sum[4] | ||
269 | ; output: | ||
270 | ; r6...r9 = opus_val32 sum[4] | ||
271 | ; preserved: r0-r5 (r2, r4 and r5 are saved on the stack and restored on exit) | ||
272 | ; internal usage: | ||
273 | ; r2 = int j | ||
274 | ; r12,r14 = opus_val16 x[4] | ||
275 | ; r10,r11 = opus_val16 y[4] | ||
276 | STMFD sp!, {r2,r4,r5,lr} | ||
277 | LDR r10, [r5], #4 ; Load y[0...1] | ||
278 | SUBS r2, r3, #4 ; j = len-4 | ||
279 | LDR r11, [r5], #4 ; Load y[2...3] | ||
280 | BLE xcorr_kernel_edsp_process4_done | ||
281 | LDR r12, [r4], #4 ; Load x[0...1] | ||
282 | ; Stall | ||
283 | xcorr_kernel_edsp_process4 | ||
284 | ; The multiplies must issue from pipeline 0, and can't dual-issue with each | ||
285 | ; other. Every other instruction here dual-issues with a multiply, and is | ||
286 | ; thus "free". There should be no stalls in the body of the loop. | ||
287 | SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_0,y_0) | ||
288 | LDR r14, [r4], #4 ; Load x[2...3] | ||
289 | SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x_0,y_1) | ||
290 | SUBS r2, r2, #4 ; j-=4 | ||
291 | SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_0,y_2) | ||
292 | SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x_0,y_3) | ||
293 | SMLATT r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_1,y_1) | ||
294 | LDR r10, [r5], #4 ; Load y[4...5] | ||
295 | SMLATB r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],x_1,y_2) | ||
296 | SMLATT r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_1,y_3) | ||
297 | SMLATB r9, r12, r10, r9 ; sum[3] = MAC16_16(sum[3],x_1,y_4) | ||
298 | LDRGT r12, [r4], #4 ; Load x[0...1] of the next iteration (only if j > 0) | ||
299 | SMLABB r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_2,y_2) | ||
300 | SMLABT r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x_2,y_3) | ||
301 | SMLABB r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_2,y_4) | ||
302 | SMLABT r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x_2,y_5) | ||
303 | SMLATT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_3,y_3) | ||
304 | LDR r11, [r5], #4 ; Load y[6...7] | ||
305 | SMLATB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],x_3,y_4) | ||
306 | SMLATT r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_3,y_5) | ||
307 | SMLATB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],x_3,y_6) | ||
308 | BGT xcorr_kernel_edsp_process4 | ||
309 | xcorr_kernel_edsp_process4_done | ||
310 | ADDS r2, r2, #4 | ||
311 | BLE xcorr_kernel_edsp_done | ||
312 | LDRH r12, [r4], #2 ; r12 = *x++ | ||
313 | SUBS r2, r2, #1 ; j-- | ||
314 | ; Stall | ||
315 | SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_0) | ||
316 | LDRHGT r14, [r4], #2 ; r14 = *x++ | ||
317 | SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x,y_1) | ||
318 | SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_2) | ||
319 | SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x,y_3) | ||
320 | BLE xcorr_kernel_edsp_done | ||
321 | SMLABT r6, r14, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_1) | ||
322 | SUBS r2, r2, #1 ; j-- | ||
323 | SMLABB r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x,y_2) | ||
324 | LDRH r10, [r5], #2 ; r10 = y_4 = *y++ | ||
325 | SMLABT r8, r14, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_3) | ||
326 | LDRHGT r12, [r4], #2 ; r12 = *x++ | ||
327 | SMLABB r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x,y_4) | ||
328 | BLE xcorr_kernel_edsp_done | ||
329 | SMLABB r6, r12, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_2) | ||
330 | CMP r2, #1 ; set flags as for j--, without writing j (r2 is reused for y_5 below) | ||
331 | SMLABT r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_3) | ||
332 | LDRH r2, [r5], #2 ; r2 = y_5 = *y++ | ||
333 | SMLABB r8, r12, r10, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_4) | ||
334 | LDRHGT r14, [r4] ; r14 = *x | ||
335 | SMLABB r9, r12, r2, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_5) | ||
336 | BLE xcorr_kernel_edsp_done | ||
337 | SMLABT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_3) | ||
338 | LDRH r11, [r5] ; r11 = y_6 = *y | ||
339 | SMLABB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_4) | ||
340 | SMLABB r8, r14, r2, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_5) | ||
341 | SMLABB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_6) | ||
342 | xcorr_kernel_edsp_done | ||
343 | LDMFD sp!, {r2,r4,r5,pc} | ||
344 | ENDP | ||
345 | |||
346 | celt_pitch_xcorr_edsp PROC | ||
347 | ; input: | ||
348 | ; r0 = opus_val16 *_x (must be 32-bit aligned) | ||
349 | ; r1 = opus_val16 *_y (only needs to be 16-bit aligned) | ||
350 | ; r2 = opus_val32 *xcorr | ||
351 | ; r3 = int len | ||
352 | ; output: | ||
353 | ; r0 = maxcorr | ||
354 | ; internal usage: | ||
355 | ; r4 = opus_val16 *x | ||
356 | ; r5 = opus_val16 *y | ||
357 | ; r6 = opus_val32 sum0 | ||
358 | ; r7 = opus_val32 sum1 | ||
359 | ; r8 = opus_val32 sum2 | ||
360 | ; r9 = opus_val32 sum3 | ||
361 | ; r1 = int max_pitch (loaded from the stack once _y has been moved to r5) | ||
362 | ; r12 = int j | ||
363 | ; ignored: | ||
364 | ; int arch | ||
365 | STMFD sp!, {r4-r11, lr} | ||
366 | MOV r5, r1 | ||
367 | LDR r1, [sp, #36] | ||
368 | MOV r4, r0 | ||
369 | TST r5, #3 | ||
370 | ; maxcorr = 1 | ||
371 | MOV r0, #1 | ||
372 | BEQ celt_pitch_xcorr_edsp_process1u_done | ||
373 | ; Compute one sum at the start to make y 32-bit aligned. | ||
374 | SUBS r12, r3, #4 | ||
375 | ; r14 = sum = 0 | ||
376 | MOV r14, #0 | ||
377 | LDRH r8, [r5], #2 | ||
378 | BLE celt_pitch_xcorr_edsp_process1u_loop4_done | ||
379 | LDR r6, [r4], #4 | ||
380 | MOV r8, r8, LSL #16 | ||
381 | celt_pitch_xcorr_edsp_process1u_loop4 | ||
382 | LDR r9, [r5], #4 | ||
383 | SMLABT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0) | ||
384 | LDR r7, [r4], #4 | ||
385 | SMLATB r14, r6, r9, r14 ; sum = MAC16_16(sum, x_1, y_1) | ||
386 | LDR r8, [r5], #4 | ||
387 | SMLABT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2) | ||
388 | SUBS r12, r12, #4 ; j-=4 | ||
389 | SMLATB r14, r7, r8, r14 ; sum = MAC16_16(sum, x_3, y_3) | ||
390 | LDRGT r6, [r4], #4 | ||
391 | BGT celt_pitch_xcorr_edsp_process1u_loop4 | ||
392 | MOV r8, r8, LSR #16 | ||
393 | celt_pitch_xcorr_edsp_process1u_loop4_done | ||
394 | ADDS r12, r12, #4 | ||
395 | celt_pitch_xcorr_edsp_process1u_loop1 | ||
396 | LDRHGE r6, [r4], #2 | ||
397 | ; Stall | ||
398 | SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y) | ||
399 | SUBSGE r12, r12, #1 | ||
400 | LDRHGT r8, [r5], #2 | ||
401 | BGT celt_pitch_xcorr_edsp_process1u_loop1 | ||
402 | ; Restore _x | ||
403 | SUB r4, r4, r3, LSL #1 | ||
404 | ; Restore and advance _y (together with the ADD below: _y++) | ||
405 | SUB r5, r5, r3, LSL #1 | ||
406 | ; maxcorr = max(maxcorr, sum) | ||
407 | CMP r0, r14 | ||
408 | ADD r5, r5, #2 | ||
409 | MOVLT r0, r14 | ||
410 | SUBS r1, r1, #1 | ||
411 | ; xcorr[i] = sum | ||
412 | STR r14, [r2], #4 | ||
413 | BLE celt_pitch_xcorr_edsp_done | ||
414 | celt_pitch_xcorr_edsp_process1u_done | ||
415 | ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2 | ||
416 | SUBS r1, r1, #4 | ||
417 | BLT celt_pitch_xcorr_edsp_process2 | ||
418 | celt_pitch_xcorr_edsp_process4 | ||
419 | ; xcorr_kernel_edsp parameters: | ||
420 | ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0} | ||
421 | MOV r6, #0 | ||
422 | MOV r7, #0 | ||
423 | MOV r8, #0 | ||
424 | MOV r9, #0 | ||
425 | BL xcorr_kernel_edsp_start ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len); the sums come back in r6...r9 | ||
426 | ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3) | ||
427 | CMP r0, r6 | ||
428 | ; _y+=4 | ||
429 | ADD r5, r5, #8 | ||
430 | MOVLT r0, r6 | ||
431 | CMP r0, r7 | ||
432 | MOVLT r0, r7 | ||
433 | CMP r0, r8 | ||
434 | MOVLT r0, r8 | ||
435 | CMP r0, r9 | ||
436 | MOVLT r0, r9 | ||
437 | STMIA r2!, {r6-r9} | ||
438 | SUBS r1, r1, #4 | ||
439 | BGE celt_pitch_xcorr_edsp_process4 | ||
440 | celt_pitch_xcorr_edsp_process2 | ||
441 | ADDS r1, r1, #2 | ||
442 | BLT celt_pitch_xcorr_edsp_process1a | ||
443 | SUBS r12, r3, #4 | ||
444 | ; {r10, r11} = {sum0, sum1} = {0, 0} | ||
445 | MOV r10, #0 | ||
446 | MOV r11, #0 | ||
447 | LDR r8, [r5], #4 | ||
448 | BLE celt_pitch_xcorr_edsp_process2_loop_done | ||
449 | LDR r6, [r4], #4 | ||
450 | LDR r9, [r5], #4 | ||
451 | celt_pitch_xcorr_edsp_process2_loop4 | ||
452 | SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0) | ||
453 | LDR r7, [r4], #4 | ||
454 | SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1) | ||
455 | SUBS r12, r12, #4 ; j-=4 | ||
456 | SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1) | ||
457 | LDR r8, [r5], #4 | ||
458 | SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2) | ||
459 | LDRGT r6, [r4], #4 | ||
460 | SMLABB r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_2, y_2) | ||
461 | SMLABT r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_2, y_3) | ||
462 | SMLATT r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_3, y_3) | ||
463 | LDRGT r9, [r5], #4 | ||
464 | SMLATB r11, r7, r8, r11 ; sum1 = MAC16_16(sum1, x_3, y_4) | ||
465 | BGT celt_pitch_xcorr_edsp_process2_loop4 | ||
466 | celt_pitch_xcorr_edsp_process2_loop_done | ||
467 | ADDS r12, r12, #2 | ||
468 | BLE celt_pitch_xcorr_edsp_process2_1 | ||
469 | LDR r6, [r4], #4 | ||
470 | ; Stall | ||
471 | SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0) | ||
472 | LDR r9, [r5], #4 | ||
473 | SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1) | ||
474 | SUB r12, r12, #2 | ||
475 | SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1) | ||
476 | MOV r8, r9 | ||
477 | SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2) | ||
478 | celt_pitch_xcorr_edsp_process2_1 | ||
479 | LDRH r6, [r4], #2 | ||
480 | ADDS r12, r12, #1 | ||
481 | ; Stall | ||
482 | SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0) | ||
483 | LDRHGT r7, [r4], #2 | ||
484 | SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1) | ||
485 | BLE celt_pitch_xcorr_edsp_process2_done | ||
486 | LDRH r9, [r5], #2 | ||
487 | SMLABT r10, r7, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1) | ||
488 | SMLABB r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2) | ||
489 | celt_pitch_xcorr_edsp_process2_done | ||
490 | ; Restore _x | ||
491 | SUB r4, r4, r3, LSL #1 | ||
492 | ; Restore and advance _y (together with the ADD below: _y += 2) | ||
493 | SUB r5, r5, r3, LSL #1 | ||
494 | ; maxcorr = max(maxcorr, sum0) | ||
495 | CMP r0, r10 | ||
496 | ADD r5, r5, #2 | ||
497 | MOVLT r0, r10 | ||
498 | SUB r1, r1, #2 | ||
499 | ; maxcorr = max(maxcorr, sum1) | ||
500 | CMP r0, r11 | ||
501 | ; xcorr[i] = sum0, xcorr[i+1] = sum1 | ||
502 | STR r10, [r2], #4 | ||
503 | MOVLT r0, r11 | ||
504 | STR r11, [r2], #4 | ||
505 | celt_pitch_xcorr_edsp_process1a | ||
506 | ADDS r1, r1, #1 | ||
507 | BLT celt_pitch_xcorr_edsp_done | ||
508 | SUBS r12, r3, #4 | ||
509 | ; r14 = sum = 0 | ||
510 | MOV r14, #0 | ||
511 | BLT celt_pitch_xcorr_edsp_process1a_loop_done | ||
512 | LDR r6, [r4], #4 | ||
513 | LDR r8, [r5], #4 | ||
514 | LDR r7, [r4], #4 | ||
515 | LDR r9, [r5], #4 | ||
516 | celt_pitch_xcorr_edsp_process1a_loop4 | ||
517 | SMLABB r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0) | ||
518 | SUBS r12, r12, #4 ; j-=4 | ||
519 | SMLATT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1) | ||
520 | LDRGE r6, [r4], #4 | ||
521 | SMLABB r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2) | ||
522 | LDRGE r8, [r5], #4 | ||
523 | SMLATT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_3, y_3) | ||
524 | LDRGE r7, [r4], #4 | ||
525 | LDRGE r9, [r5], #4 | ||
526 | BGE celt_pitch_xcorr_edsp_process1a_loop4 | ||
527 | celt_pitch_xcorr_edsp_process1a_loop_done | ||
528 | ADDS r12, r12, #2 | ||
529 | LDRGE r6, [r4], #4 | ||
530 | LDRGE r8, [r5], #4 | ||
531 | ; Stall | ||
532 | SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0) | ||
533 | SUBGE r12, r12, #2 | ||
534 | SMLATTGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1) | ||
535 | ADDS r12, r12, #1 | ||
536 | LDRHGE r6, [r4], #2 | ||
537 | LDRHGE r8, [r5], #2 | ||
538 | ; Stall | ||
539 | SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y) | ||
540 | ; maxcorr = max(maxcorr, sum) | ||
541 | CMP r0, r14 | ||
542 | ; xcorr[i] = sum | ||
543 | STR r14, [r2], #4 | ||
544 | MOVLT r0, r14 | ||
545 | celt_pitch_xcorr_edsp_done | ||
546 | LDMFD sp!, {r4-r11, pc} | ||
547 | ENDP | ||
548 | |||
549 | ENDIF | ||
550 | |||
551 | END | ||