summary refs log tree commit diff
path: root/lib/rbcodec/codecs/libmad/imdct_l_arm.S
diff options
context:
space:
mode:
Diffstat (limited to 'lib/rbcodec/codecs/libmad/imdct_l_arm.S')
-rw-r--r-- lib/rbcodec/codecs/libmad/imdct_l_arm.S 1001
1 files changed, 1001 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libmad/imdct_l_arm.S b/lib/rbcodec/codecs/libmad/imdct_l_arm.S
new file mode 100644
index 0000000000..b511ff169d
--- /dev/null
+++ b/lib/rbcodec/codecs/libmad/imdct_l_arm.S
@@ -0,0 +1,1001 @@
1/*****************************************************************************
2* Copyright (C) 2000-2001 Andre McCurdy <armccurdy@yahoo.co.uk>
3*
4* This program is free software; you can redistribute it and/or modify
5* it under the terms of the GNU General Public License as published by
6* the Free Software Foundation; either version 2 of the License, or
7* (at your option) any later version.
8*
9* This program is distributed in the hope that it will be useful,
10* but WITHOUT ANY WARRANTY; without even the implied warranty of
11* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12* GNU General Public License for more details.
13*
14* You should have received a copy of the GNU General Public License
15* along with this program; if not, write to the Free Software
16* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*
18*****************************************************************************
19*
20* Notes:
21*
22*
23*****************************************************************************
24*
25* $Id$
26*
27* 2001/03/24: Andre McCurdy <armccurdy@yahoo.co.uk>
28* - Corrected PIC unsafe loading of address of 'imdct36_long_karray'
29*
30* 2000/09/20: Robert Leslie <rob@mars.org>
31* - Added a global symbol with leading underscore per suggestion of
32* Simon Burge to support linking with the a.out format.
33*
34* 2000/09/15: Robert Leslie <rob@mars.org>
35* - Fixed a small bug where flags were changed before a conditional branch.
36*
37* 2000/09/15: Andre McCurdy <armccurdy@yahoo.co.uk>
38* - Applied Nicolas Pitre's rounding optimisation in all remaining places.
39*
40* 2000/09/09: Nicolas Pitre <nico@cam.org>
41* - Optimized rounding + scaling operations.
42*
43* 2000/08/09: Andre McCurdy <armccurdy@yahoo.co.uk>
44* - Original created.
45*
46****************************************************************************/
47
48#include "config.h"
49
50/*
51 On entry:
52
53 r0 = pointer to 18 element input array
54 r1 = pointer to 36 element output array
55 r2 = windowing block type
56
57
58 Stack frame created during execution of the function:
59
60 Initial Holds:
61 Stack
62 pointer
63 minus:
64
65 0
66 4 lr
67 8 r11
68 12 r10
69 16 r9
70 20 r8
71 24 r7
72 28 r6
73 32 r5
74 36 r4
75
76 40 r2 : windowing block type
77
78 44 ct00 high
79 48 ct00 low
80 52 ct01 high
81 56 ct01 low
82 60 ct04 high
83 64 ct04 low
84 68 ct06 high
85 72 ct06 low
86 76 ct05 high
87 80 ct05 low
88 84 ct03 high
89 88 ct03 low
90 92 -ct05 high
91 96 -ct05 low
92 100 -ct07 high
93 104 -ct07 low
94 108 ct07 high
95 112 ct07 low
96 116 ct02 high
97 120 ct02 low
98*/
99/* Values of the windowing block type argument (r2 on entry). Only START and STOP are compared against in this file. */
100#define BLOCK_MODE_NORMAL 0
101#define BLOCK_MODE_START 1
102#define BLOCK_MODE_STOP 3
103
104/* Byte offsets (4 bytes per word) of the 18 input words, used as [r0, #Xn]. */
105#define X0 0x00
106#define X1 0x04
107#define X2 0x08
108#define X3 0x0C
109#define X4 0x10
110#define X5 0x14
111#define X6 0x18
112#define X7 0x1c
113#define X8 0x20
114#define X9 0x24
115#define X10 0x28
116#define X11 0x2c
117#define X12 0x30
118#define X13 0x34
119#define X14 0x38
120#define X15 0x3c
121#define X16 0x40
122#define X17 0x44
123/* Byte offsets (4 bytes per word) of the 36 output words, used as [r1, #xn]. */
124#define x0 0x00
125#define x1 0x04
126#define x2 0x08
127#define x3 0x0C
128#define x4 0x10
129#define x5 0x14
130#define x6 0x18
131#define x7 0x1c
132#define x8 0x20
133#define x9 0x24
134#define x10 0x28
135#define x11 0x2c
136#define x12 0x30
137#define x13 0x34
138#define x14 0x38
139#define x15 0x3c
140#define x16 0x40
141#define x17 0x44
142#define x18 0x48
143#define x19 0x4c
144#define x20 0x50
145#define x21 0x54
146#define x22 0x58
147#define x23 0x5c
148#define x24 0x60
149#define x25 0x64
150#define x26 0x68
151#define x27 0x6c
152#define x28 0x70
153#define x29 0x74
154#define x30 0x78
155#define x31 0x7c
156#define x32 0x80
157#define x33 0x84
158#define x34 0x88
159#define x35 0x8c
160/* Multiplier constants for the IMDCT sums. Products are accumulated in 64 bits and bits 59..28 are kept, so these presumably carry 28 fraction bits (TODO confirm). */
161#define K00 0x0ffc19fd
162#define K01 0x00b2aa3e
163#define K02 0x0fdcf549
164#define K03 0x0216a2a2
165#define K04 0x0f9ee890
166#define K05 0x03768962
167#define K06 0x0f426cb5
168#define K07 0x04cfb0e2
169#define K08 0x0ec835e8
170#define K09 0x061f78aa
171#define K10 0x0e313245
172#define K11 0x07635284
173#define K12 0x0d7e8807
174#define K13 0x0898c779
175#define K14 0x0cb19346
176#define K15 0x09bd7ca0
177#define K16 0x0bcbe352
178#define K17 0x0acf37ad
179/* 32 bit negation of K02 (K02 + minus_K02 == 0x100000000), loaded directly instead of negating at run time. */
180#define minus_K02 0xf0230ab7
181/* window_l[0..17], as named by the comments in the windowing code. The values repeat the Knn table (WL0 == K01, WL8 == K17, WL9 == K16, WL17 == K00). Per those comments WL1, WL4, WL7, WL10, WL13, WL16 double as window_s[0..5]. */
182#define WL0 0x00b2aa3e
183#define WL1 0x0216a2a2
184#define WL2 0x03768962
185#define WL3 0x04cfb0e2
186#define WL4 0x061f78aa
187#define WL5 0x07635284
188#define WL6 0x0898c779
189#define WL7 0x09bd7ca0
190#define WL8 0x0acf37ad
191#define WL9 0x0bcbe352
192#define WL10 0x0cb19346
193#define WL11 0x0d7e8807
194#define WL12 0x0e313245
195#define WL13 0x0ec835e8
196#define WL14 0x0f426cb5
197#define WL15 0x0f9ee890
198#define WL16 0x0fdcf549
199#define WL17 0x0ffc19fd
200
201
202@*****************************************************************************
203
204
205 .text
206 .align
207
208 .global III_imdct_l
209 .global _III_imdct_l
210
211III_imdct_l:
212_III_imdct_l:
213 @ Both labels name the same entry point. r0 = input X[0..17], r1 = output x[0..35], r2 = block type. This part stores x1, x4, x7, x19, x22, x25 and stacks the ctxx partial sums used by the table driven loop.
214 stmdb sp!, { r2, r4 - r11, lr } @ all callee saved regs, plus arg3 (r2 is read back from [sp, #80] by the windowing code)
215
216 ldr r4, =K08 @ r4 = K08
217 ldr r5, =K09 @ r5 = K09
218 ldr r8, [r0, #X4] @ r8 = X4
219 ldr r9, [r0, #X13] @ r9 = X13
220 rsb r6, r4, #0 @ r6 = -K08
221 rsb r7, r5, #0 @ r7 = -K09
222
223 smull r2, r3, r4, r8 @ r2..r3 = (X4 * K08)
224 smlal r2, r3, r5, r9 @ r2..r3 = (X4 * K08) + (X13 * K09) = ct01
225
226 smull r10, lr, r8, r5 @ r10..lr = (X4 * K09)
227 smlal r10, lr, r9, r6 @ r10..lr = (X4 * K09) + (X13 * -K08) = ct00
228
229 ldr r8, [r0, #X7] @ r8 = X7
230 ldr r9, [r0, #X16] @ r9 = X16
231
232 stmdb sp!, { r2, r3, r10, lr } @ stack ct00_h, ct00_l, ct01_h, ct01_l (listed from the highest address down)
233
234 add r8, r8, r9 @ r8 = (X7 + X16)
235 ldr r9, [r0, #X1] @ r9 = X1
236
237 smlal r2, r3, r6, r8 @ r2..r3 = ct01 + ((X7 + X16) * -K08)
238 smlal r2, r3, r7, r9 @ r2..r3 += (X1 * -K09)
239
240 ldr r7, [r0, #X10] @ r7 = X10
241
242 rsbs r10, r10, #0 @ negate the low word and set the borrow, rsc then finishes the 64 bit negate
243 rsc lr, lr, #0 @ r10..lr = -ct00
244
245 smlal r2, r3, r5, r7 @ r2..r3 += (X10 * K09) = ct06
246
247 smlal r10, lr, r9, r6 @ r10..lr = -ct00 + ( X1 * -K08)
248 smlal r10, lr, r8, r5 @ r10..lr += ((X7 + X16) * K09)
249 smlal r10, lr, r7, r4 @ r10..lr += ( X10 * K08) = ct04
250
251 stmdb sp!, { r2, r3, r10, lr } @ stack ct04_h, ct04_l, ct06_h, ct06_l
252
253 @----
254
255 ldr r7, [r0, #X0]
256 ldr r8, [r0, #X11]
257 ldr r9, [r0, #X12]
258 sub r7, r7, r8
259 sub r7, r7, r9 @ r7 = (X0 - X11 -X12) = ct14
260
261 ldr r9, [r0, #X3]
262 ldr r8, [r0, #X8]
263 ldr r11, [r0, #X15]
264 sub r8, r8, r9
265 add r8, r8, r11 @ r8 = (X8 - X3 + X15) = ct16
266
267 add r11, r7, r8 @ r11 = ct14 + ct16 = ct18
268
269 smlal r2, r3, r6, r11 @ r2..r3 = ct06 + ((X0 - X11 - X3 + X15 + X8 - X12) * -K08)
270
271 ldr r6, [r0, #X2]
272 ldr r9, [r0, #X9]
273 ldr r12, [r0, #X14]
274 sub r6, r6, r9
275 sub r6, r6, r12 @ r6 = (X2 - X9 - X14) = ct15
276
277 ldr r9, [r0, #X5]
278 ldr r12, [r0, #X6]
279 sub r9, r9, r12
280 ldr r12, [r0, #X17]
281 sub r9, r9, r12 @ r9 = (X5 - X6 - X17) = ct17
282
283 add r12, r9, r6 @ r12 = ct15 + ct17 = ct19
284
285 smlal r2, r3, r5, r12 @ r2..r3 += ((X2 - X9 + X5 - X6 - X17 - X14) * K09)
286
287 smlal r10, lr, r11, r5 @ r10..lr = ct04 + (ct18 * K09)
288 smlal r10, lr, r12, r4 @ r10..lr = ct04 + (ct18 * K09) + (ct19 * K08)
289
290 movs r2, r2, lsr #28 @ carry = bit 27 (last bit shifted out), added back by the adc below to round
291 adc r2, r2, r3, lsl #4 @ r2 = bits[59..28] of r2..r3
292 str r2, [r1, #x22] @ store result x22
293
294 movs r10, r10, lsr #28
295 adc r10, r10, lr, lsl #4 @ r10 = bits[59..28] of r10..lr
296 str r10, [r1, #x4] @ store result x4
297
298 @----
299
300 ldmia sp, { r2, r3, r4, r5 } @ r2..r3 = ct06, r4..r5 = ct04 (dont update sp)
301
302 @ r2..r3 = ct06
303 @ r4..r5 = ct04
304 @ r6 = ct15
305 @ r7 = ct14
306 @ r8 = ct16
307 @ r9 = ct17
308 @ r10 = .
309 @ r11 = .
310 @ r12 = .
311 @ lr = .
312
313 ldr r10, =K03 @ r10 = K03
314 ldr lr, =K15 @ lr = K15
315
316 smlal r2, r3, r10, r7 @ r2..r3 = ct06 + (ct14 * K03)
317 smlal r4, r5, lr, r7 @ r4..r5 = ct04 + (ct14 * K15)
318
319 ldr r12, =K14 @ r12 = K14
320 rsb r10, r10, #0 @ r10 = -K03
321
322 smlal r2, r3, lr, r6 @ r2..r3 += (ct15 * K15)
323 smlal r4, r5, r10, r6 @ r4..r5 += (ct15 * -K03)
324 smlal r2, r3, r12, r8 @ r2..r3 += (ct16 * K14)
325
326 ldr r11, =minus_K02 @ r11 = -K02
327 rsb r12, r12, #0 @ r12 = -K14
328
329 smlal r4, r5, r12, r9 @ r4..r5 += (ct17 * -K14)
330 smlal r2, r3, r11, r9 @ r2..r3 += (ct17 * -K02)
331 smlal r4, r5, r11, r8 @ r4..r5 += (ct16 * -K02)
332
333 movs r2, r2, lsr #28
334 adc r2, r2, r3, lsl #4 @ r2 = bits[59..28] of r2..r3
335 str r2, [r1, #x7] @ store result x7
336
337 movs r4, r4, lsr #28
338 adc r4, r4, r5, lsl #4 @ r4 = bits[59..28] of r4..r5
339 str r4, [r1, #x1] @ store result x1
340
341 @----
342
343 ldmia sp, { r2, r3, r4, r5 } @ r2..r3 = ct06, r4..r5 = ct04 (dont update sp)
344
345 @ r2..r3 = ct06
346 @ r4..r5 = ct04
347 @ r6 = ct15
348 @ r7 = ct14
349 @ r8 = ct16
350 @ r9 = ct17
351 @ r10 = -K03
352 @ r11 = -K02
353 @ r12 = -K14
354 @ lr = K15
355
356 rsbs r2, r2, #0
357 rsc r3, r3, #0 @ r2..r3 = -ct06
358
359 smlal r2, r3, r12, r7 @ r2..r3 = -ct06 + (ct14 * -K14)
360 smlal r2, r3, r10, r8 @ r2..r3 += (ct16 * -K03)
361
362 smlal r4, r5, r12, r6 @ r4..r5 = ct04 + (ct15 * -K14)
363 smlal r4, r5, r10, r9 @ r4..r5 += (ct17 * -K03)
364 smlal r4, r5, lr, r8 @ r4..r5 += (ct16 * K15)
365 smlal r4, r5, r11, r7 @ r4..r5 += (ct14 * -K02)
366
367 rsb lr, lr, #0 @ lr = -K15
368 rsb r11, r11, #0 @ r11 = K02
369
370 smlal r2, r3, lr, r9 @ r2..r3 += (ct17 * -K15)
371 smlal r2, r3, r11, r6 @ r2..r3 += (ct15 * K02)
372
373 movs r4, r4, lsr #28
374 adc r4, r4, r5, lsl #4 @ r4 = bits[59..28] of r4..r5
375 str r4, [r1, #x25] @ store result x25
376
377 movs r2, r2, lsr #28
378 adc r2, r2, r3, lsl #4 @ r2 = bits[59..28] of r2..r3
379 str r2, [r1, #x19] @ store result x19
380
381 @----
382
383 ldr r2, [sp, #16] @ r2 = ct01_l
384 ldr r3, [sp, #20] @ r3 = ct01_h
385
386 ldr r6, [r0, #X1]
387 ldr r8, [r0, #X7]
388 ldr r9, [r0, #X10]
389 ldr r7, [r0, #X16]
390
391 rsbs r2, r2, #0
392 rsc r3, r3, #0 @ r2..r3 = -ct01
393
394 mov r4, r2
395 mov r5, r3 @ r4..r5 = -ct01
396
397 @ r2..r3 = -ct01
398 @ r4..r5 = -ct01
399 @ r6 = X1
400 @ r7 = X16
401 @ r8 = X7
402 @ r9 = X10
403 @ r10 = -K03
404 @ r11 = K02
405 @ r12 = -K14
406 @ lr = -K15
407
408 smlal r4, r5, r12, r7 @ r4..r5 = -ct01 + (X16 * -K14)
409 smlal r2, r3, lr, r9 @ r2..r3 = -ct01 + (X10 * -K15)
410
411 smlal r4, r5, r10, r8 @ r4..r5 += (X7 * -K03)
412 smlal r2, r3, r10, r7 @ r2..r3 += (X16 * -K03)
413
414 smlal r4, r5, r11, r9 @ r4..r5 += (X10 * K02)
415 smlal r2, r3, r12, r8 @ r2..r3 += (X7 * -K14)
416
417 rsb lr, lr, #0 @ lr = K15
418 rsb r11, r11, #0 @ r11 = -K02
419
420 smlal r4, r5, lr, r6 @ r4..r5 += (X1 * K15) = ct05
421 smlal r2, r3, r11, r6 @ r2..r3 += (X1 * -K02) = ct03
422
423 stmdb sp!, { r2, r3, r4, r5 } @ stack ct05_h, ct05_l, ct03_h, ct03_l
424
425 rsbs r4, r4, #0
426 rsc r5, r5, #0 @ r4..r5 = -ct05
427
428 stmdb sp!, { r4, r5 } @ stack -ct05_h, -ct05_l
429
430 ldr r2, [sp, #48] @ r2 = ct00_l
431 ldr r3, [sp, #52] @ r3 = ct00_h
432
433 rsb r10, r10, #0 @ r10 = K03
434
435 rsbs r4, r2, #0
436 rsc r5, r3, #0 @ r4..r5 = -ct00
437
438 @ r2..r3 = ct00
439 @ r4..r5 = -ct00
440 @ r6 = X1
441 @ r7 = X16
442 @ r8 = X7
443 @ r9 = X10
444 @ r10 = K03
445 @ r11 = -K02
446 @ r12 = -K14
447 @ lr = K15
448
449 smlal r4, r5, r10, r6 @ r4..r5 = -ct00 + (X1 * K03)
450 smlal r2, r3, r10, r9 @ r2..r3 = ct00 + (X10 * K03)
451
452 smlal r4, r5, r12, r9 @ r4..r5 += (X10 * -K14)
453 smlal r2, r3, r12, r6 @ r2..r3 += (X1 * -K14)
454
455 smlal r4, r5, r11, r7 @ r4..r5 += (X16 * -K02)
456 smlal r4, r5, lr, r8 @ r4..r5 += (X7 * K15) = ct07
457
458 rsb lr, lr, #0 @ lr = -K15
459 rsb r11, r11, #0 @ r11 = K02
460
461 smlal r2, r3, r11, r8 @ r2..r3 += (X7 * K02)
462 smlal r2, r3, lr, r7 @ r2..r3 += (X16 * -K15) = ct02
463
464 rsbs r6, r4, #0
465 rsc r7, r5, #0 @ r6..r7 = -ct07
466
467 stmdb sp!, { r2 - r7 } @ stack -ct07_h, -ct07_l, ct07_h, ct07_l, ct02_h, ct02_l
468
469
470 @----
471
472 add r2, pc, #(imdct36_long_karray-.-8) @ r2 = base address of Knn array, formed pc relative (pc reads as this instruction + 8, hence the -8)
473
474
475loop: @ one pass per 13 word karray row: 12 products, plus the stacked ctxx named by the control word, rounded and stored to one output word
476 ldr r12, [r0, #X0]
477
478 ldmia r2!, { r5 - r11 } @ first 7 words from Karray element
479
480 smull r3, r4, r5, r12 @ sum = (Kxx * X0)
481 ldr r12, [r0, #X2]
482 ldr r5, [r0, #X3]
483 smlal r3, r4, r6, r12 @ sum += (Kxx * X2)
484 ldr r12, [r0, #X5]
485 ldr r6, [r0, #X6]
486 smlal r3, r4, r7, r5 @ sum += (Kxx * X3)
487 smlal r3, r4, r8, r12 @ sum += (Kxx * X5)
488 ldr r12, [r0, #X8]
489 ldr r5, [r0, #X9]
490 smlal r3, r4, r9, r6 @ sum += (Kxx * X6)
491 smlal r3, r4, r10, r12 @ sum += (Kxx * X8)
492 smlal r3, r4, r11, r5 @ sum += (Kxx * X9)
493
494 ldmia r2!, { r5 - r10 } @ final 6 words from Karray element (r10 = control word)
495
496 ldr r11, [r0, #X11]
497 ldr r12, [r0, #X12]
498 smlal r3, r4, r5, r11 @ sum += (Kxx * X11)
499 ldr r11, [r0, #X14]
500 ldr r5, [r0, #X15]
501 smlal r3, r4, r6, r12 @ sum += (Kxx * X12)
502 smlal r3, r4, r7, r11 @ sum += (Kxx * X14)
503 ldr r11, [r0, #X17]
504 smlal r3, r4, r8, r5 @ sum += (Kxx * X15)
505 smlal r3, r4, r9, r11 @ sum += (Kxx * X17)
506
507 add r5, sp, r10, lsr #16 @ create index back into stack for required ctxx (control bits 31..16 = byte offset from sp)
508
509 ldmia r5, { r6, r7 } @ r6..r7 = ctxx
510
511 mov r8, r10, lsl #16 @ push ctxx index off the top end
512
513 adds r3, r3, r6 @ add low words
514 adc r4, r4, r7 @ add high words, with carry
515 movs r3, r3, lsr #28
516 adc r3, r3, r4, lsl #4 @ r3 = bits[59..28] of r3..r4
517
518 str r3, [r1, r8, lsr #24] @ push completion flag off the bottom end (leaves control bits 15..8 = output byte offset)
519
520 movs r8, r8, lsl #8 @ push result location index off the top end (Z set unless the flag byte is nonzero)
521 beq loop @ loop back if completion flag not set
522 b imdct_l_windowing @ branch to windowing stage if looping finished
523
524imdct36_long_karray:
525 @ Each row: 12 multipliers applied to X0, X2, X3, X5, X6, X8, X9, X11, X12, X14, X15, X17 in that order, then a control word. Control bits 31..16 = byte offset of the ctxx pair from sp, bits 15..8 = byte offset of the result in the output array, bits 7..0 = nonzero only on the last row.
526 .word K17, -K13, K10, -K06, -K05, K01, -K00, K04, -K07, K11, K12, -K16, 0x00000000
527 .word K13, K07, K16, K01, K10, -K05, K04, -K11, K00, -K17, K06, -K12, 0x00200800
528 .word K11, K17, K05, K12, -K01, K06, -K07, K00, -K13, K04, -K16, K10, 0x00200c00
529 .word K07, K00, -K12, K05, -K16, -K10, K11, -K17, K04, K13, K01, K06, 0x00001400
530 .word K05, K10, -K00, -K17, K07, -K13, K12, K06, -K16, K01, -K11, -K04, 0x00181800
531 .word K01, K05, -K07, -K11, K13, K17, -K16, -K12, K10, K06, -K04, -K00, 0x00102000
532 .word -K16, K12, -K11, K07, K04, -K00, -K01, K05, -K06, K10, K13, -K17, 0x00284800
533 .word -K12, K06, K17, -K00, -K11, K04, K05, -K10, K01, K16, -K07, -K13, 0x00085000
534 .word -K10, K16, K04, -K13, -K00, K07, K06, -K01, -K12, -K05, K17, K11, 0x00105400
535 .word -K06, -K01, K13, K04, K17, -K11, -K10, -K16, -K05, K12, K00, K07, 0x00185c00
536 .word -K04, -K11, -K01, K16, K06, K12, K13, -K07, -K17, -K00, -K10, -K05, 0x00006000
537 .word -K00, -K04, -K06, -K10, -K12, -K16, -K17, -K13, -K11, -K07, -K05, -K01, 0x00206801
538
539
540 @----
541 @-------------------------------------------------------------------------
542 @----
543
544imdct_l_windowing:
545 @ At this point x[0..8] and x[18..26] have been stored. The remaining outputs are built from them below as mirrors (x[9+i] = -x[8-i], x[27+i] = x[26-i]) while the window is applied.
546 ldr r11, [sp, #80] @ fetch function parameter 3 from out of the stack (the saved r2 sits above the 20 stacked ctxx words)
547 ldmia r1!, { r0, r2 - r9 } @ load 9 words from x0, update pointer
548
549 @ r0 = x0
550 @ r1 = &x[9]
551 @ r2 = x1
552 @ r3 = x2
553 @ r4 = x3
554 @ r5 = x4
555 @ r6 = x5
556 @ r7 = x6
557 @ r8 = x7
558 @ r9 = x8
559 @ r10 = .
560 @ r11 = window mode: (0 == normal), (1 == start block), (3 == stop block)
561 @ r12 = .
562 @ lr = .
563
564 cmp r11, #BLOCK_MODE_STOP @ setup flags
565 rsb r10, r0, #0 @ r10 = -x0 (DONT change flags !!)
566 beq stop_block_x0_to_x17
567
568
569 @ start and normal blocks are treated the same for x[0]..x[17]
570
571normal_block_x0_to_x17:
572 @ Window x[9..17] first (built as negated mirrors of x[8..0]), store them, then reload x[0..8] and window those in place.
573 ldr r12, =WL9 @ r12 = window_l[9]
574
575 rsb r0, r9, #0 @ r0 = -x8
576 rsb r9, r2, #0 @ r9 = -x1
577 rsb r2, r8, #0 @ r2 = -x7
578 rsb r8, r3, #0 @ r8 = -x2
579 rsb r3, r7, #0 @ r3 = -x6
580 rsb r7, r4, #0 @ r7 = -x3
581 rsb r4, r6, #0 @ r4 = -x5
582 rsb r6, r5, #0 @ r6 = -x4
583
584 @ r0 = -x8
585 @ r1 = &x[9]
586 @ r2 = -x7
587 @ r3 = -x6
588 @ r4 = -x5
589 @ r5 = .
590 @ r6 = -x4
591 @ r7 = -x3
592 @ r8 = -x2
593 @ r9 = -x1
594 @ r10 = -x0
595 @ r11 = window mode: (0 == normal), (1 == start block), (3 == stop block)
596 @ r12 = window_l[9]
597 @ lr = .
598
599 smull r5, lr, r12, r0 @ r5..lr = (window_l[9] * (x[9] == -x[8]))
600 ldr r12, =WL10 @ r12 = window_l[10]
601 movs r5, r5, lsr #28
602 adc r0, r5, lr, lsl #4 @ r0 = bits[59..28] of windowed x9
603
604 smull r5, lr, r12, r2 @ r5..lr = (window_l[10] * (x[10] == -x[7]))
605 ldr r12, =WL11 @ r12 = window_l[11]
606 movs r5, r5, lsr #28
607 adc r2, r5, lr, lsl #4 @ r2 = bits[59..28] of windowed x10
608
609 smull r5, lr, r12, r3 @ r5..lr = (window_l[11] * (x[11] == -x[6]))
610 ldr r12, =WL12 @ r12 = window_l[12]
611 movs r5, r5, lsr #28
612 adc r3, r5, lr, lsl #4 @ r3 = bits[59..28] of windowed x11
613
614 smull r5, lr, r12, r4 @ r5..lr = (window_l[12] * (x[12] == -x[5]))
615 ldr r12, =WL13 @ r12 = window_l[13]
616 movs r5, r5, lsr #28
617 adc r4, r5, lr, lsl #4 @ r4 = bits[59..28] of windowed x12
618
619 smull r5, lr, r12, r6 @ r5..lr = (window_l[13] * (x[13] == -x[4]))
620 ldr r12, =WL14 @ r12 = window_l[14]
621 movs r5, r5, lsr #28
622 adc r6, r5, lr, lsl #4 @ r6 = bits[59..28] of windowed x13
623
624 smull r5, lr, r12, r7 @ r5..lr = (window_l[14] * (x[14] == -x[3]))
625 ldr r12, =WL15 @ r12 = window_l[15]
626 movs r5, r5, lsr #28
627 adc r7, r5, lr, lsl #4 @ r7 = bits[59..28] of windowed x14
628
629 smull r5, lr, r12, r8 @ r5..lr = (window_l[15] * (x[15] == -x[2]))
630 ldr r12, =WL16 @ r12 = window_l[16]
631 movs r5, r5, lsr #28
632 adc r8, r5, lr, lsl #4 @ r8 = bits[59..28] of windowed x15
633
634 smull r5, lr, r12, r9 @ r5..lr = (window_l[16] * (x[16] == -x[1]))
635 ldr r12, =WL17 @ r12 = window_l[17]
636 movs r5, r5, lsr #28
637 adc r9, r5, lr, lsl #4 @ r9 = bits[59..28] of windowed x16
638
639 smull r5, lr, r12, r10 @ r5..lr = (window_l[17] * (x[17] == -x[0]))
640 ldr r12, =WL0 @ r12 = window_l[0]
641 movs r5, r5, lsr #28
642 adc r10, r5, lr, lsl #4 @ r10 = bits[59..28] of windowed x17
643
644
645 stmia r1, { r0, r2 - r4, r6 - r10 } @ store windowed x[9] .. x[17]
646 ldmdb r1!, { r0, r2 - r9 } @ load 9 words downto (and including) x0 (leaves r1 = &x[0])
647
648
649 smull r10, lr, r12, r0 @ r10..lr = (window_l[0] * x[0])
650 ldr r12, =WL1 @ r12 = window_l[1]
651 movs r10, r10, lsr #28
652 adc r0, r10, lr, lsl #4 @ r0 = bits[59..28] of windowed x0
653
654 smull r10, lr, r12, r2 @ r10..lr = (window_l[1] * x[1])
655 ldr r12, =WL2 @ r12 = window_l[2]
656 movs r10, r10, lsr #28
657 adc r2, r10, lr, lsl #4 @ r2 = bits[59..28] of windowed x1
658
659 smull r10, lr, r12, r3 @ r10..lr = (window_l[2] * x[2])
660 ldr r12, =WL3 @ r12 = window_l[3]
661 movs r10, r10, lsr #28
662 adc r3, r10, lr, lsl #4 @ r3 = bits[59..28] of windowed x2
663
664 smull r10, lr, r12, r4 @ r10..lr = (window_l[3] * x[3])
665 ldr r12, =WL4 @ r12 = window_l[4]
666 movs r10, r10, lsr #28
667 adc r4, r10, lr, lsl #4 @ r4 = bits[59..28] of windowed x3
668
669 smull r10, lr, r12, r5 @ r10..lr = (window_l[4] * x[4])
670 ldr r12, =WL5 @ r12 = window_l[5]
671 movs r10, r10, lsr #28
672 adc r5, r10, lr, lsl #4 @ r5 = bits[59..28] of windowed x4
673
674 smull r10, lr, r12, r6 @ r10..lr = (window_l[5] * x[5])
675 ldr r12, =WL6 @ r12 = window_l[6]
676 movs r10, r10, lsr #28
677 adc r6, r10, lr, lsl #4 @ r6 = bits[59..28] of windowed x5
678
679 smull r10, lr, r12, r7 @ r10..lr = (window_l[6] * x[6])
680 ldr r12, =WL7 @ r12 = window_l[7]
681 movs r10, r10, lsr #28
682 adc r7, r10, lr, lsl #4 @ r7 = bits[59..28] of windowed x6
683
684 smull r10, lr, r12, r8 @ r10..lr = (window_l[7] * x[7])
685 ldr r12, =WL8 @ r12 = window_l[8]
686 movs r10, r10, lsr #28
687 adc r8, r10, lr, lsl #4 @ r8 = bits[59..28] of windowed x7
688
689 smull r10, lr, r12, r9 @ r10..lr = (window_l[8] * x[8])
690 movs r10, r10, lsr #28
691 adc r9, r10, lr, lsl #4 @ r9 = bits[59..28] of windowed x8
692
693 stmia r1, { r0, r2 - r9 } @ store windowed x[0] .. x[8]
694
695 cmp r11, #BLOCK_MODE_START @ r11 still holds the block type fetched at imdct_l_windowing
696 beq start_block_x18_to_x35
697
698
699 @----
700
701
702normal_block_x18_to_x35:
703 @ Reached by fall through or from the stop block path, with r1 = &x[0] in both cases. Windows x[27..35] (mirrors of x[26..18]) first, then x[18..26] in place.
704 ldr r11, =WL3 @ r11 = window_l[3]
705 ldr r12, =WL4 @ r12 = window_l[4]
706
707 add r1, r1, #(18*4) @ r1 = &x[18]
708
709 ldmia r1!, { r0, r2 - r4, r6 - r10 } @ load 9 words from x18, update pointer
710
711 @ r0 = x18
712 @ r1 = &x[27]
713 @ r2 = x19
714 @ r3 = x20
715 @ r4 = x21
716 @ r5 = .
717 @ r6 = x22
718 @ r7 = x23
719 @ r8 = x24
720 @ r9 = x25
721 @ r10 = x26
722 @ r11 = window_l[3]
723 @ r12 = window_l[4]
724 @ lr = .
725
726 smull r5, lr, r12, r6 @ r5..lr = (window_l[4] * (x[22] == x[31]))
727 movs r5, r5, lsr #28
728 adc r5, r5, lr, lsl #4 @ r5 = bits[59..28] of windowed x31
729
730 smull r6, lr, r11, r4 @ r6..lr = (window_l[3] * (x[21] == x[32]))
731 ldr r12, =WL5 @ r12 = window_l[5]
732 movs r6, r6, lsr #28
733 adc r6, r6, lr, lsl #4 @ r6 = bits[59..28] of windowed x32
734
735 smull r4, lr, r12, r7 @ r4..lr = (window_l[5] * (x[23] == x[30]))
736 ldr r11, =WL1 @ r11 = window_l[1]
737 ldr r12, =WL2 @ r12 = window_l[2]
738 movs r4, r4, lsr #28
739 adc r4, r4, lr, lsl #4 @ r4 = bits[59..28] of windowed x30
740
741 smull r7, lr, r12, r3 @ r7..lr = (window_l[2] * (x[20] == x[33]))
742 ldr r12, =WL6 @ r12 = window_l[6]
743 movs r7, r7, lsr #28
744 adc r7, r7, lr, lsl #4 @ r7 = bits[59..28] of windowed x33
745
746 smull r3, lr, r12, r8 @ r3..lr = (window_l[6] * (x[24] == x[29]))
747 movs r3, r3, lsr #28
748 adc r3, r3, lr, lsl #4 @ r3 = bits[59..28] of windowed x29
749
750 smull r8, lr, r11, r2 @ r8..lr = (window_l[1] * (x[19] == x[34]))
751 ldr r12, =WL7 @ r12 = window_l[7]
752 ldr r11, =WL8 @ r11 = window_l[8]
753 movs r8, r8, lsr #28
754 adc r8, r8, lr, lsl #4 @ r8 = bits[59..28] of windowed x34
755
756 smull r2, lr, r12, r9 @ r2..lr = (window_l[7] * (x[25] == x[28]))
757 ldr r12, =WL0 @ r12 = window_l[0]
758 movs r2, r2, lsr #28
759 adc r2, r2, lr, lsl #4 @ r2 = bits[59..28] of windowed x28
760
761 smull r9, lr, r12, r0 @ r9..lr = (window_l[0] * (x[18] == x[35]))
762 movs r9, r9, lsr #28
763 adc r9, r9, lr, lsl #4 @ r9 = bits[59..28] of windowed x35
764
765 smull r0, lr, r11, r10 @ r0..lr = (window_l[8] * (x[26] == x[27]))
766 ldr r11, =WL16 @ r11 = window_l[16]
767 ldr r12, =WL17 @ r12 = window_l[17]
768 movs r0, r0, lsr #28
769 adc r0, r0, lr, lsl #4 @ r0 = bits[59..28] of windowed x27
770
771
772 stmia r1, { r0, r2 - r9 } @ store windowed x[27] .. x[35]
773 ldmdb r1!, { r0, r2 - r9 } @ load 9 words downto (and including) x18
774
775
776 smull r10, lr, r12, r0 @ r10..lr = (window_l[17] * x[18])
777 movs r10, r10, lsr #28
778 adc r0, r10, lr, lsl #4 @ r0 = bits[59..28] of windowed x18
779
780 smull r10, lr, r11, r2 @ r10..lr = (window_l[16] * x[19])
781 ldr r11, =WL14 @ r11 = window_l[14]
782 ldr r12, =WL15 @ r12 = window_l[15]
783 movs r10, r10, lsr #28
784 adc r2, r10, lr, lsl #4 @ r2 = bits[59..28] of windowed x19
785
786 smull r10, lr, r12, r3 @ r10..lr = (window_l[15] * x[20])
787 movs r10, r10, lsr #28
788 adc r3, r10, lr, lsl #4 @ r3 = bits[59..28] of windowed x20
789
790 smull r10, lr, r11, r4 @ r10..lr = (window_l[14] * x[21])
791 ldr r11, =WL12 @ r11 = window_l[12]
792 ldr r12, =WL13 @ r12 = window_l[13]
793 movs r10, r10, lsr #28
794 adc r4, r10, lr, lsl #4 @ r4 = bits[59..28] of windowed x21
795
796 smull r10, lr, r12, r5 @ r10..lr = (window_l[13] * x[22])
797 movs r10, r10, lsr #28
798 adc r5, r10, lr, lsl #4 @ r5 = bits[59..28] of windowed x22
799
800 smull r10, lr, r11, r6 @ r10..lr = (window_l[12] * x[23])
801 ldr r11, =WL10 @ r11 = window_l[10]
802 ldr r12, =WL11 @ r12 = window_l[11]
803 movs r10, r10, lsr #28
804 adc r6, r10, lr, lsl #4 @ r6 = bits[59..28] of windowed x23
805
806 smull r10, lr, r12, r7 @ r10..lr = (window_l[11] * x[24])
807 movs r10, r10, lsr #28
808 adc r7, r10, lr, lsl #4 @ r7 = bits[59..28] of windowed x24
809
810 smull r10, lr, r11, r8 @ r10..lr = (window_l[10] * x[25])
811 ldr r12, =WL9 @ r12 = window_l[9]
812 movs r10, r10, lsr #28
813 adc r8, r10, lr, lsl #4 @ r8 = bits[59..28] of windowed x25
814
815 smull r10, lr, r12, r9 @ r10..lr = (window_l[9] * x[26])
816
817 movs r10, r10, lsr #28
818 adc r9, r10, lr, lsl #4 @ r9 = bits[59..28] of windowed x26
819
820 stmia r1, { r0, r2 - r9 } @ store windowed x[18] .. x[26]
821
822 @----
823 @ NB there are 2 possible exits from this function - this is only one of them
824 @----
825
826 add sp, sp, #(21*4) @ return stack frame (20 stacked ctxx words plus the saved r2)
827 ldmpc regs=r4-r11 @ restore callee saved regs, and return (ldmpc is not defined in this file, presumably a macro from config.h - TODO confirm)
828
829 @----
830
831
832stop_block_x0_to_x17:
833 @ Stop block: x[12..17] are stored as plain negated mirrors, x[6..11] get the short window, x[0..5] are zeroed. Leaves r1 = &x[0] for the shared x18 to x35 code.
834 @ r0 = x0
835 @ r1 = &x[9]
836 @ r2 = x1
837 @ r3 = x2
838 @ r4 = x3
839 @ r5 = x4
840 @ r6 = x5
841 @ r7 = x6
842 @ r8 = x7
843 @ r9 = x8
844 @ r10 = -x0
845 @ r11 = window mode: (0 == normal), (1 == start block), (3 == stop block)
846 @ r12 = .
847 @ lr = .
848
849 rsb r0, r6, #0 @ r0 = -x5
850 rsb r6, r2, #0 @ r6 = -x1
851 rsb r2, r5, #0 @ r2 = -x4
852 rsb r5, r3, #0 @ r5 = -x2
853 rsb r3, r4, #0 @ r3 = -x3
854
855 add r1, r1, #(3*4) @ r1 = &x[12]
856 stmia r1, { r0, r2, r3, r5, r6, r10 } @ store unchanged x[12] .. x[17] (that is -x5 .. -x0, no window multiply)
857
858 ldr r0, =WL1 @ r0 = window_l[1] == window_s[0]
859
860 rsb r10, r9, #0 @ r10 = -x8
861 rsb r12, r8, #0 @ r12 = -x7
862 rsb lr, r7, #0 @ lr = -x6
863
864 @ r0 = WL1
865 @ r1 = &x[12]
866 @ r2 = .
867 @ r3 = .
868 @ r4 = .
869 @ r5 = .
870 @ r6 = .
871 @ r7 = x6
872 @ r8 = x7
873 @ r9 = x8
874 @ r10 = -x8
875 @ r11 = window mode: (0 == normal), (1 == start block), (3 == stop block)
876 @ r12 = -x7
877 @ lr = -x6
878
879 smull r5, r6, r0, r7 @ r5..r6 = (window_l[1] * x[6])
880 ldr r2, =WL4 @ r2 = window_l[4] == window_s[1]
881 movs r5, r5, lsr #28
882 adc r7, r5, r6, lsl #4 @ r7 = bits[59..28] of windowed x6
883
884 smull r5, r6, r2, r8 @ r5..r6 = (window_l[4] * x[7])
885 ldr r3, =WL7 @ r3 = window_l[7] == window_s[2]
886 movs r5, r5, lsr #28
887 adc r8, r5, r6, lsl #4 @ r8 = bits[59..28] of windowed x7
888
889 smull r5, r6, r3, r9 @ r5..r6 = (window_l[7] * x[8])
890 ldr r4, =WL10 @ r4 = window_l[10] == window_s[3]
891 movs r5, r5, lsr #28
892 adc r9, r5, r6, lsl #4 @ r9 = bits[59..28] of windowed x8
893
894 smull r5, r6, r4, r10 @ r5..r6 = (window_l[10] * (x[9] == -x[8]))
895 ldr r0, =WL13 @ r0 = window_l[13] == window_s[4]
896 movs r5, r5, lsr #28
897 adc r10, r5, r6, lsl #4 @ r10 = bits[59..28] of windowed x9
898
899 smull r5, r6, r0, r12 @ r5..r6 = (window_l[13] * (x[10] == -x[7]))
900 ldr r2, =WL16 @ r2 = window_l[16] == window_s[5]
901 movs r5, r5, lsr #28
902 adc r12, r5, r6, lsl #4 @ r12 = bits[59..28] of windowed x10
903
904 smull r5, r6, r2, lr @ r5..r6 = (window_l[16] * (x[11] == -x[6]))
905
906 ldr r0, =0x00 @ r0 = 0, first of the six zero words stored to x[0] .. x[5] below
907
908 movs r5, r5, lsr #28
909 adc lr, r5, r6, lsl #4 @ lr = bits[59..28] of windowed x11
910
911 stmdb r1!, { r7 - r10, r12, lr } @ store windowed x[6] .. x[11]
912
913 ldr r5, =0x00
914 ldr r6, =0x00
915 ldr r2, =0x00
916 ldr r3, =0x00
917 ldr r4, =0x00
918
919 stmdb r1!, { r0, r2 - r6 } @ store zeros to x[0] .. x[5], leaving r1 = &x[0]
920
921 b normal_block_x18_to_x35
922
923
924 @----
925
926
927start_block_x18_to_x35:
928 @ Start block: x[18..23] are left as computed, x[24..29] get the short window (x[27..29] mirror x[26..24]), x[30..35] are zeroed.
929 ldr r4, =WL1 @ r4 = window_l[1] == window_s[0]
930
931 add r1, r1, #(24*4) @ r1 = &x[24]
932
933 ldmia r1, { r0, r2, r3 } @ load 3 words from x24, dont update pointer
934
935 @ r0 = x24
936 @ r1 = &x[24]
937 @ r2 = x25
938 @ r3 = x26
939 @ r4 = WL1
940 @ r5 = WL4 (loaded below)
941 @ r6 = WL7 (loaded below)
942 @ r7 = WL10 (loaded below)
943 @ r8 = WL13 (loaded below)
944 @ r9 = WL16 (loaded below)
945 @ r10 = .
946 @ r11 = .
947 @ r12 = .
948 @ lr = .
949
950 ldr r5, =WL4 @ r5 = window_l[4] == window_s[1]
951
952 smull r10, r11, r4, r0 @ r10..r11 = (window_l[1] * (x[24] == x[29]))
953 ldr r6, =WL7 @ r6 = window_l[7] == window_s[2]
954 movs r10, r10, lsr #28
955 adc lr, r10, r11, lsl #4 @ lr = bits[59..28] of windowed x29
956
957 smull r10, r11, r5, r2 @ r10..r11 = (window_l[4] * (x[25] == x[28]))
958 ldr r7, =WL10 @ r7 = window_l[10] == window_s[3]
959 movs r10, r10, lsr #28
960 adc r12, r10, r11, lsl #4 @ r12 = bits[59..28] of windowed x28
961
962 smull r10, r11, r6, r3 @ r10..r11 = (window_l[7] * (x[26] == x[27]))
963 ldr r8, =WL13 @ r8 = window_l[13] == window_s[4]
964 movs r10, r10, lsr #28
965 adc r4, r10, r11, lsl #4 @ r4 = bits[59..28] of windowed x27
966
967 smull r10, r11, r7, r3 @ r10..r11 = (window_l[10] * x[26])
968 ldr r9, =WL16 @ r9 = window_l[16] == window_s[5]
969 movs r10, r10, lsr #28
970 adc r3, r10, r11, lsl #4 @ r3 = bits[59..28] of windowed x26
971
972 smull r10, r11, r8, r2 @ r10..r11 = (window_l[13] * x[25])
973 ldr r5, =0x00 @ WL4 no longer needed, r5 becomes the first zero word for x[30] .. x[35]
974 movs r10, r10, lsr #28
975 adc r2, r10, r11, lsl #4 @ r2 = bits[59..28] of windowed x25
976
977 smull r10, r11, r9, r0 @ r10..r11 = (window_l[16] * x[24])
978 ldr r6, =0x00
979 movs r10, r10, lsr #28
980 adc r0, r10, r11, lsl #4 @ r0 = bits[59..28] of windowed x24
981
982 stmia r1!, { r0, r2, r3, r4, r12, lr } @ store windowed x[24] .. x[29]
983
984 ldr r7, =0x00
985 ldr r8, =0x00
986 ldr r9, =0x00
987 ldr r10, =0x00
988
989 stmia r1!, { r5 - r10 } @ store zeros to x[30] .. x[35]
990
991 @----
992 @ NB there are 2 possible exits from this function - this is only one of them
993 @----
994
995 add sp, sp, #(21*4) @ return stack frame (20 stacked ctxx words plus the saved r2)
996 ldmpc regs=r4-r11 @ restore callee saved regs, and return (ldmpc is not defined in this file, presumably a macro from config.h - TODO confirm)
997
998 @----
999 @END
1000 @----
1001