summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDave Chapman <dave@dchapman.com>2007-07-28 15:21:25 +0000
committerDave Chapman <dave@dchapman.com>2007-07-28 15:21:25 +0000
commit66b51909c09bda9910b5c891b241cb9cf8556970 (patch)
treecffa940b47e71edd3370159eba80a551d4cfe36b
parent488b3db547a09b25eac212e77ccb64ef81f8ce3f (diff)
downloadrockbox-66b51909c09bda9910b5c891b241cb9cf8556970.tar.gz
rockbox-66b51909c09bda9910b5c891b241cb9cf8556970.zip
FS #6705 - ARM optimisations for libmad by Tomasz Malesinski. Modified slightly by me to not put code in IRAM for PP502x (it's slower), and for the mpegplayer version of libmad for PP5002 (there isn't enough room). On my iPod Color, it increases the decoding speed of a 320kbps MP3 test file from 169% realtime to 188% realtime. The reported speedup on the iPod 3G was from 118% to 155% realtime for a 192kbps MP3.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@14041 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--apps/codecs/libmad/SOURCES2
-rw-r--r--apps/codecs/libmad/bit.c28
-rw-r--r--apps/codecs/libmad/dct32_arm.S332
-rw-r--r--apps/codecs/libmad/layer3.c32
-rw-r--r--apps/codecs/libmad/synth.c375
-rw-r--r--apps/codecs/libmad/synth_full_arm.S343
6 files changed, 1091 insertions, 21 deletions
diff --git a/apps/codecs/libmad/SOURCES b/apps/codecs/libmad/SOURCES
index 74b8f2889d..a4b6e8ca9a 100644
--- a/apps/codecs/libmad/SOURCES
+++ b/apps/codecs/libmad/SOURCES
@@ -14,4 +14,6 @@ imdct_mcf5249.S
14#endif 14#endif
15#if defined(CPU_ARM) && !defined(SIMULATOR) 15#if defined(CPU_ARM) && !defined(SIMULATOR)
16imdct_l_arm.S 16imdct_l_arm.S
17dct32_arm.S
18synth_full_arm.S
17#endif 19#endif
diff --git a/apps/codecs/libmad/bit.c b/apps/codecs/libmad/bit.c
index 85c5baadd7..6c984ef078 100644
--- a/apps/codecs/libmad/bit.c
+++ b/apps/codecs/libmad/bit.c
@@ -128,13 +128,7 @@ void mad_bit_skip(struct mad_bitptr *bitptr, unsigned int len)
128 * NAME: bit->read() 128 * NAME: bit->read()
129 * DESCRIPTION: read an arbitrary number of bits and return their UIMSBF value 129 * DESCRIPTION: read an arbitrary number of bits and return their UIMSBF value
130 */ 130 */
131unsigned long bmask[] ICONST_ATTR = 131
132{ 0x00000000, 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f,
133 0x0000003f, 0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff,
134 0x00000fff, 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff,
135 0x0003ffff, 0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff,
136 0x00ffffff, 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff,
137 0x3fffffff, 0x7fffffff, 0xffffffff };
138unsigned long mad_bit_read(struct mad_bitptr *bitptr, unsigned int len) ICODE_ATTR; 132unsigned long mad_bit_read(struct mad_bitptr *bitptr, unsigned int len) ICODE_ATTR;
139unsigned long mad_bit_read(struct mad_bitptr *bitptr, unsigned int len) 133unsigned long mad_bit_read(struct mad_bitptr *bitptr, unsigned int len)
140{ 134{
@@ -142,19 +136,13 @@ unsigned long mad_bit_read(struct mad_bitptr *bitptr, unsigned int len)
142 136
143 if(len) 137 if(len)
144 { 138 {
145 if((bitptr->readbit ^ (bitptr->readbit + len - 1)) < 32) 139 unsigned long r = betoh32(curr[0]) << (bitptr->readbit & 31);
146 { 140
147 bitptr->readbit += len; 141 if((bitptr->readbit & 31) + len > 32)
148 142 r += betoh32(curr[1]) >> (-bitptr->readbit & 31);
149 return (betoh32(curr[0]) >> (-bitptr->readbit & 31)) & bmask[len]; 143
150 } 144 bitptr->readbit += len;
151 else 145 return r >> (32 - len);
152 {
153 bitptr->readbit += len;
154
155 return ((betoh32(curr[0]) << ( bitptr->readbit & 31))
156 + (betoh32(curr[1]) >> (-bitptr->readbit & 31))) & bmask[len];
157 }
158 } 146 }
159 147
160 return 0; 148 return 0;
diff --git a/apps/codecs/libmad/dct32_arm.S b/apps/codecs/libmad/dct32_arm.S
new file mode 100644
index 0000000000..4d94896b0b
--- /dev/null
+++ b/apps/codecs/libmad/dct32_arm.S
@@ -0,0 +1,332 @@
1/***************************************************************************
2 * __________ __ ___.
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 * \/ \/ \/ \/ \/
8 * $Id$
9 *
10 * Copyright (C) 2007 by Tomasz Malesinski
11 *
12 * All files in this archive are subject to the GNU General Public License.
13 * See the file COPYING in the source tree root for full license agreement.
14 *
15 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
16 * KIND, either express or implied.
17 *
18 ****************************************************************************/
19
20#include "config.h"
21
22 .global dct32
23
24/* This performs slower in IRAM on PP502x and there is no space in
25 mpegplayer on the PP5002 */
26#if defined(CPU_PP502x) || (CONFIG_CPU == PP5002 && defined(MPEGPLAYER))
27 .section .text,"ax",%progbits
28#else
29 .section .icode,"ax",%progbits
30#endif
31
/*
 * void dct32(mad_fixed_t const in[32], unsigned int slot,
 *            mad_fixed_t lo[16][8], mad_fixed_t hi[16][8])
 *
 * Arguments arrive in r0 (in), r1 (slot), r2 (lo) and r3 (hi).
 *
 * Stack frame after the prologue (r13 = sp):
 *   [sp, #0]   hi
 *   [sp, #4]   lo
 *   [sp, #8]   slot
 *   [sp, #12]  in
 *   [sp, #16 .. #143]  32-word scratch buffer that all stages work in
 *
 * Recurring idiom:
 *     smull  l, h, a, b
 *     movs   l, l, lsr #28
 *     adc    res, l, h, lsl #4
 * yields bits 28..59 of the signed 64-bit product, rounded by adding the
 * last bit shifted out (left in the carry flag by movs).
 *
 * "add x, x, y ; sub y, x, y, lsl #1" forms the pair (x + y, x - y).
 */
32dct32:
33 stmdb r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
34 sub r13, r13, #144
35 str r0, [r13, #12]
36 str r1, [r13, #8]
37 str r2, [r13, #4]
38 str r3, [r13]
39 add r0, r13, #16
40 add r1, r0, #128
41 ldr r2, =bitrev
/* Stage 1: four passes (32 bytes of scratch written per pass, 128 in
   total).  Each pass takes the next bitrev word k, reads eight inputs at
   fixed offsets from in + 16*k and in - 16*k, and stores eight
   sums/differences to the scratch buffer. */
42.shuffle:
43 ldr r5, [r13, #12]
44 ldr r3, [r2], #4
45 sub r4, r5, r3, lsl #4
46 add r3, r5, r3, lsl #4
47 ldr r6, [r3]
48 ldr r8, [r4, #124]
49 add r6, r6, r8
50 sub r8, r6, r8, lsl #1
51 ldr r7, [r3, #8]
52 ldr lr, [r4, #116]
53 add r7, r7, lr
54 sub lr, r7, lr, lsl #1
55 ldr r10, [r3, #64]
56 ldr r9, [r4, #60]
57 add r10, r10, r9
58 sub r9, r10, r9, lsl #1
59 ldr r11, [r3, #72]
60 ldr r12, [r4, #52]
61 add r11, r11, r12
62 sub r12, r11, r12, lsl #1
63 add r6, r6, r10
64 sub r10, r6, r10, lsl #1
65 add r7, r7, r11
66 sub r11, r7, r11, lsl #1
67 add r8, r8, r12
68 sub r12, r8, r12, lsl #1
69 add lr, lr, r9
70 sub r9, lr, r9, lsl #1
71 stmia r0!, {r6, r7, r8, r9, r10, r11, r12, lr}
72 cmp r0, r1
73 bne .shuffle
/* Stage 2: two passes over the scratch buffer (r1 advances 64 bytes per
   pass).  r0 holds the only multiplier used here; 189812531 is
   0x0b504f33, the same magnitude that appears in the sincos table --
   presumably cos(pi/4) with 28 fractional bits (TODO confirm). */
74 ldr r0, =189812531
75 add r1, r13, #16
76 add r3, r1, #128
77.l2:
78 add r2, r1, #32
79 ldmia r2, {r4, r5, r8, r9}
80 ldmia r1, {r6, r7, r10, r11}
81 add r6, r6, r4
82 sub r4, r6, r4, lsl #1
83 add r7, r7, r5
84 sub r5, r7, r5, lsl #1
85 stmia r2!, {r4, r5}
86 stmia r1!, {r6, r7}
87 add r9, r9, r8
88 sub r8, r9, r8, lsl #1
89 smull r4, r6, r9, r0
90 movs r4, r4, lsr #28
91 adc r4, r4, r6, lsl #4
92 smull r5, r6, r8, r0
93 movs r5, r5, lsr #28
94 adc r5, r5, r6, lsl #4
95 add r10, r10, r4
96 sub r4, r10, r4, lsl #1
97 add r11, r11, r5
98 sub r5, r11, r5, lsl #1
99 stmia r2!, {r4, r5}
100 stmia r1!, {r10, r11}
101 ldmia r2, {r5, r6, r8, r11}
102 ldmia r1, {r4, r7, r9, r10}
103 add r4, r4, r6
104 sub r6, r4, r6, lsl #1
105 add r7, r7, r5
106 sub r5, r7, r5, lsl #1
107 stmia r2!, {r6, r7}
108 stmia r1!, {r4, r5}
109 add r11, r11, r8
110 sub r8, r11, r8, lsl #1
111 smull r5, r4, r8, r0
112 movs r5, r5, lsr #28
113 adc r5, r5, r4, lsl #4
114 smull r6, r4, r11, r0
115 movs r6, r6, lsr #28
116 adc r6, r6, r4, lsl #4
117 add r9, r9, r5
118 sub r5, r9, r5, lsl #1
119 sub r10, r10, r6
120 add r6, r10, r6, lsl #1
121 stmia r2!, {r5, r6}
122 stmia r1!, {r9, r10}
123 add r1, r1, #32
124 cmp r1, r3
125 bne .l2
/* Stage 3: eight passes (r0 walks sincos in 16-byte steps up to
   sincos + 128).  Each pass multiplies the word pair at r3 (upper half of
   the scratch buffer) by a coefficient pair, then combines it with the
   pair at r2 (lower half) and writes both pairs back. */
126 add r2, r13, #16
127 add r3, r2, #64
128 ldr r0, =sincos
129 add r1, r0, #128
130.lbut8:
131 ldmia r3, {r7, r8}
132 ldmia r0, {r9, r10}
133 add r0, r0, #16
134 smull r6, r5, r7, r9
135 smlal r6, r5, r10, r8
136 movs r6, r6, lsr #28
137 adc r6, r6, r5, lsl #4
138 smull r10, r5, r7, r10
139 rsb r9, r9, #0
140 smlal r10, r5, r8, r9
141 movs r10, r10, lsr #28
142 adc r5, r10, r5, lsl #4
143 ldmia r2, {r7, r8}
144 add r7, r7, r5
145 sub r5, r7, r5, lsl #1
146 add r8, r8, r6
147 sub r6, r8, r6, lsl #1
148 stmia r3!, {r5, r6}
149 stmia r2!, {r7, r8}
150 cmp r0, r1
151 bne .lbut8
/* Stage 4 (output): r4 = lo + 4*slot, i.e. &lo[0][slot];
   r5 = hi + 480 + 4*slot, i.e. &hi[15][slot].  r0 counts 0..8. */
152 add r1, r13, #16
153 ldr r2, =sincos
154 ldr r3, =sincos2
155 ldr r0, [r13, #8]
156 mov r0, r0, lsl #2
157 ldr r4, [r13, #4]
158 add r4, r4, r0
159 ldr r5, [r13]
160 add r5, r5, #480
161 add r5, r5, r0
162 mov r0, #0
163.l4:
/* r12 = scratch + 8 * ((16 - r0) & 15): the pair combined with the one
   read sequentially through r1. */
164 rsb r12, r0, #16
165 and r12, r12, #15
166 add lr, r13, #16
167 add r12, lr, r12, lsl #3
168 ldmia r1!, {r10, r11}
169 ldmia r12, {r6, r7}
170 add r6, r6, r10
171 sub r10, r6, r10, lsl #1
172 add r11, r11, r7
173 sub r7, r11, r7, lsl #1
174 ldmia r2!, {r12, lr}
175 smull r9, r8, r11, r12
176 smlal r9, r8, lr, r10
177 movs r9, r9, lsr #28
178 adc r9, r9, r8, lsl #4
179 smull lr, r8, r11, lr
180 rsb r12, r12, #0
181 smlal lr, r8, r10, r12
182 movs lr, lr, lsr #28
183 adc r8, lr, r8, lsl #4
184 add r6, r6, r8
185 sub r8, r6, r8, lsl #1
186 add r7, r7, r9
187 sub r9, r7, r9, lsl #1
188 add lr, r3, #128
189 ldmia lr, {r10, r11}
190 smull lr, r12, r8, r11
191 smlal lr, r12, r9, r10
192 movs lr, lr, lsr #28
193 adc r12, lr, r12, lsl #4
/* Store through r4, then step r4 forward by one 8-word row. */
194 str r12, [r4], #32
/* The additional store below is skipped when r0 is 0 or 8. */
195 cmp r0, #0
196 cmpne r0, #8
197 beq .skip1
198 smull lr, r12, r8, r10
199 rsb r9, r9, #0
200 smlal lr, r12, r9, r11
201 movs lr, lr, lsr #28
202 adc r12, lr, r12, lsl #4
203 add lr, r5, r0, lsl #6
204 str r12, [lr, #-512]
205.skip1:
206 ldmia r3!, {r10, r11}
207 smull lr, r12, r7, r10
208 smlal lr, r12, r6, r11
209 movs lr, lr, lsr #28
210 adc r12, lr, r12, lsl #4
/* Store through r5, then step r5 back by one 8-word row. */
211 str r12, [r5], #-32
/* As above: the additional store is skipped when r0 is 0 or 8. */
212 cmp r0, #0
213 cmpne r0, #8
214 beq .skip2
215 smull lr, r12, r6, r10
216 rsb r7, r7, #0
217 smlal lr, r12, r7, r11
218 movs lr, lr, lsr #28
219 adc r12, lr, r12, lsl #4
220 sub lr, r4, r0, lsl #6
221 str r12, [lr, #480]
222.skip2:
223 add r0, r0, #1
224 cmp r0, #9
225 bne .l4
/* Release the 144-byte frame and return by popping the saved lr into pc. */
226 add r13, r13, #144
227 ldmia r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
/* Four words read in order by the .shuffle loop of dct32; each value is
   scaled by 16 to form a byte offset added to and subtracted from the
   input pointer. */
228bitrev:
229 .word 0x0
230 .word 0x2
231 .word 0x1
232 .word 0x3
233
/* 32 words, used as 16 coefficient pairs.  dct32's .lbut8 loop reads every
   second pair (16-byte stride, eight pairs); its .l4 loop reads the first
   nine pairs consecutively.  The products are shifted right by 28, so the
   values carry 28 fractional bits.  NOTE(review): the pairs look like
   (-sin, cos) of k*pi/16 for k = 0..15 -- confirm before relying on it. */
234sincos:
235 .word 0x0
236 .word 0x10000000
237 .word -0x31f1708
238 .word 0xfb14be8
239 .word -0x61f78aa
240 .word 0xec835e8
241 .word -0x8e39d9d
242 .word 0xd4db315
243 .word -0xb504f33
244 .word 0xb504f33
245 .word -0xd4db315
246 .word 0x8e39d9d
247 .word -0xec835e8
248 .word 0x61f78aa
249 .word -0xfb14be8
250 .word 0x31f1708
251 .word -0x10000000
252 .word 0x0
253 .word -0xfb14be8
254 .word -0x31f1708
255 .word -0xec835e8
256 .word -0x61f78aa
257 .word -0xd4db315
258 .word -0x8e39d9d
259 .word -0xb504f33
260 .word -0xb504f33
261 .word -0x8e39d9d
262 .word -0xd4db315
263 .word -0x61f78aa
264 .word -0xec835e8
265 .word -0x31f1708
266 .word -0xfb14be8
267
/* 64 words, used as 32 coefficient pairs by the .l4 loop of dct32, which
   reads one pair at the running pointer and one pair 128 bytes (16 pairs)
   further on in each iteration.  NOTE(review): the pairs look like
   (sin, cos) of k*pi/64 scaled by 0.5, with 28 fractional bits -- confirm. */
268sincos2:
269 .word 0x0
270 .word 0x8000000
271 .word 0x647d98
272 .word 0x7fd8879
273 .word 0xc8bd36
274 .word 0x7f62369
275 .word 0x12c8107
276 .word 0x7e9d560
277 .word 0x18f8b84
278 .word 0x7d8a5f4
279 .word 0x1f19f98
280 .word 0x7c29fbf
281 .word 0x25280c6
282 .word 0x7a7d056
283 .word 0x2b1f34f
284 .word 0x7884841
285 .word 0x30fbc55
286 .word 0x7641af4
287 .word 0x36ba201
288 .word 0x73b5ebd
289 .word 0x3c56ba7
290 .word 0x70e2cbc
291 .word 0x41ce1e6
292 .word 0x6dca0d1
293 .word 0x471cece
294 .word 0x6a6d98a
295 .word 0x4c3fdff
296 .word 0x66cf812
297 .word 0x5133cc9
298 .word 0x62f201b
299 .word 0x55f5a4d
300 .word 0x5ed77c9
301 .word 0x5a8279a
302 .word 0x5a8279a
303 .word 0x5ed77c9
304 .word 0x55f5a4d
305 .word 0x62f201b
306 .word 0x5133cc9
307 .word 0x66cf812
308 .word 0x4c3fdff
309 .word 0x6a6d98a
310 .word 0x471cece
311 .word 0x6dca0d1
312 .word 0x41ce1e6
313 .word 0x70e2cbc
314 .word 0x3c56ba7
315 .word 0x73b5ebd
316 .word 0x36ba201
317 .word 0x7641af4
318 .word 0x30fbc55
319 .word 0x7884841
320 .word 0x2b1f34f
321 .word 0x7a7d056
322 .word 0x25280c6
323 .word 0x7c29fbf
324 .word 0x1f19f98
325 .word 0x7d8a5f4
326 .word 0x18f8b84
327 .word 0x7e9d560
328 .word 0x12c8107
329 .word 0x7f62369
330 .word 0xc8bd36
331 .word 0x7fd8879
332 .word 0x647d98
diff --git a/apps/codecs/libmad/layer3.c b/apps/codecs/libmad/layer3.c
index 38e488ddbb..a95927e10f 100644
--- a/apps/codecs/libmad/layer3.c
+++ b/apps/codecs/libmad/layer3.c
@@ -922,8 +922,19 @@ mad_fixed_t III_requantize(unsigned int value, signed int exp)
922} 922}
923 923
924/* we must take care that sz >= bits and sz < sizeof(long) lest bits == 0 */ 924/* we must take care that sz >= bits and sz < sizeof(long) lest bits == 0 */
/* ARM variant of MASK: the three instructions compute
 *   res = 1;
 *   res = (res << bits) - res;        -- i.e. (1 << bits) - 1
 *   res &= cache >> (sz - bits);
 * which is the same value as the generic expression in the #else branch.
 * Operands: %0 = res, %1 = cache, %2 = sz - bits, %3 = bits.  res is an
 * early-clobber output because it is written before the inputs are read. */
925# if defined(CPU_ARM)
926# define MASK(cache, sz, bits) \
927 ({ unsigned long res; \
928 asm ("mov %0, #1\n\t" \
929 "rsb %0, %0, %0, lsl %3\n\t" \
930 "and %0, %0, %1, lsr %2" \
931 : "=&r" (res) : "r" (cache), "r" ((sz) - (bits)), "r" (bits)); \
932 res; \
933 })
934#else
925# define MASK(cache, sz, bits) \ 935# define MASK(cache, sz, bits) \
926 (((cache) >> ((sz) - (bits))) & ((1 << (bits)) - 1)) 936 (((cache) >> ((sz) - (bits))) & ((1 << (bits)) - 1))
937#endif
927# define MASK1BIT(cache, sz) \ 938# define MASK1BIT(cache, sz) \
928 ((cache) & (1 << ((sz) - 1))) 939 ((cache) & (1 << ((sz) - 1)))
929 940
@@ -1546,6 +1557,9 @@ enum mad_error III_stereo(mad_fixed_t xr[2][576],
1546 return MAD_ERROR_NONE; 1557 return MAD_ERROR_NONE;
1547} 1558}
1548 1559
1560#if defined(CPU_ARM)
1561void III_aliasreduce(mad_fixed_t xr[576], int lines);
1562#else
1549/* 1563/*
1550 * NAME: III_aliasreduce() 1564 * NAME: III_aliasreduce()
1551 * DESCRIPTION: perform frequency line alias reduction 1565 * DESCRIPTION: perform frequency line alias reduction
@@ -1600,6 +1614,7 @@ void III_aliasreduce(mad_fixed_t xr[576], int lines)
1600 } 1614 }
1601 } 1615 }
1602} 1616}
1617#endif
1603 1618
1604# if defined(ASO_IMDCT) 1619# if defined(ASO_IMDCT)
1605void III_imdct_l(mad_fixed_t const [18], mad_fixed_t [36], unsigned int); 1620void III_imdct_l(mad_fixed_t const [18], mad_fixed_t [36], unsigned int);
@@ -2894,6 +2909,11 @@ void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36])
2894 2909
2895#endif 2910#endif
2896 2911
2912#ifdef CPU_ARM
2913void III_overlap(mad_fixed_t const output[36], mad_fixed_t overlap[18],
2914 mad_fixed_t sample[18][32], unsigned int sb);
2915#else
2916
2897/* 2917/*
2898 * NAME: III_overlap() 2918 * NAME: III_overlap()
2899 * DESCRIPTION: perform overlap-add of windowed IMDCT outputs 2919 * DESCRIPTION: perform overlap-add of windowed IMDCT outputs
@@ -2941,6 +2961,7 @@ void III_overlap(mad_fixed_t const output[36], mad_fixed_t overlap[18],
2941 } 2961 }
2942# endif 2962# endif
2943} 2963}
2964#endif
2944 2965
2945/* 2966/*
2946 * NAME: III_overlap_z() 2967 * NAME: III_overlap_z()
@@ -3142,10 +3163,21 @@ enum mad_error III_decode(struct mad_bitptr *ptr, struct mad_frame *frame,
3142 3163
3143 /* (nonzero) subbands 2-31 */ 3164 /* (nonzero) subbands 2-31 */
3144 3165
3166/*
3145 i = 576; 3167 i = 576;
3146 while (i > 36 && xr[ch][i - 1] == 0) 3168 while (i > 36 && xr[ch][i - 1] == 0)
3147 --i; 3169 --i;
3170*/
3148 3171
3172 {
3173 /* saves ~600k cycles */
3174 mad_fixed_t *p = &xr[ch][576];
3175 mad_fixed_t tmp = xr[ch][35];
3176 xr[ch][35] = 1;
3177 while (!*--p);
3178 xr[ch][35] = tmp;
3179 i = p - &xr[ch][0] + 1;
3180 }
3149 sblimit = 32 - (576 - i) / 18; 3181 sblimit = 32 - (576 - i) / 18;
3150 3182
3151 if (channel->block_type != 2) { 3183 if (channel->block_type != 2) {
diff --git a/apps/codecs/libmad/synth.c b/apps/codecs/libmad/synth.c
index 8613f77f79..c3a868a0dc 100644
--- a/apps/codecs/libmad/synth.c
+++ b/apps/codecs/libmad/synth.c
@@ -67,6 +67,13 @@ void mad_synth_mute(struct mad_synth *synth)
67 } 67 }
68} 68}
69 69
70#ifdef FPM_ARM
71
72void dct32(mad_fixed_t const in[32], unsigned int slot,
73 mad_fixed_t lo[16][8], mad_fixed_t hi[16][8]);
74
75#else
76
70/* 77/*
71 * An optional optimization called here the Subband Synthesis Optimization 78 * An optional optimization called here the Subband Synthesis Optimization
72 * (SSO) improves the performance of subband synthesis at the expense of 79 * (SSO) improves the performance of subband synthesis at the expense of
@@ -533,6 +540,8 @@ void dct32(mad_fixed_t const in[32], unsigned int slot,
533# undef MUL 540# undef MUL
534# undef SHIFT 541# undef SHIFT
535 542
543#endif
544
536/* third SSO shift and/or D[] optimization preshift */ 545/* third SSO shift and/or D[] optimization preshift */
537 546
538# if defined(OPT_SSO) 547# if defined(OPT_SSO)
@@ -816,7 +825,370 @@ void synth_full(struct mad_synth *synth, struct mad_frame const *frame,
816 } 825 }
817} 826}
818 827
819#else 828#elif defined(FPM_ARM)
829
/*
 * Eight-tap multiply-accumulate kernels for the ARM synthesis filter.
 *
 * Every PROD_* macro loads the eight consecutive words starting at f,
 * multiplies each of them with one word read from ptr at a fixed byte
 * offset, and accumulates the signed 64-bit products in hi:lo.
 *
 *   PROD_*_0  start a new sum: the first tap uses smull, and hi/lo are
 *             pure (early-clobber) outputs.
 *   PROD_*_A  add to the value already held in hi:lo: the first tap uses
 *             smlal, and hi/lo are read-write operands.
 *
 * Word indices into ptr, in tap order:
 *   ODD:        1, 15, 13, 11,  9,  7,  5,  3
 *   EVEN:       0, 14, 12, 10,  8,  6,  4,  2
 *   EVENBACK:  15, 17, 19, 21, 23, 25, 27, 29
 *   ODDBACK:   30, 16, 18, 20, 22, 24, 26, 28
 *
 * f and ptr are each evaluated exactly once.  r0-r4 are used as scratch.
 * The asm reads memory through both pointers without naming it as an
 * operand, so it carries a "memory" clobber; without it the compiler is
 * free to assume the asm does not depend on earlier stores to those arrays.
 *
 * PROD_TAPS8 is the shared implementation: first_op is the mnemonic for
 * the first tap ("smull" or "smlal"), acc is the constraint string used
 * for lo and hi, and o0..o7 are the byte offsets into ptr for the taps.
 */
#define PROD_TAPS8(first_op, acc, hi, lo, f, ptr,                        \
                   o0, o1, o2, o3, o4, o5, o6, o7)                       \
  do {                                                                   \
    mad_fixed_t *prod_src_ = (f);                                        \
    asm ("ldmia %2!, {r0, r1, r2, r3}\n\t"                               \
         "ldr r4, [%3, #" #o0 "]\n\t"                                    \
         first_op " %0, %1, r0, r4\n\t"                                  \
         "ldr r4, [%3, #" #o1 "]\n\t"                                    \
         "smlal %0, %1, r1, r4\n\t"                                      \
         "ldr r4, [%3, #" #o2 "]\n\t"                                    \
         "smlal %0, %1, r2, r4\n\t"                                      \
         "ldr r4, [%3, #" #o3 "]\n\t"                                    \
         "smlal %0, %1, r3, r4\n\t"                                      \
         "ldmia %2, {r0, r1, r2, r3}\n\t"                                \
         "ldr r4, [%3, #" #o4 "]\n\t"                                    \
         "smlal %0, %1, r0, r4\n\t"                                      \
         "ldr r4, [%3, #" #o5 "]\n\t"                                    \
         "smlal %0, %1, r1, r4\n\t"                                      \
         "ldr r4, [%3, #" #o6 "]\n\t"                                    \
         "smlal %0, %1, r2, r4\n\t"                                      \
         "ldr r4, [%3, #" #o7 "]\n\t"                                    \
         "smlal %0, %1, r3, r4\n\t"                                      \
         : acc (lo), acc (hi), "+r" (prod_src_)                          \
         : "r" (ptr)                                                     \
         : "r0", "r1", "r2", "r3", "r4", "memory");                      \
  } while (0)

#define PROD_ODD_0(hi, lo, f, ptr) \
  PROD_TAPS8("smull", "=&r", hi, lo, f, ptr, 4, 60, 52, 44, 36, 28, 20, 12)

#define PROD_ODD_A(hi, lo, f, ptr) \
  PROD_TAPS8("smlal", "+r", hi, lo, f, ptr, 4, 60, 52, 44, 36, 28, 20, 12)

#define PROD_EVEN_0(hi, lo, f, ptr) \
  PROD_TAPS8("smull", "=&r", hi, lo, f, ptr, 0, 56, 48, 40, 32, 24, 16, 8)

#define PROD_EVEN_A(hi, lo, f, ptr) \
  PROD_TAPS8("smlal", "+r", hi, lo, f, ptr, 0, 56, 48, 40, 32, 24, 16, 8)

#define PROD_EVENBACK_0(hi, lo, f, ptr) \
  PROD_TAPS8("smull", "=&r", hi, lo, f, ptr, \
             60, 68, 76, 84, 92, 100, 108, 116)

#define PROD_EVENBACK_A(hi, lo, f, ptr) \
  PROD_TAPS8("smlal", "+r", hi, lo, f, ptr, \
             60, 68, 76, 84, 92, 100, 108, 116)

#define PROD_ODDBACK_0(hi, lo, f, ptr) \
  PROD_TAPS8("smull", "=&r", hi, lo, f, ptr, \
             120, 64, 72, 80, 88, 96, 104, 112)

#define PROD_ODDBACK_A(hi, lo, f, ptr) \
  PROD_TAPS8("smlal", "+r", hi, lo, f, ptr, \
             120, 64, 72, 80, 88, 96, 104, 112)
1037
1038void synth_full1(mad_fixed_t *pcm, mad_fixed_t (*fo)[8], mad_fixed_t (*fe)[8],
1039 mad_fixed_t const (*D0ptr)[32],
1040 mad_fixed_t const (*D1ptr)[32]);
1041void synth_full2(mad_fixed_t *pcm, mad_fixed_t (*fo)[8], mad_fixed_t (*fe)[8],
1042 mad_fixed_t const (*D0ptr)[32],
1043 mad_fixed_t const (*D1ptr)[32]);
1044
1045/* This performs slower in IRAM on PP502x and there is no space in
1046 mpegplayer on the PP5002 */
1047#if !defined(CPU_PP502x) && !(CONFIG_CPU == PP5002 && defined(MPEGPLAYER))
1048static
1049void synth_full(struct mad_synth *synth, struct mad_frame const *frame,
1050 unsigned int nch, unsigned int ns) ICODE_ATTR;
1051#endif
1052static
1053void synth_full(struct mad_synth *synth, struct mad_frame const *frame,
1054 unsigned int nch, unsigned int ns)
1055{
1056 int p;
1057 unsigned int phase, ch, s;
1058 mad_fixed_t *pcm, (*filter)[2][2][16][8];
1059 mad_fixed_t const (*sbsample)[36][32];
1060 mad_fixed_t (*fe)[8], (*fx)[8], (*fo)[8];
1061 mad_fixed_t const (*D0ptr)[32], *ptr;
1062 mad_fixed_t const (*D1ptr)[32];
1063 mad_fixed64hi_t hi;
1064 mad_fixed64lo_t lo;
1065
1066 for (ch = 0; ch < nch; ++ch) {
1067 sbsample = &frame->sbsample[ch];
1068 filter = &synth->filter[ch];
1069 phase = synth->phase;
1070 pcm = synth->pcm.samples[ch];
1071
1072 for (s = 0; s < ns; ++s) {
1073 dct32((*sbsample)[s], phase >> 1,
1074 (*filter)[0][phase & 1], (*filter)[1][phase & 1]);
1075
1076 p = (phase - 1) & 0xf;
1077
1078 /* calculate 32 samples */
1079 fe = &(*filter)[0][ phase & 1][0];
1080 fx = &(*filter)[0][~phase & 1][0];
1081 fo = &(*filter)[1][~phase & 1][0];
1082
1083 D0ptr = (void*)&D[0][ p];
1084 D1ptr = (void*)&D[0][-p];
1085
1086 if(s & 1)
1087 {
1088 ptr = *D0ptr;
1089/*
1090 ML0(hi, lo, (*fx)[0], ptr[ 1]);
1091 MLA(hi, lo, (*fx)[1], ptr[15]);
1092 MLA(hi, lo, (*fx)[2], ptr[13]);
1093 MLA(hi, lo, (*fx)[3], ptr[11]);
1094 MLA(hi, lo, (*fx)[4], ptr[ 9]);
1095 MLA(hi, lo, (*fx)[5], ptr[ 7]);
1096 MLA(hi, lo, (*fx)[6], ptr[ 5]);
1097 MLA(hi, lo, (*fx)[7], ptr[ 3]);
1098*/
1099 PROD_ODD_0(hi, lo, *fx, ptr);
1100 MLN(hi, lo);
1101/*
1102 MLA(hi, lo, (*fe)[0], ptr[ 0]);
1103 MLA(hi, lo, (*fe)[1], ptr[14]);
1104 MLA(hi, lo, (*fe)[2], ptr[12]);
1105 MLA(hi, lo, (*fe)[3], ptr[10]);
1106 MLA(hi, lo, (*fe)[4], ptr[ 8]);
1107 MLA(hi, lo, (*fe)[5], ptr[ 6]);
1108 MLA(hi, lo, (*fe)[6], ptr[ 4]);
1109 MLA(hi, lo, (*fe)[7], ptr[ 2]);
1110*/
1111 PROD_EVEN_A(hi, lo, *fe, ptr);
1112 pcm[0] = SHIFT(MLZ(hi, lo));
1113 pcm += 16;
1114
1115 synth_full1(pcm, fo, fe, D0ptr, D1ptr);
1116 D0ptr += 15;
1117 D1ptr += 15;
1118 fo += 15;
1119 fe += 15;
1120
1121 ptr = *(D0ptr + 1);
1122 PROD_ODD_0(hi, lo, *fo, ptr);
1123/*
1124 ML0(hi, lo, (*fo)[0], ptr[ 1]);
1125 MLA(hi, lo, (*fo)[1], ptr[15]);
1126 MLA(hi, lo, (*fo)[2], ptr[13]);
1127 MLA(hi, lo, (*fo)[3], ptr[11]);
1128 MLA(hi, lo, (*fo)[4], ptr[ 9]);
1129 MLA(hi, lo, (*fo)[5], ptr[ 7]);
1130 MLA(hi, lo, (*fo)[6], ptr[ 5]);
1131 MLA(hi, lo, (*fo)[7], ptr[ 3]);
1132*/
1133 pcm[0] = SHIFT(-MLZ(hi, lo));
1134 }
1135 else
1136 {
1137 ptr = *D0ptr;
1138/*
1139 ML0(hi, lo, (*fx)[0], ptr[ 0]);
1140 MLA(hi, lo, (*fx)[1], ptr[14]);
1141 MLA(hi, lo, (*fx)[2], ptr[12]);
1142 MLA(hi, lo, (*fx)[3], ptr[10]);
1143 MLA(hi, lo, (*fx)[4], ptr[ 8]);
1144 MLA(hi, lo, (*fx)[5], ptr[ 6]);
1145 MLA(hi, lo, (*fx)[6], ptr[ 4]);
1146 MLA(hi, lo, (*fx)[7], ptr[ 2]);
1147*/
1148 PROD_EVEN_0(hi, lo, *fx, ptr);
1149 MLN(hi, lo);
1150/*
1151 MLA(hi, lo, (*fe)[0], ptr[ 1]);
1152 MLA(hi, lo, (*fe)[1], ptr[15]);
1153 MLA(hi, lo, (*fe)[2], ptr[13]);
1154 MLA(hi, lo, (*fe)[3], ptr[11]);
1155 MLA(hi, lo, (*fe)[4], ptr[ 9]);
1156 MLA(hi, lo, (*fe)[5], ptr[ 7]);
1157 MLA(hi, lo, (*fe)[6], ptr[ 5]);
1158 MLA(hi, lo, (*fe)[7], ptr[ 3]);
1159*/
1160 PROD_ODD_A(hi, lo, *fe, ptr);
1161 pcm[0] = SHIFT(MLZ(hi, lo));
1162 pcm += 16;
1163
1164 synth_full2(pcm, fo, fe, D0ptr, D1ptr);
1165 D0ptr += 15;
1166 D1ptr += 15;
1167 fo += 15;
1168 fe += 15;
1169
1170 ptr = *(D0ptr + 1);
1171/*
1172 ML0(hi, lo, (*fo)[0], ptr[ 0]);
1173 MLA(hi, lo, (*fo)[1], ptr[14]);
1174 MLA(hi, lo, (*fo)[2], ptr[12]);
1175 MLA(hi, lo, (*fo)[3], ptr[10]);
1176 MLA(hi, lo, (*fo)[4], ptr[ 8]);
1177 MLA(hi, lo, (*fo)[5], ptr[ 6]);
1178 MLA(hi, lo, (*fo)[6], ptr[ 4]);
1179 MLA(hi, lo, (*fo)[7], ptr[ 2]);
1180*/
1181 PROD_EVEN_0(hi, lo, *fo, ptr);
1182 pcm[0] = SHIFT(-MLZ(hi, lo));
1183 }
1184
1185 pcm += 16;
1186 phase = (phase + 1) % 16;
1187 }
1188 }
1189}
1190
1191# else
820 1192
821static 1193static
822void synth_full(struct mad_synth *synth, struct mad_frame const *frame, 1194void synth_full(struct mad_synth *synth, struct mad_frame const *frame,
@@ -1020,6 +1392,7 @@ void synth_full(struct mad_synth *synth, struct mad_frame const *frame,
1020 } 1392 }
1021 } 1393 }
1022} 1394}
1395
1023# endif 1396# endif
1024# endif 1397# endif
1025 1398
diff --git a/apps/codecs/libmad/synth_full_arm.S b/apps/codecs/libmad/synth_full_arm.S
new file mode 100644
index 0000000000..b880a7b3c6
--- /dev/null
+++ b/apps/codecs/libmad/synth_full_arm.S
@@ -0,0 +1,343 @@
1/***************************************************************************
2 * __________ __ ___.
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 * \/ \/ \/ \/ \/
8 * $Id$
9 *
10 * Copyright (C) 2007 by Tomasz Malesinski
11 *
12 * All files in this archive are subject to the GNU General Public License.
13 * See the file COPYING in the source tree root for full license agreement.
14 *
15 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
16 * KIND, either express or implied.
17 *
18 ****************************************************************************/
19
20#include "config.h"
21
22/* This performs slower in IRAM on PP502x and there is no space in
23 mpegplayer on the PP5002 */
24#if defined(CPU_PP502x) || (CONFIG_CPU == PP5002 && defined(MPEGPLAYER))
25 .section .text,"ax",%progbits
26#else
27 .section .icode,"ax",%progbits
28#endif
29
30 .global synth_full1
31 .global synth_full2
32
33 @ r0 = pcm
34 @ r1 = fo
35 @ r2 = fe
36 @ r3 = D0ptr
37 @ D1ptr is the fifth argument: passed on the stack, loaded into r4 below
/*
 * void synth_full1(mad_fixed_t *pcm, mad_fixed_t (*fo)[8],
 *                  mad_fixed_t (*fe)[8],
 *                  mad_fixed_t const (*D0ptr)[32],
 *                  mad_fixed_t const (*D1ptr)[32])
 *
 * On entry r0 = pcm, r1 = fo, r2 = fe, r3 = D0ptr; the fifth argument
 * (D1ptr) is on the stack and is fetched into r4 after the ten-register
 * push (hence the #40 offset).
 *
 * The loop runs with r5 = 15 down to 1.  Each pass builds two 64-bit sums:
 *   r7:r6 = -(next fo row . D0 row, byte offsets 4,60,52,44,36,28,20,12)
 *           +(next fe row . D0 row, byte offsets 0,56,48,40,32,24,16,8)
 *   r9:r8 =  (same fo row . D1 row, byte offsets 120,64,72,...,112)
 *           +(same fe row . D1 row, byte offsets 60,68,76,...,116)
 * and stores bits 16..47 of each, rounded with the last bit shifted out,
 * to pcm[-r5] and pcm[+r5] respectively.
 *
 * NOTE(review): sp is used as an ordinary data register inside the loop
 * (it is a destination of the ldmia instructions).  The real stack pointer
 * is parked in the single word synth_full_sp for the duration, so the
 * routine is not reentrant, and nothing may use this mode's stack while
 * the loop runs -- confirm that interrupt handlers on every target run on
 * their own stack.
 */
38synth_full1:
39 stmdb sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
40 ldr r4, [sp, #40]
41 ldr r5, =synth_full_sp
42 str sp, [r5]
43 mov r5, #15
/* Skip the first 8-word row of fe. */
44 add r2, r2, #32
45.l:
/* Advance both coefficient pointers by one 32-word row. */
46 add r3, r3, #128
47 add r4, r4, #128
48 ldmia r1!, {r10, r11, r12, lr}
49 ldr r7, [r3, #4]
50 smull r6, r7, r10, r7
51 ldr r9, [r4, #120]
52 smull r8, r9, r10, r9
53
54 ldr r10, [r3, #60]
55 smlal r6, r7, r11, r10
56 ldr r10, [r3, #52]
57 smlal r6, r7, r12, r10
58 ldr r10, [r3, #44]
59 smlal r6, r7, lr, r10
60
61 ldr r10, [r4, #64]
62 smlal r8, r9, r11, r10
63 ldr r10, [r4, #72]
64 smlal r8, r9, r12, r10
65 ldr r10, [r4, #80]
66 smlal r8, r9, lr, r10
67
68 ldmia r1!, {r11, r12, sp, lr}
69 ldr r10, [r3, #36]
70 smlal r6, r7, r11, r10
71 ldr r10, [r3, #28]
72 smlal r6, r7, r12, r10
73 ldr r10, [r3, #20]
74 smlal r6, r7, sp, r10
75 ldr r10, [r3, #12]
76 smlal r6, r7, lr, r10
77
78 ldr r10, [r4, #88]
79 smlal r8, r9, r11, r10
80 ldr r10, [r4, #96]
81 smlal r8, r9, r12, r10
82 ldr r10, [r4, #104]
83 smlal r8, r9, sp, r10
84 ldr r10, [r4, #112]
85 smlal r8, r9, lr, r10
86
/* Negate the 64-bit value in r7:r6. */
87 rsbs r6, r6, #0
88 rsc r7, r7, #0
89
90 ldmia r2!, {r11, r12, sp, lr}
91
92 ldr r10, [r3, #0]
93 smlal r6, r7, r11, r10
94 ldr r10, [r3, #56]
95 smlal r6, r7, r12, r10
96 ldr r10, [r3, #48]
97 smlal r6, r7, sp, r10
98 ldr r10, [r3, #40]
99 smlal r6, r7, lr, r10
100
101 ldr r10, [r4, #60]
102 smlal r8, r9, r11, r10
103 ldr r10, [r4, #68]
104 smlal r8, r9, r12, r10
105 ldr r10, [r4, #76]
106 smlal r8, r9, sp, r10
107 ldr r10, [r4, #84]
108 smlal r8, r9, lr, r10
109
110 ldmia r2!, {r11, r12, sp, lr}
111 ldr r10, [r3, #32]
112 smlal r6, r7, r11, r10
113 ldr r10, [r3, #24]
114 smlal r6, r7, r12, r10
115 ldr r10, [r3, #16]
116 smlal r6, r7, sp, r10
117 ldr r10, [r3, #8]
118 smlal r6, r7, lr, r10
119
120 ldr r10, [r4, #92]
121 smlal r8, r9, r11, r10
122 ldr r10, [r4, #100]
123 smlal r8, r9, r12, r10
124 ldr r10, [r4, #108]
125 smlal r8, r9, sp, r10
126 ldr r10, [r4, #116]
127 smlal r8, r9, lr, r10
128
/* Rounded 64-bit >> 16, stored at pcm[-r5]. */
129 movs r6, r6, lsr #16
130 adc r6, r6, r7, lsl #16
131 str r6, [r0, -r5, lsl #2]
132
/* Rounded 64-bit >> 16, stored at pcm[+r5]. */
133 movs r8, r8, lsr #16
134 adc r8, r8, r9, lsl #16
135 str r8, [r0, r5, lsl #2]
136
137 subs r5, r5, #1
138 bne .l
139
/* Recover the real stack pointer before popping. */
140 ldr r5, =synth_full_sp
141 ldr sp, [r5]
142 ldmia sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
143
/*
 * void synth_full2(mad_fixed_t *pcm, mad_fixed_t (*fo)[8],
 *                  mad_fixed_t (*fe)[8],
 *                  mad_fixed_t const (*D0ptr)[32],
 *                  mad_fixed_t const (*D1ptr)[32])
 *
 * Same structure as synth_full1 with the two coefficient tap sets swapped.
 * On entry r0 = pcm, r1 = fo, r2 = fe, r3 = D0ptr; D1ptr is fetched from
 * the stack into r4 after the ten-register push.
 *
 * The loop runs with r5 = 15 down to 1.  Each pass builds two 64-bit sums:
 *   r7:r6 = -(next fo row . D0 row, byte offsets 0,56,48,40,32,24,16,8)
 *           +(next fe row . D0 row, byte offsets 4,60,52,44,36,28,20,12)
 *   r9:r8 =  (same fo row . D1 row, byte offsets 60,68,76,...,116)
 *           +(same fe row . D1 row, byte offsets 120,64,72,...,112)
 * and stores bits 16..47 of each, rounded with the last bit shifted out,
 * to pcm[-r5] and pcm[+r5] respectively.
 *
 * NOTE(review): sp is used as an ordinary data register inside the loop;
 * the real stack pointer is parked in the single word synth_full_sp, so
 * the routine is not reentrant and this mode's stack must not be used
 * while the loop runs -- confirm for every target.
 */
144synth_full2:
145 stmdb sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
146 ldr r4, [sp, #40]
147 ldr r5, =synth_full_sp
148 str sp, [r5]
149 mov r5, #15
/* Skip the first 8-word row of fe. */
150 add r2, r2, #32
151.l2:
/* Advance both coefficient pointers by one 32-word row. */
152 add r3, r3, #128
153 add r4, r4, #128
154 ldmia r1!, {r10, r11, r12, lr}
155 ldr r7, [r3, #0]
156 smull r6, r7, r10, r7
157 ldr r9, [r4, #60]
158 smull r8, r9, r10, r9
159
160 ldr r10, [r3, #56]
161 smlal r6, r7, r11, r10
162 ldr r10, [r3, #48]
163 smlal r6, r7, r12, r10
164 ldr r10, [r3, #40]
165 smlal r6, r7, lr, r10
166
167 ldr r10, [r4, #68]
168 smlal r8, r9, r11, r10
169 ldr r10, [r4, #76]
170 smlal r8, r9, r12, r10
171 ldr r10, [r4, #84]
172 smlal r8, r9, lr, r10
173
174 ldmia r1!, {r11, r12, sp, lr}
175 ldr r10, [r3, #32]
176 smlal r6, r7, r11, r10
177 ldr r10, [r3, #24]
178 smlal r6, r7, r12, r10
179 ldr r10, [r3, #16]
180 smlal r6, r7, sp, r10
181 ldr r10, [r3, #8]
182 smlal r6, r7, lr, r10
183
184 ldr r10, [r4, #92]
185 smlal r8, r9, r11, r10
186 ldr r10, [r4, #100]
187 smlal r8, r9, r12, r10
188 ldr r10, [r4, #108]
189 smlal r8, r9, sp, r10
190 ldr r10, [r4, #116]
191 smlal r8, r9, lr, r10
192
/* Negate the 64-bit value in r7:r6. */
193 rsbs r6, r6, #0
194 rsc r7, r7, #0
195
196 ldmia r2!, {r11, r12, sp, lr}
197
198 ldr r10, [r3, #4]
199 smlal r6, r7, r11, r10
200 ldr r10, [r3, #60]
201 smlal r6, r7, r12, r10
202 ldr r10, [r3, #52]
203 smlal r6, r7, sp, r10
204 ldr r10, [r3, #44]
205 smlal r6, r7, lr, r10
206
207 ldr r10, [r4, #120]
208 smlal r8, r9, r11, r10
209 ldr r10, [r4, #64]
210 smlal r8, r9, r12, r10
211 ldr r10, [r4, #72]
212 smlal r8, r9, sp, r10
213 ldr r10, [r4, #80]
214 smlal r8, r9, lr, r10
215
216 ldmia r2!, {r11, r12, sp, lr}
217 ldr r10, [r3, #36]
218 smlal r6, r7, r11, r10
219 ldr r10, [r3, #28]
220 smlal r6, r7, r12, r10
221 ldr r10, [r3, #20]
222 smlal r6, r7, sp, r10
223 ldr r10, [r3, #12]
224 smlal r6, r7, lr, r10
225
226 ldr r10, [r4, #88]
227 smlal r8, r9, r11, r10
228 ldr r10, [r4, #96]
229 smlal r8, r9, r12, r10
230 ldr r10, [r4, #104]
231 smlal r8, r9, sp, r10
232 ldr r10, [r4, #112]
233 smlal r8, r9, lr, r10
234
/* Rounded 64-bit >> 16, stored at pcm[-r5]. */
235 movs r6, r6, lsr #16
236 adc r6, r6, r7, lsl #16
237 str r6, [r0, -r5, lsl #2]
238
/* Rounded 64-bit >> 16, stored at pcm[+r5]. */
239 movs r8, r8, lsr #16
240 adc r8, r8, r9, lsl #16
241 str r8, [r0, r5, lsl #2]
242
243 subs r5, r5, #1
244 bne .l2
245
/* Recover the real stack pointer before popping. */
246 ldr r5, =synth_full_sp
247 ldr sp, [r5]
248 ldmia sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
249
250 .global III_aliasreduce
251
/*
 * void III_aliasreduce(mad_fixed_t xr[576], int lines)
 *
 * r0 = xr, r1 = lines.  r1 is turned into the end pointer xr + lines and
 * r0 is stepped through the boundaries xr + 18, xr + 36, ... (72 bytes
 * apart).  At each boundary the eight words below it and the eight words
 * at/above it are combined pairwise, working outwards from the boundary,
 * using the eight coefficient pairs in csa.  With a = word below,
 * b = word above and (cs, ca) the coefficient pair, the new values are
 *     b' = (b * cs + a * ca) >> 28   (rounded)
 *     a' = (a * cs - b * ca) >> 28   (rounded)
 * Two such pairs are handled per inner-loop pass (r2 counts 8, 6, 4, 2).
 *
 * NOTE(review): the boundary test is at the bottom of the outer loop, so
 * the first boundary (xr + 18) is always processed, even if lines <= 18.
 */
252III_aliasreduce:
253 stmdb sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
254 add r1, r0, r1, lsl #2
255 add r0, r0, #72
256.arl1:
257 mov r2, #8
258 mov r3, r0 @ a
259 mov r4, r0 @ b
260 ldr r5, =csa @ cs/ca
261.arl2:
/* r6, r12 = the two words just below r3; r7, lr = the two words at r4. */
262 ldmdb r3, {r6, r12}
263 ldmia r4, {r7, lr}
264
/* Inner pair (r12 below, r7 above): r10 = new upper value. */
265 ldmia r5!, {r8, r9}
266 smull r10, r11, r7, r8
267 smlal r10, r11, r12, r9
268 movs r10, r10, lsr #28
269 adc r10, r10, r11, lsl #4
270
/* r11 = new lower value of the inner pair. */
271 rsb r7, r7, #0
272 smull r11, r8, r12, r8
273 smlal r11, r8, r7, r9
274 movs r11, r11, lsr #28
275 adc r11, r11, r8, lsl #4
276
/* Outer pair (r6 below, lr above): r12 = new upper value. */
277 ldmia r5!, {r8, r9}
278 smull r12, r7, lr, r8
279 smlal r12, r7, r6, r9
280 movs r12, r12, lsr #28
281 adc r12, r12, r7, lsl #4
282 stmia r4!, {r10, r12}
283
/* r7 = new lower value of the outer pair. */
284 rsb lr, lr, #0
285 smull r7, r10, r6, r8
286 smlal r7, r10, lr, r9
287 movs r7, r7, lsr #28
288 adc r7, r7, r10, lsl #4
289 stmdb r3!, {r7, r11}
290
291 subs r2, r2, #2
292 bne .arl2
293 add r0, r0, #72
294 cmp r0, r1
295 blo .arl1
296 ldmia sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
297
/* Eight coefficient pairs read sequentially by III_aliasreduce (two words
   per butterfly; the loop's own comment labels them cs/ca).  Products are
   shifted right by 28, so the values carry 28 fractional bits. */
298csa:
299 .word +0x0db84a81
300 .word -0x083b5fe7
301 .word +0x0e1b9d7f
302 .word -0x078c36d2
303 .word +0x0f31adcf
304 .word -0x05039814
305 .word +0x0fbba815
306 .word -0x02e91dd1
307 .word +0x0feda417
308 .word -0x0183603a
309 .word +0x0ffc8fc8
310 .word -0x00a7cb87
311 .word +0x0fff964c
312 .word -0x003a2847
313 .word +0x0ffff8d3
314 .word -0x000f27b4
315
316 .global III_overlap
/*
 * void III_overlap(mad_fixed_t const output[36], mad_fixed_t overlap[18],
 *                  mad_fixed_t sample[18][32], unsigned int sb)
 *
 * r0 = output, r1 = overlap, r2 = sample, r3 = sb.
 *
 * First loop (6 passes, 3 words each): sample[i][sb] = output[i] +
 * overlap[i] for i = 0..17; r2 starts at &sample[0][sb] and advances by
 * one 32-word row (128 bytes) per store.
 * Afterwards overlap[0..17] is overwritten with output[18..35], copied in
 * three groups of six words.
 */
317III_overlap:
318 stmdb sp!, {r4, r5, r6, r7, r8, lr}
319 add r2, r2, r3, lsl #2
320 mov r3, #6
321.ol:
322 ldmia r0!, {r4, r5, r6}
323 ldmia r1!, {r7, r8, lr}
324 add r4, r4, r7
325 add r5, r5, r8
326 add r6, r6, lr
327 str r4, [r2], #128
328 str r5, [r2], #128
329 str r6, [r2], #128
330 subs r3, r3, #1
331 bne .ol
/* Rewind r1 to the start of overlap (18 words were consumed above). */
332 sub r1, r1, #72
333 ldmia r0!, {r4, r5, r6, r7, r8, lr}
334 stmia r1!, {r4, r5, r6, r7, r8, lr}
335 ldmia r0!, {r4, r5, r6, r7, r8, lr}
336 stmia r1!, {r4, r5, r6, r7, r8, lr}
337 ldmia r0!, {r4, r5, r6, r7, r8, lr}
338 stmia r1!, {r4, r5, r6, r7, r8, lr}
339 ldmia sp!, {r4, r5, r6, r7, r8, pc}
340
/* One word in .ibss where synth_full1 and synth_full2 keep the caller's
   stack pointer while they use sp as a data register. */
341 .section .ibss,"aw",%nobits
342synth_full_sp:
343 .space 4