summaryrefslogtreecommitdiff
path: root/apps/recorder/jpeg_idct_arm.S
diff options
context:
space:
mode:
Diffstat (limited to 'apps/recorder/jpeg_idct_arm.S')
-rw-r--r--apps/recorder/jpeg_idct_arm.S287
1 files changed, 287 insertions, 0 deletions
diff --git a/apps/recorder/jpeg_idct_arm.S b/apps/recorder/jpeg_idct_arm.S
new file mode 100644
index 0000000000..2ef868e753
--- /dev/null
+++ b/apps/recorder/jpeg_idct_arm.S
@@ -0,0 +1,287 @@
1/***************************************************************************
2* __________ __ ___.
3* Open \______ \ ____ ____ | | _\_ |__ _______ ___
4* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7* \/ \/ \/ \/ \/
8* $Id$
9*
10* JPEG assembly IDCT
11*
12* Copyright (C) 2009 Andrew Mahone
13* asm versions of the C IDCT algorithms used with jpeg_load.c
14*
15* This program is free software; you can redistribute it and/or
16* modify it under the terms of the GNU General Public License
17* as published by the Free Software Foundation; either version 2
18* of the License, or (at your option) any later version.
19*
20* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
21* KIND, either express or implied.
22*
23****************************************************************************/
24#include "config.h"
25
26 .section .text
27 .align 2
28 .global jpeg_idct4v
29 .type jpeg_idct4v, %function
30 .global jpeg_idct4h
31 .type jpeg_idct4h, %function
32
/*
 * jpeg_idct4v: 4-point vertical IDCT pass, done in place on 16-bit data.
 *
 * In:  r0 = pointer to the first column; the four signed 16-bit inputs
 *           d0..d3 of a column are read from byte offsets 0, 16, 32, 48
 *      r1 = end pointer for r0 (loop termination, see each variant)
 * Out: the four inputs of every processed column are overwritten with
 *      o0..o3 at the same offsets
 *
 * Per column:
 *   tmp10 = (d0 + d2) << 2              tmp12 = (d0 - d2) << 2
 *   z1    = (d1 + d3) * 4433 + 1024     (1024 = rounding for the >> 11)
 *   tmp2  = z1 + d1 * 6270              tmp0  = z1 - d3 * 15137
 *   o0 = tmp10 + (tmp2 >> 11)           o3 = tmp10 - (tmp2 >> 11)
 *   o1 = tmp12 + (tmp0 >> 11)           o2 = tmp12 - (tmp0 >> 11)
 *
 * Three variants are selected by ARM_ARCH (presumably provided through
 * config.h). Each variant saves and restores the r4+ registers it uses
 * and returns by popping the saved lr into pc.
 */
33jpeg_idct4v:
34#if ARM_ARCH < 5
    /* Variant using only mla and shift/add; one column per iteration. */
35 stmdb sp!, { r4-r7, lr }
36 ldr r14, =-15137
37 ldr r12, =6270
381:
39 ldrsh r4, [r0, #32] /* r4 = d2 */
40 ldrsh r2, [r0] /* r2 = d0 */
41 ldrsh r5, [r0, #48] /* r5 = d3 */
42 ldrsh r3, [r0, #16] /* r3 = d1 */
43 add r6, r2, r4 /* r6 = tmp10 >> 2 = d0 + d2 */
44 sub r2, r2, r4 /* r2 = tmp12 >> 2 = d0 - d2 */
45 add r4, r3, r5 /* r4 = z1 = d1 + d3 */
46 add r7, r4, r4, lsl #3 /* r7 = z1 * 9 */
47 rsb r4, r4, r7, lsl #4 /* r4 = z1 * (9 * 16 - 1) = z1 * 143 */
48 rsb r4, r4, r4, lsl #5 /* r4 = z1 * 143 * 31 = z1 * 4433 */
49 add r4, r4, #1024 /* rounding term for the asr #11 below */
50 mla r3, r12, r3, r4 /* r3 = tmp2 = z1 + d1 * 6270 */
51 mla r5, r14, r5, r4 /* r5 = tmp0 = z1 - d3 * 15137 */
52 mov r6, r6, lsl #2 /* r6 = tmp10 */
53 mov r2, r2, lsl #2 /* r2 = tmp12 */
54 add r7, r6, r3, asr #11 /* r7 = o0 = tmp10 + (tmp2 >> 11) */
55 sub r3, r6, r3, asr #11 /* r3 = o3 = tmp10 - (tmp2 >> 11) */
56 add r6, r2, r5, asr #11 /* r6 = o1 = tmp12 + (tmp0 >> 11) */
57 sub r2, r2, r5, asr #11 /* r2 = o2 = tmp12 - (tmp0 >> 11) */
58 strh r7, [r0]
59 strh r3, [r0, #48]
60 strh r6, [r0, #16]
61 strh r2, [r0, #32]
62 add r0, r0, #2 /* advance to the next 16-bit column */
63 teq r0, r1 /* loop until r0 is exactly equal to r1 */
64 bne 1b
65 ldmia sp!, { r4-r7, pc }
66#elif ARM_ARCH < 6
    /* Variant using the 16x16 multiply-accumulate instructions; one column
       per iteration. r12 = 3302955134 = 0xC4DF187E packs both multipliers:
       top halfword = -15137, bottom halfword = 6270. */
67 stmdb sp!, { r4-r8, lr }
68 ldr r8, =1024
69 ldr r14, =4433
70 ldr r12, =3302955134
711:
72 ldrsh r5, [r0, #48] /* r5 = d3 */
73 ldrsh r3, [r0, #16] /* r3 = d1 */
74 ldrsh r4, [r0, #32] /* r4 = d2 */
75 ldrsh r2, [r0] /* r2 = d0 */
76 add r6, r3, r5 /* r6 = z1 = d1 + d3 */
77 add r7, r2, r4 /* r7 = tmp10 >> 2 = d0 + d2 */
78 smlabb r6, r14, r6, r8 /* r6 = z1 = z1 * 4433 + 1024 */
79 sub r2, r2, r4 /* r2 = tmp12 >> 2 = d0 - d2 */
80 smlabb r3, r12, r3, r6 /* r3 = tmp2 = z1 + d1 * 6270 */
81 smlatb r5, r12, r5, r6 /* r5 = tmp0 = z1 - d3 * 15137 */
82 mov r7, r7, lsl #2 /* r7 = tmp10 */
83 mov r2, r2, lsl #2 /* r2 = tmp12 */
84 add r4, r7, r3, asr #11 /* r4 = o0 */
85 sub r7, r7, r3, asr #11 /* r7 = o3 */
86 add r3, r2, r5, asr #11 /* r3 = o1 */
87 sub r2, r2, r5, asr #11 /* r2 = o2 */
88 strh r4, [r0]
89 strh r7, [r0, #48]
90 strh r3, [r0, #16]
91 strh r2, [r0, #32]
92 add r0, r0, #2 /* advance to the next 16-bit column */
93 teq r0, r1 /* loop until r0 is exactly equal to r1 */
94 bne 1b
95 ldmia sp!, { r4-r8, pc }
96#else
    /* Variant using the parallel halfword instructions: each 32-bit load
       fetches two adjacent columns, so two columns are handled per
       iteration. r12 packs -15137 (top) and 6270 (bottom) as above.
       NOTE(review): the word loads/stores presumably rely on r0 being
       word aligned - confirm against the caller. */
97 stmdb sp!, { r4-r10, lr }
98 ldr r2, =1024
99 ldr r3, =4433
100 ldr r12, =3302955134
1011:
102 ldr r6, [r0, #32] /* r6 = d2 for both columns */
103 ldr r4, [r0] /* r4 = d0 for both columns */
104 ldr r7, [r0, #48] /* r7 = d3 for both columns */
105 ldr r5, [r0, #16] /* r5 = d1 for both columns */
106 /* this part is being done in parallel on two columns */
107 sadd16 r8, r4, r6 /* r8 = d0 + d2 */
108 ssub16 r4, r4, r6 /* r4 = d0 - d2 */
109 sadd16 r6, r5, r7 /* r6 = d1 + d3 */
110 /* there is no parallel shift operation, but we can fake it with bic
111 and lsl */
    /* clearing bits 14-15 of the low halfword keeps them from spilling into
       the high halfword when the whole register is shifted left by 2 */
112 bic r8, r8, #0xc000
113 bic r4, r4, #0xc000
114 /* multiplication expands values beyond 16 bits, so this part needs to be
115 split. the values will be merged below so that the rest of the addition
116 can be done in parallel */
117 smlabb r9, r3, r6, r2 /* r9 = z1[0] = (d1 + d3) * 4433 + 1024 */
118 smlabt r6, r3, r6, r2 /* r6 = z1[1] = (d1 + d3) * 4433 + 1024 */
119 smlabb r10, r12, r5, r9 /* r10 = tmp2[0] = z1 + d1 * 6270 */
120 smlatb r14, r12, r7, r9 /* r14 = tmp0[0] = z1 - d3 * 15137 */
121 smlabt r5, r12, r5, r6 /* r5 = tmp2[1] */
122 smlatt r6, r12, r7, r6 /* r6 = tmp0[1] */
123 mov r8, r8, lsl #2 /* complete the parallel shift started */
124 mov r4, r4, lsl #2 /* with the earlier bic instructions */
125 /* tmp2 are in r10, r5; tmp0 are in r14, r6 */
126 /* tmp10, tmp12 are in r8, r4 */
127 mov r10, r10, asr #11 /* low-column tmp2 >> 11 */
128 mov r14, r14, asr #11 /* low-column tmp0 >> 11 */
    /* lsl #5 followed by taking the top halfword equals asr #11, so the
       high-column value lands in the top half already shifted */
129 pkhbt r5, r10, r5, lsl #5 /* parallel tmp2 */
130 pkhbt r6, r14, r6, lsl #5 /* parallel tmp0 */
131 sadd16 r10, r8, r5 /* o0 = tmp10 + tmp2 */
132 ssub16 r5, r8, r5 /* o3 = tmp10 - tmp2 */
133 sadd16 r14, r4, r6 /* o1 = tmp12 + tmp0 */
134 ssub16 r6, r4, r6 /* o2 = tmp12 - tmp0 */
135 str r10, [r0]
136 str r5, [r0, #48]
137 str r14, [r0, #16]
138 str r6, [r0, #32]
139 add r0, r0, #4 /* advance by two 16-bit columns */
140 cmp r0, r1 /* loop while r0 < r1 (unsigned) */
141 bcc 1b
142 ldmia sp!, { r4-r10, pc }
143#endif
144 .size jpeg_idct4v, .-jpeg_idct4v
145
/*
 * jpeg_idct4h: 4-point horizontal IDCT pass producing clamped 8-bit output.
 *
 * In:  r0 = pointer to the first input row; the four signed 16-bit inputs
 *           d0..d3 of a row are read from byte offsets 0, 2, 4, 6 and
 *           r0 advances by 16 bytes per row
 *      r1 = pointer to the output bytes of the first row
 *      r2 = end pointer for r0; the loop runs until r0 equals r2
 *      r3 = value added to r1 after every row (output row stride in bytes)
 * Out: four bytes per row, each clamped to 0..255, stored at
 *      r1 + 0/4/8/12 when HAVE_LCD_COLOR is defined, r1 + 0/1/2/3 otherwise
 *
 * Per row:
 *   d0   += 4112          (4112 = 128.5 * 32: after the net >> 5 applied to
 *                          d0 this contributes +128 plus a rounding half)
 *   tmp10 = (d0 + d2) << 13             tmp12 = (d0 - d2) << 13
 *   z1    = (d1 + d3) * 4433
 *   tmp2  = z1 + d1 * 6270              tmp0  = z1 - d3 * 15137
 *   o0 = (tmp10 + tmp2) >> 18           o3 = (tmp10 - tmp2) >> 18
 *   o1 = (tmp12 + tmp0) >> 18           o2 = (tmp12 - tmp0) >> 18
 *
 * Three variants are selected by ARM_ARCH (presumably provided through
 * config.h). Each variant saves and restores the r4+ registers it uses
 * and returns by popping the saved lr into pc.
 */
146jpeg_idct4h:
147#if ARM_ARCH < 5
    /* Variant using only mla and shift/add. */
148 stmdb sp!, { r4-r10, lr }
149 ldr r10, =-15137
150 ldr r14, =4112
151 ldr r12, =6270
1521:
153 ldrsh r4, [r0] /* r4 = d0 */
154 ldrsh r6, [r0, #4] /* r6 = d2 */
155 ldrsh r7, [r0, #6] /* r7 = d3 */
156 ldrsh r5, [r0, #2] /* r5 = d1 */
157 add r4, r4, r14 /* d0 += 4112 (bias + rounding) */
158 add r8, r4, r6 /* r8 = tmp10 >> 13 = d0 + d2 */
159 sub r4, r4, r6 /* r4 = tmp12 >> 13 = d0 - d2 */
160 add r6, r5, r7 /* r6 = z1 = d1 + d3 */
161 add r9, r6, r6, lsl #3 /* r9 = z1 * 9 */
162 rsb r6, r6, r9, lsl #4 /* r6 = z1 * 143 */
163 rsb r6, r6, r6, lsl #5 /* r6 = z1 * 143 * 31 = z1 * 4433 */
164 mla r7, r10, r7, r6 /* r7 = tmp0 = z1 - d3 * 15137 */
165 mla r5, r12, r5, r6 /* r5 = tmp2 = z1 + d1 * 6270 */
166 add r9, r5, r8, lsl #13 /* r9 = o0 = tmp10 + tmp2 */
167 rsb r5, r5, r8, lsl #13 /* r5 = o3 = tmp10 - tmp2 */
168 add r8, r7, r4, lsl #13 /* r8 = o1 = tmp12 + tmp0 */
169 rsb r4, r7, r4, lsl #13 /* r4 = o2 = tmp12 - tmp0 */
170 mov r9, r9, asr #18
171 mov r8, r8, asr #18
172 mov r4, r4, asr #18
173 mov r5, r5, asr #18
    /* Clamp to 0..255: the unsigned "hi" test catches both values above 255
       and negative values; mvn of the sign mask then yields 0 for negative
       inputs and all ones (stored as 0xff by strb) for positive ones. */
174 cmp r9, #255
175 mvnhi r9, r9, asr #31
176 cmp r8, #255
177 mvnhi r8, r8, asr #31
178 cmp r4, #255
179 mvnhi r4, r4, asr #31
180 cmp r5, #255
181 mvnhi r5, r5, asr #31
182#ifdef HAVE_LCD_COLOR
183 strb r9, [r1]
184 strb r8, [r1, #4]
185 strb r4, [r1, #8]
186 strb r5, [r1, #12]
187#else
188 strb r9, [r1]
189 strb r8, [r1, #1]
190 strb r4, [r1, #2]
191 strb r5, [r1, #3]
192#endif
193 add r0, r0, #16 /* next input row */
194 add r1, r1, r3 /* next output row */
195 teq r0, r2
196 bne 1b
197 ldmia sp!, { r4-r10, pc }
198#elif ARM_ARCH < 6
    /* Variant using the 16x16 multiply-accumulate instructions.
       r12 = 3302955134 = 0xC4DF187E packs both multipliers:
       top halfword = -15137, bottom halfword = 6270. */
199 stmdb sp!, { r4-r10, lr }
200 ldr r10, =4433
201 ldr r14, =4112
202 ldr r12, =3302955134
2031:
204 ldrsh r7, [r0, #6] /* r7 = d3 */
205 ldrsh r5, [r0, #2] /* r5 = d1 */
206 ldrsh r4, [r0] /* r4 = d0 */
207 ldrsh r6, [r0, #4] /* r6 = d2 */
208 add r8, r5, r7 /* r8 = z1 = d1 + d3 */
209 add r4, r4, r14 /* d0 += 4112 (bias + rounding) */
210 smulbb r8, r10, r8 /* z1 *= 4433 */
211 add r9, r4, r6 /* r9 = tmp10 >> 13 = d0 + d2 */
212 smlabb r5, r12, r5, r8 /* r5 = tmp2 = z1 + d1 * 6270 */
213 smlatb r7, r12, r7, r8 /* r7 = tmp0 = z1 - d3 * 15137 */
    /* d0 lives in r4; r5 already holds tmp2 here and must not be used */
214 sub r4, r4, r6 /* r4 = tmp12 >> 13 = d0 - d2 */
215 add r6, r5, r9, lsl #13 /* r6 = o0 = tmp10 + tmp2 */
216 rsb r9, r5, r9, lsl #13 /* r9 = o3 = tmp10 - tmp2 */
217 add r5, r7, r4, lsl #13 /* r5 = o1 = tmp12 + tmp0 */
218 rsb r4, r7, r4, lsl #13 /* r4 = o2 = tmp12 - tmp0 */
219 mov r6, r6, asr #18
220 mov r5, r5, asr #18
221 mov r4, r4, asr #18
222 mov r9, r9, asr #18
    /* Clamp to 0..255: "hi" is an unsigned test, so it also fires for
       negative values; mvn of the sign mask gives 0 or all ones. */
223 cmp r6, #255
224 mvnhi r6, r6, asr #31
225 cmp r5, #255
226 mvnhi r5, r5, asr #31
227 cmp r4, #255
228 mvnhi r4, r4, asr #31
229 cmp r9, #255
230 mvnhi r9, r9, asr #31
231#ifdef HAVE_LCD_COLOR
232 strb r6, [r1]
233 strb r5, [r1, #4]
234 strb r4, [r1, #8]
235 strb r9, [r1, #12]
236#else
237 strb r6, [r1]
238 strb r5, [r1, #1]
239 strb r4, [r1, #2]
240 strb r9, [r1, #3]
241#endif
242 add r0, r0, #16 /* next input row */
243 add r1, r1, r3 /* next output row */
244 teq r0, r2
245 bne 1b
246 ldmia sp!, { r4-r10, pc }
247#else
    /* Variant using the parallel halfword instructions and usat.
       The row is fetched as two words: r4 = d1:d0, r5 = d3:d2 (high:low
       halfword). r12 packs -15137 (top) and 6270 (bottom) as above. */
248 stmdb sp!, { r4-r9, lr }
249 ldr r9, =4433
250 ldr r14, =4112
251 ldr r12, =3302955134
2521:
253 ldmia r0, { r4-r5 }
    /* r14 has a zero top halfword, so only d0 (low half) receives the bias */
254 sadd16 r4, r4, r14
255 sadd16 r6, r4, r5 /* r6lo = d0 + d2, r6hi = d1 + d3 */
256 ssub16 r7, r4, r5 /* r7lo = d0 - d2 */
257 smulbt r8, r9, r6 /* r8 = z1 = (d1 + d3) * 4433 */
258 sxth r6, r6 /* r6 = tmp10 >> 13 = d0 + d2, sign-extended */
259 smlabt r4, r12, r4, r8 /* r4 = tmp2 = z1 + d1 * 6270 */
260 smlatt r5, r12, r5, r8 /* r5 = tmp0 = z1 - d3 * 15137 */
261 sxth r7, r7 /* r7 = tmp12 >> 13 = d0 - d2, sign-extended */
262 add r8, r4, r6, lsl #13 /* r8 = o0 = tmp10 + tmp2 */
263 rsb r6, r4, r6, lsl #13 /* r6 = o3 = tmp10 - tmp2 */
264 add r4, r5, r7, lsl #13 /* r4 = o1 = tmp12 + tmp0 */
265 rsb r5, r5, r7, lsl #13 /* r5 = o2 = tmp12 - tmp0 */
    /* shift down by 18 and saturate to the unsigned 8-bit range in one step */
266 usat r8, #8, r8, asr #18
267 usat r6, #8, r6, asr #18
268 usat r4, #8, r4, asr #18
269 usat r5, #8, r5, asr #18
270#ifdef HAVE_LCD_COLOR
271 strb r8, [r1]
272 strb r6, [r1, #12]
273 strb r4, [r1, #4]
274 strb r5, [r1, #8]
275#else
276 strb r8, [r1]
277 strb r6, [r1, #3]
278 strb r4, [r1, #1]
279 strb r5, [r1, #2]
280#endif
281 add r0, r0, #16 /* next input row */
282 add r1, r1, r3 /* next output row */
283 teq r0, r2
284 bne 1b
285 ldmia sp!, { r4-r9, pc }
286#endif
287 .size jpeg_idct4h, .-jpeg_idct4h