summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndrew Mahone <andrew.mahone@gmail.com>2009-06-19 02:56:00 +0000
committerAndrew Mahone <andrew.mahone@gmail.com>2009-06-19 02:56:00 +0000
commit6a0d931f383259b4b82fcfd1cc87700f53bbcb02 (patch)
tree04960b8122f35101bb31603af084536f307e0ae1
parent4c58ad26ba462309f95790c32421130a73909f05 (diff)
downloadrockbox-6a0d931f383259b4b82fcfd1cc87700f53bbcb02.tar.gz
rockbox-6a0d931f383259b4b82fcfd1cc87700f53bbcb02.zip
Core JPEG decoder improvements:
For >=8-point vertical IDCT, transpose the coefficients while decoding them, so that the vertical IDCT can read in rows rather than columns. This improves speed a bit for these sizes even using the C IDCT. Remove inline ARM asm, replacing it with an external file containing pure asm IDCT functions. Add jpeg_ prefix to JPEG IDCT functions since some of them will now be visible globally. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21345 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--apps/SOURCES3
-rw-r--r--apps/plugins/lib/SOURCES3
-rw-r--r--apps/plugins/lib/pluginlib_jpeg_idct_arm.S24
-rw-r--r--apps/recorder/jpeg_idct_arm.S287
-rw-r--r--apps/recorder/jpeg_load.c367
5 files changed, 465 insertions, 219 deletions
diff --git a/apps/SOURCES b/apps/SOURCES
index 527b0b20a9..4caf32d822 100644
--- a/apps/SOURCES
+++ b/apps/SOURCES
@@ -104,6 +104,9 @@ recorder/resize.c
104#endif 104#endif
105#ifdef HAVE_JPEG 105#ifdef HAVE_JPEG
106recorder/jpeg_load.c 106recorder/jpeg_load.c
107#ifdef CPU_ARM
108recorder/jpeg_idct_arm.S
109#endif
107#endif 110#endif
108#ifdef HAVE_ALBUMART 111#ifdef HAVE_ALBUMART
109recorder/albumart.c 112recorder/albumart.c
diff --git a/apps/plugins/lib/SOURCES b/apps/plugins/lib/SOURCES
index 7211109271..2ed38c4f8b 100644
--- a/apps/plugins/lib/SOURCES
+++ b/apps/plugins/lib/SOURCES
@@ -27,6 +27,9 @@ playergfx.c
27profile_plugin.c 27profile_plugin.c
28#endif 28#endif
29#ifdef HAVE_LCD_BITMAP 29#ifdef HAVE_LCD_BITMAP
30#ifdef CPU_ARM
31pluginlib_jpeg_idct_arm.S
32#endif
30pluginlib_jpeg_mem.c 33pluginlib_jpeg_mem.c
31pluginlib_resize.c 34pluginlib_resize.c
32#ifndef HAVE_JPEG 35#ifndef HAVE_JPEG
diff --git a/apps/plugins/lib/pluginlib_jpeg_idct_arm.S b/apps/plugins/lib/pluginlib_jpeg_idct_arm.S
new file mode 100644
index 0000000000..5e6149d59f
--- /dev/null
+++ b/apps/plugins/lib/pluginlib_jpeg_idct_arm.S
@@ -0,0 +1,24 @@
1/***************************************************************************
2* __________ __ ___.
3* Open \______ \ ____ ____ | | _\_ |__ _______ ___
4* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7* \/ \/ \/ \/ \/
8* $Id$
9*
10* Copyright (C) 2009 by Andrew Mahone
11*
12* This is a wrapper for the core jpeg_idct_arm.S
13*
14* This program is free software; you can redistribute it and/or
15* modify it under the terms of the GNU General Public License
16* as published by the Free Software Foundation; either version 2
17* of the License, or (at your option) any later version.
18*
19* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
20* KIND, either express or implied.
21*
22****************************************************************************/
23
24#include "recorder/jpeg_idct_arm.S"
diff --git a/apps/recorder/jpeg_idct_arm.S b/apps/recorder/jpeg_idct_arm.S
new file mode 100644
index 0000000000..2ef868e753
--- /dev/null
+++ b/apps/recorder/jpeg_idct_arm.S
@@ -0,0 +1,287 @@
1/***************************************************************************
2* __________ __ ___.
3* Open \______ \ ____ ____ | | _\_ |__ _______ ___
4* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7* \/ \/ \/ \/ \/
8* $Id$
9*
10* JPEG assembly IDCT
11*
12* Copyright (C) 2009 Andrew Mahone
13* ARM asm versions of the C IDCT algorithms used with jpeg_load.c
14*
15* This program is free software; you can redistribute it and/or
16* modify it under the terms of the GNU General Public License
17* as published by the Free Software Foundation; either version 2
18* of the License, or (at your option) any later version.
19*
20* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
21* KIND, either express or implied.
22*
23****************************************************************************/
24#include "config.h"
25
26 .section .text
27 .align 2
28 .global jpeg_idct4v
29 .type jpeg_idct4v, %function
30 .global jpeg_idct4h
31 .type jpeg_idct4h, %function
32
33jpeg_idct4v: /* void jpeg_idct4v(int16_t *ws, int16_t *end): in-place vertical-pass 4-point IDCT; r0 = ws, r1 = end. Rows are 16 bytes (8 int16) apart; one column is processed per iteration (two on ARMv6) until r0 reaches r1. */
34#if ARM_ARCH < 5
35 stmdb sp!, { r4-r7, lr }
36 ldr r14, =-15137 /* multiplier applied to d3 */
37 ldr r12, =6270 /* multiplier applied to d1 */
381:
39 ldrsh r4, [r0, #32] /* r4 = d2 (row 2) */
40 ldrsh r2, [r0] /* r2 = d0 (row 0) */
41 ldrsh r5, [r0, #48] /* r5 = d3 (row 3) */
42 ldrsh r3, [r0, #16] /* r3 = d1 (row 1) */
43 add r6, r2, r4 /* r6 = tmp10 >> 2 = d0 + d2 */
44 sub r2, r2, r4 /* r2 = tmp12 >> 2 = d0 - d2 */
45 add r4, r3, r5 /* r4 = z1 = d1 + d3 */
46 add r7, r4, r4, lsl #3 /* r7 = z1 * 9 */
47 rsb r4, r4, r7, lsl #4 /* r4 = z1 * 143 */
48 rsb r4, r4, r4, lsl #5 /* z1 *= 4433 (143 * 31), done with shifts instead of a multiply */
49 add r4, r4, #1024 /* rounding for the asr #11 below */
50 mla r3, r12, r3, r4 /* r3 = tmp2 = z1 + z2 * 6270 */
51 mla r5, r14, r5, r4 /* r5 = tmp0 = z1 - z3 * 15137 */
52 mov r6, r6, lsl #2 /* r6 <<= 2 */
53 mov r2, r2, lsl #2 /* r2 <<= 2 */
54 add r7, r6, r3, asr #11 /* r7 = o0 */
55 sub r3, r6, r3, asr #11 /* r3 = o3 */
56 add r6, r2, r5, asr #11 /* r6 = o1 */
57 sub r2, r2, r5, asr #11 /* r2 = o2 */
58 strh r7, [r0]
59 strh r3, [r0, #48]
60 strh r6, [r0, #16]
61 strh r2, [r0, #32]
62 add r0, r0, #2 /* advance to the next column */
63 teq r0, r1
64 bne 1b
65 ldmia sp!, { r4-r7, pc }
66#elif ARM_ARCH < 6
67 stmdb sp!, { r4-r8, lr }
68 ldr r8, =1024 /* rounding for the asr #11 below */
69 ldr r14, =4433
70 ldr r12, =3302955134 /* 0xC4DF187E: top half = -15137, bottom half = 6270 */
711:
72 ldrsh r5, [r0, #48] /* r5 = d3 */
73 ldrsh r3, [r0, #16] /* r3 = d1 */
74 ldrsh r4, [r0, #32] /* r4 = d2 */
75 ldrsh r2, [r0] /* r2 = d0 */
76 add r6, r3, r5 /* r6 = z1 = d1 + d3 */
77 add r7, r2, r4 /* r7 = tmp10 >> 2 = d0 + d2 */
78 smlabb r6, r14, r6, r8 /* z1 = z1 * 4433 + 1024 */
79 sub r2, r2, r4 /* r2 = tmp12 >> 2 = d0 - d2 */
80 smlabb r3, r12, r3, r6 /* r3 = tmp2 = z1 + z2 * 6270 */
81 smlatb r5, r12, r5, r6 /* r5 = tmp0 = z1 - z3 * 15137 */
82 mov r7, r7, lsl #2
83 mov r2, r2, lsl #2
84 add r4, r7, r3, asr #11 /* r4 = o0 */
85 sub r7, r7, r3, asr #11 /* r7 = o3 */
86 add r3, r2, r5, asr #11 /* r3 = o1 */
87 sub r2, r2, r5, asr #11 /* r2 = o2 */
88 strh r4, [r0]
89 strh r7, [r0, #48]
90 strh r3, [r0, #16]
91 strh r2, [r0, #32]
92 add r0, r0, #2 /* advance to the next column */
93 teq r0, r1
94 bne 1b
95 ldmia sp!, { r4-r8, pc }
96#else
97 stmdb sp!, { r4-r10, lr }
98 ldr r2, =1024 /* rounding for the asr #11 below */
99 ldr r3, =4433
100 ldr r12, =3302955134 /* 0xC4DF187E: top half = -15137, bottom half = 6270 */
1011:
102 ldr r6, [r0, #32] /* each word load fetches the same row of two adjacent columns */
103 ldr r4, [r0]
104 ldr r7, [r0, #48]
105 ldr r5, [r0, #16]
106 /* this part is being done in parallel on two columns */
107 sadd16 r8, r4, r6 /* r8 = d0 + d2 */
108 ssub16 r4, r4, r6 /* r4 = d0 - d2 */
109 sadd16 r6, r5, r7 /* r6 = d1 + d3 */
110 /* there is no parallel shift operation, but we can fake it with bic
111 and lsl: clearing the top two bits of the low halfword keeps them from
 spilling into the high halfword when the whole word is shifted left by 2 */
112 bic r8, r8, #0xc000
113 bic r4, r4, #0xc000
114 /* multiplication expands values beyond 16 bits, so this part needs to be
115 split. the values will be merged below so that the rest of the addition
116 can be done in parallel */
117 smlabb r9, r3, r6, r2 /* r9 = z1[0] = (d1 + d3) * 4433 + 1024 */
118 smlabt r6, r3, r6, r2 /* r6 = z1[1] = (d1 + d3) * 4433 + 1024 */
119 smlabb r10, r12, r5, r9 /* r10 = tmp2[0] = z1 + d1 * 6270 */
120 smlatb r14, r12, r7, r9 /* r14 = tmp0[0] = z1 - d3 * 15137 */
121 smlabt r5, r12, r5, r6 /* r5 = tmp2[1] */
122 smlatt r6, r12, r7, r6 /* r6 = tmp0[1] */
123 mov r8, r8, lsl #2 /* complete the parallel shift started */
124 mov r4, r4, lsl #2 /* with the earlier bic instructions */
125 /* tmp2 are in r10, r5; tmp0 are in r14, r6 */
126 /* tmp10, tmp12 are in r8, r4 */
127 mov r10, r10, asr #11
128 mov r14, r14, asr #11
129 pkhbt r5, r10, r5, lsl #5 /* parallel tmp2: lsl #5 leaves (tmp2[1] >> 11) in the top halfword */
130 pkhbt r6, r14, r6, lsl #5 /* parallel tmp0 */
131 sadd16 r10, r8, r5 /* o0 */
132 ssub16 r5, r8, r5 /* o3 */
133 sadd16 r14, r4, r6 /* o1 */
134 ssub16 r6, r4, r6 /* o2 */
135 str r10, [r0]
136 str r5, [r0, #48]
137 str r14, [r0, #16]
138 str r6, [r0, #32]
139 add r0, r0, #4 /* advance by two columns */
140 cmp r0, r1
141 bcc 1b
142 ldmia sp!, { r4-r10, pc }
143#endif
144 .size jpeg_idct4v, .-jpeg_idct4v
145
146jpeg_idct4h: /* void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep): horizontal-pass 4-point IDCT; r0 = ws, r1 = out, r2 = end, r3 = rowstep. Each iteration reads four int16 coefficients from one 16-byte row, writes four range-limited output bytes, then advances ws by one row and out by rowstep until ws reaches end. */
147#if ARM_ARCH < 5
148 stmdb sp!, { r4-r10, lr }
149 ldr r10, =-15137 /* multiplier applied to d3 */
150 ldr r14, =4112 /* 4096 + 16: output offset and rounding, pre-added to DC */
151 ldr r12, =6270 /* multiplier applied to d1 */
1521:
153 ldrsh r4, [r0] /* r4 = d0 */
154 ldrsh r6, [r0, #4] /* r6 = d2 */
155 ldrsh r7, [r0, #6] /* r7 = d3 */
156 ldrsh r5, [r0, #2] /* r5 = d1 */
157 add r4, r4, r14
158 add r8, r4, r6 /* r8 = tmp10 >> 13 = d0 + d2 */
159 sub r4, r4, r6 /* r4 = tmp12 >> 13 = d0 - d2 */
160 add r6, r5, r7 /* r6 = z1 = d1 + d3 */
161 add r9, r6, r6, lsl #3 /* r9 = z1 * 9 */
162 rsb r6, r6, r9, lsl #4 /* r6 = z1 * 143 */
163 rsb r6, r6, r6, lsl #5 /* z1 *= 4433 (143 * 31) */
164 mla r7, r10, r7, r6 /* r7 = tmp0 = z1 - z3 * 15137 */
165 mla r5, r12, r5, r6 /* r5 = tmp2 = z1 + z2 * 6270 */
166 add r9, r5, r8, lsl #13 /* r9 = o0 */
167 rsb r5, r5, r8, lsl #13 /* r5 = o3 */
168 add r8, r7, r4, lsl #13 /* r8 = o1 */
169 rsb r4, r7, r4, lsl #13 /* r4 = o2 */
170 mov r9, r9, asr #18 /* descale results */
171 mov r8, r8, asr #18
172 mov r4, r4, asr #18
173 mov r5, r5, asr #18
174 cmp r9, #255 /* range limit: negative -> 0, above 255 -> low byte 255 */
175 mvnhi r9, r9, asr #31
176 cmp r8, #255
177 mvnhi r8, r8, asr #31
178 cmp r4, #255
179 mvnhi r4, r4, asr #31
180 cmp r5, #255
181 mvnhi r5, r5, asr #31
182#ifdef HAVE_LCD_COLOR
183 strb r9, [r1] /* colour build: output bytes are stored 4 bytes apart */
184 strb r8, [r1, #4]
185 strb r4, [r1, #8]
186 strb r5, [r1, #12]
187#else
188 strb r9, [r1]
189 strb r8, [r1, #1]
190 strb r4, [r1, #2]
191 strb r5, [r1, #3]
192#endif
193 add r0, r0, #16 /* next coefficient row */
194 add r1, r1, r3 /* next output row */
195 teq r0, r2
196 bne 1b
197 ldmia sp!, { r4-r10, pc }
198#elif ARM_ARCH < 6
199 stmdb sp!, { r4-r10, lr }
200 ldr r10, =4433
201 ldr r14, =4112 /* 4096 + 16: output offset and rounding, pre-added to DC */
202 ldr r12, =3302955134 /* 0xC4DF187E: top half = -15137, bottom half = 6270 */
2031:
204 ldrsh r7, [r0, #6] /* r7 = d3 */
205 ldrsh r5, [r0, #2] /* r5 = d1 */
206 ldrsh r4, [r0] /* r4 = d0 */
207 ldrsh r6, [r0, #4] /* r6 = d2 */
208 add r8, r5, r7 /* r8 = z1 = d1 + d3 */
209 add r4, r4, r14
210 smulbb r8, r10, r8 /* z1 *= 4433 */
211 add r9, r4, r6 /* r9 = tmp10 >> 13 = d0 + d2 */
212 smlabb r5, r12, r5, r8 /* r5 = tmp2 = z1 + z2 * 6270 */
213 smlatb r7, r12, r7, r8 /* r7 = tmp0 = z1 - z3 * 15137 */
214 sub r4, r4, r6 /* r4 = tmp12 >> 13 = d0 - d2 (must use r4: r5 already holds tmp2) */
215 add r6, r5, r9, lsl #13 /* r6 = o0 */
216 rsb r9, r5, r9, lsl #13 /* r9 = o3 */
217 add r5, r7, r4, lsl #13 /* r5 = o1 */
218 rsb r4, r7, r4, lsl #13 /* r4 = o2 */
219 mov r6, r6, asr #18 /* descale results */
220 mov r5, r5, asr #18
221 mov r4, r4, asr #18
222 mov r9, r9, asr #18
223 cmp r6, #255 /* range limit: negative -> 0, above 255 -> low byte 255 */
224 mvnhi r6, r6, asr #31
225 cmp r5, #255
226 mvnhi r5, r5, asr #31
227 cmp r4, #255
228 mvnhi r4, r4, asr #31
229 cmp r9, #255
230 mvnhi r9, r9, asr #31
231#ifdef HAVE_LCD_COLOR
232 strb r6, [r1] /* colour build: output bytes are stored 4 bytes apart */
233 strb r5, [r1, #4]
234 strb r4, [r1, #8]
235 strb r9, [r1, #12]
236#else
237 strb r6, [r1]
238 strb r5, [r1, #1]
239 strb r4, [r1, #2]
240 strb r9, [r1, #3]
241#endif
242 add r0, r0, #16 /* next coefficient row */
243 add r1, r1, r3 /* next output row */
244 teq r0, r2
245 bne 1b
246 ldmia sp!, { r4-r10, pc }
247#else
248 stmdb sp!, { r4-r9, lr }
249 ldr r9, =4433
250 ldr r14, =4112 /* 4096 + 16 in the low halfword only, so sadd16 adds it to d0 and 0 to d1 */
251 ldr r12, =3302955134 /* 0xC4DF187E: top half = -15137, bottom half = 6270 */
2521:
253 ldmia r0, { r4-r5 } /* r4lo = d0, r4hi = d1, r5lo = d2, r5hi = d3 */
254 sadd16 r4, r4, r14
255 sadd16 r6, r4, r5 /* r6lo = d0 + d2, r6hi = d1 + d3 */
256 ssub16 r7, r4, r5 /* r7lo = d0 - d2 */
257 smulbt r8, r9, r6 /* r8 = z1 = (d1 + d3) * 4433 */
258 sxth r6, r6 /* r6 = tmp10 >> 13 = d0 + d2 */
259 smlabt r4, r12, r4, r8 /* r4 = tmp2 = z1 + z2 * 6270 */
260 smlatt r5, r12, r5, r8 /* r5 = tmp0 = z1 - z3 * 15137 */
261 sxth r7, r7 /* r7 = tmp12 >> 13 = d0 - d2 */
262 add r8, r4, r6, lsl #13 /* r8 = o0 */
263 rsb r6, r4, r6, lsl #13 /* r6 = o3 */
264 add r4, r5, r7, lsl #13 /* r4 = o1 */
265 rsb r5, r5, r7, lsl #13 /* r5 = o2 */
266 usat r8, #8, r8, asr #18 /* descale and saturate to 0..255 in one step */
267 usat r6, #8, r6, asr #18
268 usat r4, #8, r4, asr #18
269 usat r5, #8, r5, asr #18
270#ifdef HAVE_LCD_COLOR
271 strb r8, [r1] /* colour build: output bytes are stored 4 bytes apart */
272 strb r6, [r1, #12]
273 strb r4, [r1, #4]
274 strb r5, [r1, #8]
275#else
276 strb r8, [r1]
277 strb r6, [r1, #3]
278 strb r4, [r1, #1]
279 strb r5, [r1, #2]
280#endif
281 add r0, r0, #16 /* next coefficient row */
282 add r1, r1, r3 /* next output row */
283 teq r0, r2
284 bne 1b
285 ldmia sp!, { r4-r9, pc }
286#endif
287 .size jpeg_idct4h, .-jpeg_idct4h
diff --git a/apps/recorder/jpeg_load.c b/apps/recorder/jpeg_load.c
index dc8bb33862..f2b3b4ba74 100644
--- a/apps/recorder/jpeg_load.c
+++ b/apps/recorder/jpeg_load.c
@@ -31,6 +31,7 @@
31#include "debug.h" 31#include "debug.h"
32#include "jpeg_load.h" 32#include "jpeg_load.h"
33/*#define JPEG_BS_DEBUG*/ 33/*#define JPEG_BS_DEBUG*/
34#define ROCKBOX_DEBUG_JPEG
34/* for portability of below JPEG code */ 35/* for portability of below JPEG code */
35#define MEMSET(p,v,c) memset(p,v,c) 36#define MEMSET(p,v,c) memset(p,v,c)
36#define MEMCPY(d,s,c) memcpy(d,s,c) 37#define MEMCPY(d,s,c) memcpy(d,s,c)
@@ -49,7 +50,23 @@ typedef struct uint8_rgb jpeg_pix_t;
49#else 50#else
50typedef uint8_t jpeg_pix_t; 51typedef uint8_t jpeg_pix_t;
51#endif 52#endif
53#define JPEG_IDCT_TRANSPOSE
52#define JPEG_PIX_SZ (sizeof(jpeg_pix_t)) 54#define JPEG_PIX_SZ (sizeof(jpeg_pix_t))
55#ifdef HAVE_LCD_COLOR
56#define COLOR_EXTRA_IDCT_WS 64
57#else
58#define COLOR_EXTRA_IDCT_WS 0
59#endif
60#ifdef JPEG_IDCT_TRANSPOSE
61#define V_OUT(n) ws2[8*n]
62#define V_IN_ST 1
63#define TRANSPOSE_EXTRA_IDCT_WS 64
64#else
65#define V_OUT(n) ws[8*n]
66#define V_IN_ST 8
67#define TRANSPOSE_EXTRA_IDCT_WS 0
68#endif
69#define IDCT_WS_SIZE (64 + TRANSPOSE_EXTRA_IDCT_WS + COLOR_EXTRA_IDCT_WS)
53 70
54/* This can't be in jpeg_load.h because plugin.h includes it, and it conflicts 71/* This can't be in jpeg_load.h because plugin.h includes it, and it conflicts
55 * with the definition in jpeg_decoder.h 72 * with the definition in jpeg_decoder.h
@@ -259,7 +276,7 @@ INLINE unsigned range_limit(int value)
259*/ 276*/
260 277
261/* horizontal-pass 1-point IDCT */ 278/* horizontal-pass 1-point IDCT */
262static void idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep) 279static void jpeg_idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
263{ 280{
264 for (; ws < end; ws += 8) 281 for (; ws < end; ws += 8)
265 { 282 {
@@ -269,19 +286,19 @@ static void idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
269} 286}
270 287
271/* vertical-pass 2-point IDCT */ 288/* vertical-pass 2-point IDCT */
272static void idct2v(int16_t *ws, int16_t *end) 289static void jpeg_idct2v(int16_t *ws, int16_t *end)
273{ 290{
274 for (; ws < end; ws++) 291 for (; ws < end; ws++)
275 { 292 {
276 int tmp1 = ws[0]; 293 int tmp1 = ws[0*8];
277 int tmp2 = ws[8]; 294 int tmp2 = ws[1*8];
278 ws[0] = tmp1 + tmp2; 295 ws[0*8] = tmp1 + tmp2;
279 ws[8] = tmp1 - tmp2; 296 ws[1*8] = tmp1 - tmp2;
280 } 297 }
281} 298}
282 299
283/* horizontal-pass 2-point IDCT */ 300/* horizontal-pass 2-point IDCT */
284static void idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep) 301static void jpeg_idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
285{ 302{
286 for (; ws < end; ws += 8, out += rowstep) 303 for (; ws < end; ws += 8, out += rowstep)
287 { 304 {
@@ -295,69 +312,12 @@ static void idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
295 } 312 }
296} 313}
297 314
315#ifndef CPU_ARM
298/* vertical-pass 4-point IDCT */ 316/* vertical-pass 4-point IDCT */
299static void idct4v(int16_t *ws, int16_t *end) 317static void jpeg_idct4v(int16_t *ws, int16_t *end)
300{ 318{
301 for (; ws < end; ws++) 319 for (; ws < end; ws++)
302 { 320 {
303#if defined(CPU_ARM)
304 int t0, t1, t2, t3, t4;
305#if ARM_ARCH <= 4
306 int t5;
307#endif
308 asm volatile(
309 "ldrsh %[t4], [%[ws]]\n\t" /* t4 = tmp0 (ws[8*0]) */
310 "ldrsh %[t1], [%[ws], #32]\n\t" /* t1 = tmp2 (ws[8*2]) */
311 "ldrsh %[t2], [%[ws], #16]\n\t" /* t2 = z2 (ws[8*1]) */
312 "add %[t0], %[t4], %[t1]\n\t" /* t0 = tmp10 >> 2
313 (tmp0 + tmp2) */
314 "sub %[t1], %[t4], %[t1]\n\t" /* t1 = tmp12 >> 2
315 (tmp0 - tmp2) */
316 "ldrsh %[t3], [%[ws], #48]\n\t" /* t3 = z3 (ws[8*3] */
317 "add %[t4], %[t2], %[t3]\n\t" /* t4 = z2 + z3 */
318#if ARM_ARCH > 4
319 "smulbb %[t4], %[c1], %[t4]\n\t"
320 "add %[t4], %[t4], #1024\n\t" /* t4 = z1 */
321 "smlatb %[t3], %[c2c3], %[t3], %[t4]\n\t"
322 "smlabb %[t2], %[c2c3], %[t2], %[t4]\n\t"
323 "mov %[t3], %[t3], asr #11\n\t" /* t3 = tmp0 */
324 "mov %[t2], %[t2], asr #11\n\t" /* t2 = tmp2 */
325#else
326 "add %[t5], %[t4], %[t4], lsl #3\n\t"
327 "rsb %[t4], %[t4], %[t5], lsl #4\n\t"
328 "rsb %[t4], %[t4], %[t4], lsl #5\n\t"
329 "add %[t4], %[t4], #1024\n\t" /*z1*/
330 "mla %[t3], %[c2], %[t3], %[t4]\n\t"
331 "mla %[t2], %[c3], %[t2], %[t4]\n\t"
332 "mov %[t3], %[t3], asr #11\n\t" /* t3 = tmp0 */
333 "mov %[t2], %[t2], asr #11\n\t" /* t2 = tmp2 */
334#endif
335 "add %[t4], %[t2], %[t0], lsl #2\n\t" /* t4 = tmp10 + tmp2 */
336 "rsb %[t0], %[t2], %[t0], lsl #2\n\t" /* t0 = tmp10 - tmp2 */
337 "add %[t2], %[t3], %[t1], lsl #2\n\t" /* t2 = tmp12 + tmp0 */
338 "rsb %[t3], %[t3], %[t1], lsl #2\n\t" /* t3 = tmp12 - tmp0 */
339 "strh %[t4], [%[ws]]\n\t"
340 "strh %[t0], [%[ws], #48]\n\t"
341 "strh %[t2], [%[ws], #16]\n\t"
342 "strh %[t3], [%[ws], #32]\n\t"
343 : [t0] "=&r" (t0),
344 [t1] "=&r" (t1),
345 [t2] "=&r" (t2),
346 [t3] "=&r" (t3),
347 [t4] "=&r" (t4)
348#if ARM_ARCH <= 4
349 ,[t5] "=&r" (t5)
350#endif
351 : [ws] "r" (ws),
352#if ARM_ARCH > 4
353 [c1] "r" (FIX_0_541196100),
354 [c2c3] "r" (((-FIX_1_847759065)<<16)|FIX_0_765366865)
355#else
356 [c2] "r" (-FIX_1_847759065),
357 [c3] "r" (FIX_0_765366865)
358#endif
359 );
360#else
361 int tmp0, tmp2, tmp10, tmp12; 321 int tmp0, tmp2, tmp10, tmp12;
362 int z1, z2, z3; 322 int z1, z2, z3;
363 /* Even part */ 323 /* Even part */
@@ -382,93 +342,18 @@ static void idct4v(int16_t *ws, int16_t *end)
382 CONST_BITS-PASS1_BITS); 342 CONST_BITS-PASS1_BITS);
383 343
384 /* Final output stage */ 344 /* Final output stage */
385
386 ws[8*0] = (int) (tmp10 + tmp2); 345 ws[8*0] = (int) (tmp10 + tmp2);
387 ws[8*3] = (int) (tmp10 - tmp2); 346 ws[8*3] = (int) (tmp10 - tmp2);
388 ws[8*1] = (int) (tmp12 + tmp0); 347 ws[8*1] = (int) (tmp12 + tmp0);
389 ws[8*2] = (int) (tmp12 - tmp0); 348 ws[8*2] = (int) (tmp12 - tmp0);
390#endif
391 } 349 }
392} 350}
393 351
394/* horizontal-pass 4-point IDCT */ 352/* horizontal-pass 4-point IDCT */
395static void idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep) 353static void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
396{ 354{
397 for (; ws < end; out += rowstep, ws += 8) 355 for (; ws < end; out += rowstep, ws += 8)
398 { 356 {
399#if defined(CPU_ARM)
400 int t0, t1, t2, t3, t4;
401#if ARM_ARCH <= 4
402 int t5;
403#endif
404 asm volatile(
405 "ldrsh %[t4], [%[ws]]\n\t" /* t4 = tmp0 (ws[0]) */
406 "ldrsh %[t1], [%[ws], #4]\n\t" /* t1 = tmp2 (ws[2]) */
407 "add %[t4], %[t4], #16\n\t" /* add rounding to DC */
408 "add %[t4], %[t4], #4096\n\t" /* pre-add offset */
409 "ldrsh %[t2], [%[ws], #2]\n\t" /* t2 = z2 (ws[1]) */
410 "add %[t0], %[t4], %[t1]\n\t" /* t0 = tmp10 >> 13
411 (tmp0 + tmp2) */
412 "sub %[t1], %[t4], %[t1]\n\t" /* t1 = tmp12 >> 13
413 (tmp0 - tmp2) */
414 "ldrsh %[t3], [%[ws], #6]\n\t" /* t3 = z3 (ws[3] */
415 "add %[t4], %[t2], %[t3]\n\t" /* t4 = z2 + z3 */
416#if ARM_ARCH > 4
417 "smulbb %[t4], %[c1], %[t4]\n\t"
418 "smlatb %[t3], %[c2c3], %[t3], %[t4]\n\t"
419 "smlabb %[t2], %[c2c3], %[t2], %[t4]\n\t"
420#else
421 "add %[t5], %[t4], %[t4], lsl #3\n\t"
422 "rsb %[t4], %[t4], %[t5], lsl #4\n\t"
423 "rsb %[t4], %[t4], %[t4], lsl #5\n\t" /* t4 = z1 */
424 "mla %[t3], %[c2], %[t3], %[t4]\n\t"
425 "mla %[t2], %[c3], %[t2], %[t4]\n\t"
426#endif
427 "add %[t4], %[t2], %[t0], lsl #13\n\t" /* t4 = tmp10 + tmp2 */
428 "rsb %[t0], %[t2], %[t0], lsl #13\n\t" /* t0 = tmp10 - tmp2 */
429 "add %[t2], %[t3], %[t1], lsl #13\n\t" /* t2 = tmp12 + tmp0 */
430 "rsb %[t3], %[t3], %[t1], lsl #13\n\t" /* t3 = tmp12 - tmp0 */
431 "mov %[t4], %[t4], asr #18\n\t" /* descale results */
432 "mov %[t0], %[t0], asr #18\n\t"
433 "mov %[t2], %[t2], asr #18\n\t"
434 "mov %[t3], %[t3], asr #18\n\t"
435 "cmp %[t4], #255\n\t" /* range limit results */
436 "mvnhi %[t4], %[t4], asr #31\n\t"
437 "cmp %[t0], #255\n\t"
438 "mvnhi %[t0], %[t0], asr #31\n\t"
439 "cmp %[t2], #255\n\t"
440 "mvnhi %[t2], %[t2], asr #31\n\t"
441 "cmp %[t3], #255\n\t"
442 "mvnhi %[t3], %[t3], asr #31\n\t"
443 "cmp %[t4], #255\n\t"
444 "mvnhi %[t4], %[t4], asr #31\n\t"
445 "strb %[t4], [%[out]]\n\t"
446 "strb %[t0], [%[out], %[o3]]\n\t"
447 "strb %[t2], [%[out], %[o1]]\n\t"
448 "strb %[t3], [%[out], %[o2]]\n\t"
449 : [t0] "=&r" (t0),
450 [t1] "=&r" (t1),
451 [t2] "=&r" (t2),
452 [t3] "=&r" (t3),
453 [t4] "=&r" (t4)
454#if ARM_ARCH <= 4
455
456 ,[t5] "=&r" (t5)
457#endif
458 : [ws] "r" (ws),
459 [out] "r" (out),
460 [o1] "i" (JPEG_PIX_SZ),
461 [o2] "i" (JPEG_PIX_SZ*2),
462 [o3] "i" (JPEG_PIX_SZ*3),
463#if ARM_ARCH > 4
464 [c1] "r" (FIX_0_541196100),
465 [c2c3] "r" (((-FIX_1_847759065)<<16)|FIX_0_765366865)
466#else
467 [c2] "r" (-FIX_1_847759065),
468 [c3] "r" (FIX_0_765366865)
469#endif
470 );
471#else
472 int tmp0, tmp2, tmp10, tmp12; 357 int tmp0, tmp2, tmp10, tmp12;
473 int z1, z2, z3; 358 int z1, z2, z3;
474 /* Even part */ 359 /* Even part */
@@ -500,18 +385,27 @@ static void idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
500 DS_OUT)); 385 DS_OUT));
501 out[JPEG_PIX_SZ*2] = range_limit((int) RIGHT_SHIFT(tmp12 - tmp0, 386 out[JPEG_PIX_SZ*2] = range_limit((int) RIGHT_SHIFT(tmp12 - tmp0,
502 DS_OUT)); 387 DS_OUT));
503#endif
504 } 388 }
505} 389}
390#else
391extern void jpeg_idct4v(int16_t *ws, int16_t *end);
392extern void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
393#endif
506 394
507/* vertical-pass 8-point IDCT */ 395/* vertical-pass 8-point IDCT */
508static void idct8v(int16_t *ws, int16_t *end) 396static void jpeg_idct8v(int16_t *ws, int16_t *end)
509{ 397{
510 long tmp0, tmp1, tmp2, tmp3; 398 long tmp0, tmp1, tmp2, tmp3;
511 long tmp10, tmp11, tmp12, tmp13; 399 long tmp10, tmp11, tmp12, tmp13;
512 long z1, z2, z3, z4, z5; 400 long z1, z2, z3, z4, z5;
401#ifdef JPEG_IDCT_TRANSPOSE
402 int16_t *ws2 = ws + 64;
403 for (; ws < end; ws += 8, ws2++)
404 {
405#else
513 for (; ws < end; ws++) 406 for (; ws < end; ws++)
514 { 407 {
408#endif
515 /* Due to quantization, we will usually find that many of the input 409 /* Due to quantization, we will usually find that many of the input
516 * coefficients are zero, especially the AC terms. We can exploit this 410 * coefficients are zero, especially the AC terms. We can exploit this
517 * by short-circuiting the IDCT calculation for any column in which all 411 * by short-circuiting the IDCT calculation for any column in which all
@@ -520,30 +414,30 @@ static void idct8v(int16_t *ws, int16_t *end)
520 * With typical images and quantization tables, half or more of the 414 * With typical images and quantization tables, half or more of the
521 * column DCT calculations can be simplified this way. 415 * column DCT calculations can be simplified this way.
522 */ 416 */
523 if ((ws[8*1] | ws[8*2] | ws[8*3] 417 if ((ws[V_IN_ST*1] | ws[V_IN_ST*2] | ws[V_IN_ST*3]
524 | ws[8*4] | ws[8*5] | ws[8*6] | ws[8*7]) == 0) 418 | ws[V_IN_ST*4] | ws[V_IN_ST*5] | ws[V_IN_ST*6] | ws[V_IN_ST*7]) == 0)
525 { 419 {
526 /* AC terms all zero */ 420 /* AC terms all zero */
527 int dcval = ws[8*0] << PASS1_BITS; 421 int dcval = ws[V_IN_ST*0] << PASS1_BITS;
528 422
529 ws[8*0] = ws[8*1] = ws[8*2] = ws[8*3] = ws[8*4] 423 V_OUT(0) = V_OUT(1) = V_OUT(2) = V_OUT(3) = V_OUT(4) = V_OUT(5) =
530 = ws[8*5] = ws[8*6] = ws[8*7] = dcval; 424 V_OUT(6) = V_OUT(7) = dcval;
531 continue; 425 continue;
532 } 426 }
533 427
534 /* Even part: reverse the even part of the forward DCT. */ 428 /* Even part: reverse the even part of the forward DCT. */
535 /* The rotator is sqrt(2)*c(-6). */ 429 /* The rotator is sqrt(2)*c(-6). */
536 430
537 z2 = ws[8*2]; 431 z2 = ws[V_IN_ST*2];
538 z3 = ws[8*6]; 432 z3 = ws[V_IN_ST*6];
539 433
540 z1 = MULTIPLY16(z2 + z3, FIX_0_541196100); 434 z1 = MULTIPLY16(z2 + z3, FIX_0_541196100);
541 tmp2 = z1 + MULTIPLY16(z3, - FIX_1_847759065); 435 tmp2 = z1 + MULTIPLY16(z3, - FIX_1_847759065);
542 tmp3 = z1 + MULTIPLY16(z2, FIX_0_765366865); 436 tmp3 = z1 + MULTIPLY16(z2, FIX_0_765366865);
543 437
544 z2 = ws[8*0] << CONST_BITS; 438 z2 = ws[V_IN_ST*0] << CONST_BITS;
545 z2 += ONE << (CONST_BITS - PASS1_BITS - 1); 439 z2 += ONE << (CONST_BITS - PASS1_BITS - 1);
546 z3 = ws[8*4] << CONST_BITS; 440 z3 = ws[V_IN_ST*4] << CONST_BITS;
547 441
548 tmp0 = (z2 + z3); 442 tmp0 = (z2 + z3);
549 tmp1 = (z2 - z3); 443 tmp1 = (z2 - z3);
@@ -556,10 +450,10 @@ static void idct8v(int16_t *ws, int16_t *end)
556 /* Odd part per figure 8; the matrix is unitary and hence its 450 /* Odd part per figure 8; the matrix is unitary and hence its
557 transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. */ 451 transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. */
558 452
559 tmp0 = ws[8*7]; 453 tmp0 = ws[V_IN_ST*7];
560 tmp1 = ws[8*5]; 454 tmp1 = ws[V_IN_ST*5];
561 tmp2 = ws[8*3]; 455 tmp2 = ws[V_IN_ST*3];
562 tmp3 = ws[8*1]; 456 tmp3 = ws[V_IN_ST*1];
563 457
564 z1 = tmp0 + tmp3; 458 z1 = tmp0 + tmp3;
565 z2 = tmp1 + tmp2; 459 z2 = tmp1 + tmp2;
@@ -586,19 +480,19 @@ static void idct8v(int16_t *ws, int16_t *end)
586 480
587 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ 481 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
588 482
589 ws[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS); 483 V_OUT(0) = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
590 ws[8*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS); 484 V_OUT(7) = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
591 ws[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS); 485 V_OUT(1) = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
592 ws[8*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS); 486 V_OUT(6) = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
593 ws[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS); 487 V_OUT(2) = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
594 ws[8*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS); 488 V_OUT(5) = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
595 ws[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS); 489 V_OUT(3) = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
596 ws[8*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS); 490 V_OUT(4) = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
597 } 491 }
598} 492}
599 493
600/* horizontal-pass 8-point IDCT */ 494/* horizontal-pass 8-point IDCT */
601static void idct8h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep) 495static void jpeg_idct8h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
602{ 496{
603 long tmp0, tmp1, tmp2, tmp3; 497 long tmp0, tmp1, tmp2, tmp3;
604 long tmp10, tmp11, tmp12, tmp13; 498 long tmp10, tmp11, tmp12, tmp13;
@@ -709,20 +603,26 @@ static void idct8h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
709 603
710#ifdef HAVE_LCD_COLOR 604#ifdef HAVE_LCD_COLOR
711/* vertical-pass 16-point IDCT */ 605/* vertical-pass 16-point IDCT */
712static void idct16v(int16_t *ws, int16_t *end) 606static void jpeg_idct16v(int16_t *ws, int16_t *end)
713{ 607{
714 long tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13; 608 long tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
715 long tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27; 609 long tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
716 long z1, z2, z3, z4; 610 long z1, z2, z3, z4;
611#ifdef JPEG_IDCT_TRANSPOSE
612 int16_t *ws2 = ws + 64;
613 for (; ws < end; ws += 8, ws2++)
614 {
615#else
717 for (; ws < end; ws++) 616 for (; ws < end; ws++)
718 { 617 {
618#endif
719 /* Even part */ 619 /* Even part */
720 620
721 tmp0 = ws[8*0] << CONST_BITS; 621 tmp0 = ws[V_IN_ST*0] << CONST_BITS;
722 /* Add fudge factor here for final descale. */ 622 /* Add fudge factor here for final descale. */
723 tmp0 += 1 << (CONST_BITS-PASS1_BITS-1); 623 tmp0 += 1 << (CONST_BITS-PASS1_BITS-1);
724 624
725 z1 = ws[8*4]; 625 z1 = ws[V_IN_ST*4];
726 tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */ 626 tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */
727 tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */ 627 tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */
728 628
@@ -731,8 +631,8 @@ static void idct16v(int16_t *ws, int16_t *end)
731 tmp12 = tmp0 + tmp2; 631 tmp12 = tmp0 + tmp2;
732 tmp13 = tmp0 - tmp2; 632 tmp13 = tmp0 - tmp2;
733 633
734 z1 = ws[8*2]; 634 z1 = ws[V_IN_ST*2];
735 z2 = ws[8*6]; 635 z2 = ws[V_IN_ST*6];
736 z3 = z1 - z2; 636 z3 = z1 - z2;
737 z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */ 637 z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */
738 z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */ 638 z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */
@@ -757,10 +657,10 @@ static void idct16v(int16_t *ws, int16_t *end)
757 657
758 /* Odd part */ 658 /* Odd part */
759 659
760 z1 = ws[8*1]; 660 z1 = ws[V_IN_ST*1];
761 z2 = ws[8*3]; 661 z2 = ws[V_IN_ST*3];
762 z3 = ws[8*5]; 662 z3 = ws[V_IN_ST*5];
763 z4 = ws[8*7]; 663 z4 = ws[V_IN_ST*7];
764 664
765 tmp11 = z1 + z3; 665 tmp11 = z1 + z3;
766 666
@@ -795,27 +695,27 @@ static void idct16v(int16_t *ws, int16_t *end)
795 tmp11 += z2; 695 tmp11 += z2;
796 696
797 /* Final output stage */ 697 /* Final output stage */
798 ws[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp0, CONST_BITS-PASS1_BITS); 698 V_OUT(0) = (int) RIGHT_SHIFT(tmp20 + tmp0, CONST_BITS-PASS1_BITS);
799 ws[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0, CONST_BITS-PASS1_BITS); 699 V_OUT(15) = (int) RIGHT_SHIFT(tmp20 - tmp0, CONST_BITS-PASS1_BITS);
800 ws[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp1, CONST_BITS-PASS1_BITS); 700 V_OUT(1) = (int) RIGHT_SHIFT(tmp21 + tmp1, CONST_BITS-PASS1_BITS);
801 ws[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1, CONST_BITS-PASS1_BITS); 701 V_OUT(14) = (int) RIGHT_SHIFT(tmp21 - tmp1, CONST_BITS-PASS1_BITS);
802 ws[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp2, CONST_BITS-PASS1_BITS); 702 V_OUT(2) = (int) RIGHT_SHIFT(tmp22 + tmp2, CONST_BITS-PASS1_BITS);
803 ws[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2, CONST_BITS-PASS1_BITS); 703 V_OUT(13) = (int) RIGHT_SHIFT(tmp22 - tmp2, CONST_BITS-PASS1_BITS);
804 ws[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp3, CONST_BITS-PASS1_BITS); 704 V_OUT(3) = (int) RIGHT_SHIFT(tmp23 + tmp3, CONST_BITS-PASS1_BITS);
805 ws[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3, CONST_BITS-PASS1_BITS); 705 V_OUT(12) = (int) RIGHT_SHIFT(tmp23 - tmp3, CONST_BITS-PASS1_BITS);
806 ws[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS); 706 V_OUT(4) = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
807 ws[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS); 707 V_OUT(11) = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
808 ws[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS); 708 V_OUT(5) = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
809 ws[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS); 709 V_OUT(10) = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
810 ws[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS); 710 V_OUT(6) = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
811 ws[8*9] = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS); 711 V_OUT(9) = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
812 ws[8*7] = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS); 712 V_OUT(7) = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
813 ws[8*8] = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS); 713 V_OUT(8) = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
814 } 714 }
815} 715}
816 716
817/* horizontal-pass 16-point IDCT */ 717/* horizontal-pass 16-point IDCT */
818static void idct16h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep) 718static void jpeg_idct16h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
819{ 719{
820 long tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13; 720 long tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
821 long tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27; 721 long tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
@@ -946,12 +846,12 @@ struct idct_entry {
946}; 846};
947 847
948struct idct_entry idct_tbl[] = { 848struct idct_entry idct_tbl[] = {
949 { PASS1_BITS, NULL, idct1h }, 849 { PASS1_BITS, NULL, jpeg_idct1h },
950 { PASS1_BITS, idct2v, idct2h }, 850 { PASS1_BITS, jpeg_idct2v, jpeg_idct2h },
951 { 0, idct4v, idct4h }, 851 { 0, jpeg_idct4v, jpeg_idct4h },
952 { 0, idct8v, idct8h }, 852 { 0, jpeg_idct8v, jpeg_idct8h },
953#ifdef HAVE_LCD_COLOR 853#ifdef HAVE_LCD_COLOR
954 { 0, idct16v, idct16h }, 854 { 0, jpeg_idct16v, jpeg_idct16h },
955#endif 855#endif
956}; 856};
957 857
@@ -1468,21 +1368,27 @@ static void fix_huff_tbl(int* htbl, struct derived_tbl* dtbl)
1468} 1368}
1469 1369
1470 1370
1471/* zag[i] is the natural-order position of the i'th element of zigzag order. 1371/* zag[i] is the natural-order position of the i'th element of zigzag order. */
1472 * If the incoming data is corrupted, decode_mcu could attempt to
1473 * reference values beyond the end of the array. To avoid a wild store,
1474 * we put some extra zeroes after the real entries.
1475 */
1476static const unsigned char zag[] = 1372static const unsigned char zag[] =
1477{ 1373{
1478 0, 1, 8, 16, 9, 2, 3, 10, 1374#ifdef JPEG_IDCT_TRANSPOSE
1479 17, 24, 32, 25, 18, 11, 4, 5, 1375 0, 8, 1, 2, 9, 16, 24, 17,
1480 12, 19, 26, 33, 40, 48, 41, 34, 1376 10, 3, 4, 11, 18, 25, 32, 40,
1481 27, 20, 13, 6, 7, 14, 21, 28, 1377 33, 26, 19, 12, 5, 6, 13, 20,
1482 35, 42, 49, 56, 57, 50, 43, 36, 1378 27, 34, 41, 48, 56, 49, 42, 35,
1483 29, 22, 15, 23, 30, 37, 44, 51, 1379 28, 21, 14, 7, 15, 22, 29, 36,
1484 58, 59, 52, 45, 38, 31, 39, 46, 1380 43, 50, 57, 58, 51, 44, 37, 30,
1485 53, 60, 61, 54, 47, 55, 62, 63, 1381 23, 31, 38, 45, 52, 59, 60, 53,
1382 46, 39, 47, 54, 61, 62, 55, 63,
1383#endif
1384 0, 1, 8, 16, 9, 2, 3, 10,
1385 17, 24, 32, 25, 18, 11, 4, 5,
1386 12, 19, 26, 33, 40, 48, 41, 34,
1387 27, 20, 13, 6, 7, 14, 21, 28,
1388 35, 42, 49, 56, 57, 50, 43, 36,
1389 29, 22, 15, 23, 30, 37, 44, 51,
1390 58, 59, 52, 45, 38, 31, 39, 46,
1391 53, 60, 61, 54, 47, 55, 62, 63,
1486}; 1392};
1487 1393
1488/* zig[i] is the the zig-zag order position of the i'th element of natural 1394/* zig[i] is the the zig-zag order position of the i'th element of natural
@@ -1898,17 +1804,20 @@ static struct img_part *store_row_jpeg(void *jpeg_args)
1898 store_offs[p_jpeg->store_pos[1]] = JPEG_PIX_SZ << p_jpeg->h_scale[0]; 1804 store_offs[p_jpeg->store_pos[1]] = JPEG_PIX_SZ << p_jpeg->h_scale[0];
1899 store_offs[p_jpeg->store_pos[2]] = b_width << p_jpeg->v_scale[0]; 1805 store_offs[p_jpeg->store_pos[2]] = b_width << p_jpeg->v_scale[0];
1900 store_offs[p_jpeg->store_pos[3]] = store_offs[1] + store_offs[2]; 1806 store_offs[p_jpeg->store_pos[3]] = store_offs[1] + store_offs[2];
1901 1807 /* decoded DCT coefficients */
1902 int16_t block[128]; /* decoded DCT coefficients */ 1808 int16_t block[IDCT_WS_SIZE] __attribute__((aligned(8)));
1903 for (x = 0; x < p_jpeg->x_mbl; x++) 1809 for (x = 0; x < p_jpeg->x_mbl; x++)
1904 { 1810 {
1905 int blkn; 1811 int blkn;
1906 for (blkn = 0; blkn < p_jpeg->blocks; blkn++) 1812 for (blkn = 0; blkn < p_jpeg->blocks; blkn++)
1907 { 1813 {
1908 int k = 1; /* coefficient index */
1909 int s, r; /* huffman values */
1910 int ci = p_jpeg->mcu_membership[blkn]; /* component index */ 1814 int ci = p_jpeg->mcu_membership[blkn]; /* component index */
1911 int ti = p_jpeg->tab_membership[blkn]; /* table index */ 1815 int ti = p_jpeg->tab_membership[blkn]; /* table index */
1816#ifdef JPEG_IDCT_TRANSPOSE
1817 bool transpose = p_jpeg->v_scale[!!ci] > 2;
1818#endif
1819 int k = 1; /* coefficient index */
1820 int s, r; /* huffman values */
1912 struct derived_tbl* dctbl = &p_jpeg->dc_derived_tbls[ti]; 1821 struct derived_tbl* dctbl = &p_jpeg->dc_derived_tbls[ti];
1913 struct derived_tbl* actbl = &p_jpeg->ac_derived_tbls[ti]; 1822 struct derived_tbl* actbl = &p_jpeg->ac_derived_tbls[ti];
1914 1823
@@ -1948,7 +1857,11 @@ static struct img_part *store_row_jpeg(void *jpeg_args)
1948 r = get_bits(p_jpeg, s); 1857 r = get_bits(p_jpeg, s);
1949 r = HUFF_EXTEND(r, s); 1858 r = HUFF_EXTEND(r, s);
1950 r = MULTIPLY16(r, p_jpeg->quanttable[!!ci][k]); 1859 r = MULTIPLY16(r, p_jpeg->quanttable[!!ci][k]);
1860#ifdef JPEG_IDCT_TRANSPOSE
1861 block[zag[transpose ? k : k + 64]] = r ;
1862#else
1951 block[zag[k]] = r ; 1863 block[zag[k]] = r ;
1864#endif
1952 } 1865 }
1953 else 1866 else
1954 { 1867 {
@@ -1988,10 +1901,19 @@ block_end:
1988 int idct_rows = BIT_N(p_jpeg->v_scale[!!ci]); 1901 int idct_rows = BIT_N(p_jpeg->v_scale[!!ci]);
1989 unsigned char *b_out = out + (ci ? ci : store_offs[blkn]); 1902 unsigned char *b_out = out + (ci ? ci : store_offs[blkn]);
1990 if (idct_tbl[p_jpeg->v_scale[!!ci]].v_idct) 1903 if (idct_tbl[p_jpeg->v_scale[!!ci]].v_idct)
1904#ifdef JPEG_IDCT_TRANSPOSE
1905 idct_tbl[p_jpeg->v_scale[!!ci]].v_idct(block,
1906 transpose ? block + 8 * idct_cols
1907 : block + idct_cols);
1908 uint16_t * h_block = transpose ? block + 64 : block;
1909 idct_tbl[p_jpeg->h_scale[!!ci]].h_idct(h_block, b_out,
1910 h_block + idct_rows * 8, b_width);
1911#else
1991 idct_tbl[p_jpeg->v_scale[!!ci]].v_idct(block, 1912 idct_tbl[p_jpeg->v_scale[!!ci]].v_idct(block,
1992 block + idct_cols); 1913 block + idct_cols);
1993 idct_tbl[p_jpeg->h_scale[!!ci]].h_idct(block, b_out, 1914 idct_tbl[p_jpeg->h_scale[!!ci]].h_idct(block, b_out,
1994 block + idct_rows * 8, b_width); 1915 block + idct_rows * 8, b_width);
1916#endif
1995 } 1917 }
1996 } /* for blkn */ 1918 } /* for blkn */
1997 /* don't starve other threads while an MCU row decodes */ 1919 /* don't starve other threads while an MCU row decodes */
@@ -2048,7 +1970,6 @@ int read_jpeg_file(const char* filename,
2048{ 1970{
2049 int fd, ret; 1971 int fd, ret;
2050 fd = open(filename, O_RDONLY); 1972 fd = open(filename, O_RDONLY);
2051
2052 JDEBUGF("read_jpeg_file: filename: %s buffer len: %d cformat: %p\n", 1973 JDEBUGF("read_jpeg_file: filename: %s buffer len: %d cformat: %p\n",
2053 filename, maxsize, cformat); 1974 filename, maxsize, cformat);
2054 /* Exit if file opening failed */ 1975 /* Exit if file opening failed */
@@ -2181,14 +2102,22 @@ int read_jpeg_fd(int fd,
2181 int decode_h = BIT_N(p_jpeg->v_scale[0]) - 1; 2102 int decode_h = BIT_N(p_jpeg->v_scale[0]) - 1;
2182 src_dim.width = (p_jpeg->x_size << p_jpeg->h_scale[0]) >> 3; 2103 src_dim.width = (p_jpeg->x_size << p_jpeg->h_scale[0]) >> 3;
2183 src_dim.height = (p_jpeg->y_size << p_jpeg->v_scale[0]) >> 3; 2104 src_dim.height = (p_jpeg->y_size << p_jpeg->v_scale[0]) >> 3;
2184 p_jpeg->zero_need[0] = (decode_h << 3) + decode_w; 2105#ifdef JPEG_IDCT_TRANSPOSE
2185 p_jpeg->k_need[0] = zig[p_jpeg->zero_need[0]]; 2106 if (p_jpeg->v_scale[0] > 2)
2107 p_jpeg->zero_need[0] = (decode_w << 3) + decode_h;
2108 else
2109#endif
2110 p_jpeg->zero_need[0] = (decode_h << 3) + decode_w;
2111 p_jpeg->k_need[0] = zig[(decode_h << 3) + decode_w];
2186 JDEBUGF("need luma components to %d\n", p_jpeg->k_need[0]); 2112 JDEBUGF("need luma components to %d\n", p_jpeg->k_need[0]);
2187#ifdef HAVE_LCD_COLOR 2113#ifdef HAVE_LCD_COLOR
2188 decode_w = BIT_N(MIN(p_jpeg->h_scale[1],3)) - 1; 2114 decode_w = BIT_N(MIN(p_jpeg->h_scale[1],3)) - 1;
2189 decode_h = BIT_N(MIN(p_jpeg->v_scale[1],3)) - 1; 2115 decode_h = BIT_N(MIN(p_jpeg->v_scale[1],3)) - 1;
2190 p_jpeg->zero_need[1] = (decode_h << 3) + decode_w; 2116 if (p_jpeg->v_scale[1] > 2)
2191 p_jpeg->k_need[1] = zig[p_jpeg->zero_need[1]]; 2117 p_jpeg->zero_need[1] = (decode_w << 3) + decode_h;
2118 else
2119 p_jpeg->zero_need[1] = (decode_h << 3) + decode_w;
2120 p_jpeg->k_need[1] = zig[(decode_h << 3) + decode_w];
2192 JDEBUGF("need chroma components to %d\n", p_jpeg->k_need[1]); 2121 JDEBUGF("need chroma components to %d\n", p_jpeg->k_need[1]);
2193#endif 2122#endif
2194 if (cformat) 2123 if (cformat)