summaryrefslogtreecommitdiff
path: root/apps/recorder/jpeg_load.c
diff options
context:
space:
mode:
authorAndrew Mahone <andrew.mahone@gmail.com>2009-06-19 02:56:00 +0000
committerAndrew Mahone <andrew.mahone@gmail.com>2009-06-19 02:56:00 +0000
commit6a0d931f383259b4b82fcfd1cc87700f53bbcb02 (patch)
tree04960b8122f35101bb31603af084536f307e0ae1 /apps/recorder/jpeg_load.c
parent4c58ad26ba462309f95790c32421130a73909f05 (diff)
downloadrockbox-6a0d931f383259b4b82fcfd1cc87700f53bbcb02.tar.gz
rockbox-6a0d931f383259b4b82fcfd1cc87700f53bbcb02.zip
Core JPEG decoder improvements:
For 8-point and larger (8- and 16-point) vertical IDCTs, transpose the coefficients while decoding them, so that the vertical IDCT can read in rows rather than columns. This improves speed a bit for these sizes even when using the C IDCT. Remove inline ARM asm, replacing it with an external file containing pure asm IDCT functions. Add jpeg_ prefix to JPEG IDCT functions since some of them will now be visible globally. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21345 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/recorder/jpeg_load.c')
-rw-r--r--apps/recorder/jpeg_load.c367
1 files changed, 148 insertions, 219 deletions
diff --git a/apps/recorder/jpeg_load.c b/apps/recorder/jpeg_load.c
index dc8bb33862..f2b3b4ba74 100644
--- a/apps/recorder/jpeg_load.c
+++ b/apps/recorder/jpeg_load.c
@@ -31,6 +31,7 @@
31#include "debug.h" 31#include "debug.h"
32#include "jpeg_load.h" 32#include "jpeg_load.h"
33/*#define JPEG_BS_DEBUG*/ 33/*#define JPEG_BS_DEBUG*/
34#define ROCKBOX_DEBUG_JPEG
34/* for portability of below JPEG code */ 35/* for portability of below JPEG code */
35#define MEMSET(p,v,c) memset(p,v,c) 36#define MEMSET(p,v,c) memset(p,v,c)
36#define MEMCPY(d,s,c) memcpy(d,s,c) 37#define MEMCPY(d,s,c) memcpy(d,s,c)
@@ -49,7 +50,23 @@ typedef struct uint8_rgb jpeg_pix_t;
49#else 50#else
50typedef uint8_t jpeg_pix_t; 51typedef uint8_t jpeg_pix_t;
51#endif 52#endif
53#define JPEG_IDCT_TRANSPOSE
52#define JPEG_PIX_SZ (sizeof(jpeg_pix_t)) 54#define JPEG_PIX_SZ (sizeof(jpeg_pix_t))
55#ifdef HAVE_LCD_COLOR
56#define COLOR_EXTRA_IDCT_WS 64
57#else
58#define COLOR_EXTRA_IDCT_WS 0
59#endif
60#ifdef JPEG_IDCT_TRANSPOSE
61#define V_OUT(n) ws2[8*n]
62#define V_IN_ST 1
63#define TRANSPOSE_EXTRA_IDCT_WS 64
64#else
65#define V_OUT(n) ws[8*n]
66#define V_IN_ST 8
67#define TRANSPOSE_EXTRA_IDCT_WS 0
68#endif
69#define IDCT_WS_SIZE (64 + TRANSPOSE_EXTRA_IDCT_WS + COLOR_EXTRA_IDCT_WS)
53 70
54/* This can't be in jpeg_load.h because plugin.h includes it, and it conflicts 71/* This can't be in jpeg_load.h because plugin.h includes it, and it conflicts
55 * with the definition in jpeg_decoder.h 72 * with the definition in jpeg_decoder.h
@@ -259,7 +276,7 @@ INLINE unsigned range_limit(int value)
259*/ 276*/
260 277
261/* horizontal-pass 1-point IDCT */ 278/* horizontal-pass 1-point IDCT */
262static void idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep) 279static void jpeg_idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
263{ 280{
264 for (; ws < end; ws += 8) 281 for (; ws < end; ws += 8)
265 { 282 {
@@ -269,19 +286,19 @@ static void idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
269} 286}
270 287
271/* vertical-pass 2-point IDCT */ 288/* vertical-pass 2-point IDCT */
272static void idct2v(int16_t *ws, int16_t *end) 289static void jpeg_idct2v(int16_t *ws, int16_t *end)
273{ 290{
274 for (; ws < end; ws++) 291 for (; ws < end; ws++)
275 { 292 {
276 int tmp1 = ws[0]; 293 int tmp1 = ws[0*8];
277 int tmp2 = ws[8]; 294 int tmp2 = ws[1*8];
278 ws[0] = tmp1 + tmp2; 295 ws[0*8] = tmp1 + tmp2;
279 ws[8] = tmp1 - tmp2; 296 ws[1*8] = tmp1 - tmp2;
280 } 297 }
281} 298}
282 299
283/* horizontal-pass 2-point IDCT */ 300/* horizontal-pass 2-point IDCT */
284static void idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep) 301static void jpeg_idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
285{ 302{
286 for (; ws < end; ws += 8, out += rowstep) 303 for (; ws < end; ws += 8, out += rowstep)
287 { 304 {
@@ -295,69 +312,12 @@ static void idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
295 } 312 }
296} 313}
297 314
315#ifndef CPU_ARM
298/* vertical-pass 4-point IDCT */ 316/* vertical-pass 4-point IDCT */
299static void idct4v(int16_t *ws, int16_t *end) 317static void jpeg_idct4v(int16_t *ws, int16_t *end)
300{ 318{
301 for (; ws < end; ws++) 319 for (; ws < end; ws++)
302 { 320 {
303#if defined(CPU_ARM)
304 int t0, t1, t2, t3, t4;
305#if ARM_ARCH <= 4
306 int t5;
307#endif
308 asm volatile(
309 "ldrsh %[t4], [%[ws]]\n\t" /* t4 = tmp0 (ws[8*0]) */
310 "ldrsh %[t1], [%[ws], #32]\n\t" /* t1 = tmp2 (ws[8*2]) */
311 "ldrsh %[t2], [%[ws], #16]\n\t" /* t2 = z2 (ws[8*1]) */
312 "add %[t0], %[t4], %[t1]\n\t" /* t0 = tmp10 >> 2
313 (tmp0 + tmp2) */
314 "sub %[t1], %[t4], %[t1]\n\t" /* t1 = tmp12 >> 2
315 (tmp0 - tmp2) */
316 "ldrsh %[t3], [%[ws], #48]\n\t" /* t3 = z3 (ws[8*3] */
317 "add %[t4], %[t2], %[t3]\n\t" /* t4 = z2 + z3 */
318#if ARM_ARCH > 4
319 "smulbb %[t4], %[c1], %[t4]\n\t"
320 "add %[t4], %[t4], #1024\n\t" /* t4 = z1 */
321 "smlatb %[t3], %[c2c3], %[t3], %[t4]\n\t"
322 "smlabb %[t2], %[c2c3], %[t2], %[t4]\n\t"
323 "mov %[t3], %[t3], asr #11\n\t" /* t3 = tmp0 */
324 "mov %[t2], %[t2], asr #11\n\t" /* t2 = tmp2 */
325#else
326 "add %[t5], %[t4], %[t4], lsl #3\n\t"
327 "rsb %[t4], %[t4], %[t5], lsl #4\n\t"
328 "rsb %[t4], %[t4], %[t4], lsl #5\n\t"
329 "add %[t4], %[t4], #1024\n\t" /*z1*/
330 "mla %[t3], %[c2], %[t3], %[t4]\n\t"
331 "mla %[t2], %[c3], %[t2], %[t4]\n\t"
332 "mov %[t3], %[t3], asr #11\n\t" /* t3 = tmp0 */
333 "mov %[t2], %[t2], asr #11\n\t" /* t2 = tmp2 */
334#endif
335 "add %[t4], %[t2], %[t0], lsl #2\n\t" /* t4 = tmp10 + tmp2 */
336 "rsb %[t0], %[t2], %[t0], lsl #2\n\t" /* t0 = tmp10 - tmp2 */
337 "add %[t2], %[t3], %[t1], lsl #2\n\t" /* t2 = tmp12 + tmp0 */
338 "rsb %[t3], %[t3], %[t1], lsl #2\n\t" /* t3 = tmp12 - tmp0 */
339 "strh %[t4], [%[ws]]\n\t"
340 "strh %[t0], [%[ws], #48]\n\t"
341 "strh %[t2], [%[ws], #16]\n\t"
342 "strh %[t3], [%[ws], #32]\n\t"
343 : [t0] "=&r" (t0),
344 [t1] "=&r" (t1),
345 [t2] "=&r" (t2),
346 [t3] "=&r" (t3),
347 [t4] "=&r" (t4)
348#if ARM_ARCH <= 4
349 ,[t5] "=&r" (t5)
350#endif
351 : [ws] "r" (ws),
352#if ARM_ARCH > 4
353 [c1] "r" (FIX_0_541196100),
354 [c2c3] "r" (((-FIX_1_847759065)<<16)|FIX_0_765366865)
355#else
356 [c2] "r" (-FIX_1_847759065),
357 [c3] "r" (FIX_0_765366865)
358#endif
359 );
360#else
361 int tmp0, tmp2, tmp10, tmp12; 321 int tmp0, tmp2, tmp10, tmp12;
362 int z1, z2, z3; 322 int z1, z2, z3;
363 /* Even part */ 323 /* Even part */
@@ -382,93 +342,18 @@ static void idct4v(int16_t *ws, int16_t *end)
382 CONST_BITS-PASS1_BITS); 342 CONST_BITS-PASS1_BITS);
383 343
384 /* Final output stage */ 344 /* Final output stage */
385
386 ws[8*0] = (int) (tmp10 + tmp2); 345 ws[8*0] = (int) (tmp10 + tmp2);
387 ws[8*3] = (int) (tmp10 - tmp2); 346 ws[8*3] = (int) (tmp10 - tmp2);
388 ws[8*1] = (int) (tmp12 + tmp0); 347 ws[8*1] = (int) (tmp12 + tmp0);
389 ws[8*2] = (int) (tmp12 - tmp0); 348 ws[8*2] = (int) (tmp12 - tmp0);
390#endif
391 } 349 }
392} 350}
393 351
394/* horizontal-pass 4-point IDCT */ 352/* horizontal-pass 4-point IDCT */
395static void idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep) 353static void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
396{ 354{
397 for (; ws < end; out += rowstep, ws += 8) 355 for (; ws < end; out += rowstep, ws += 8)
398 { 356 {
399#if defined(CPU_ARM)
400 int t0, t1, t2, t3, t4;
401#if ARM_ARCH <= 4
402 int t5;
403#endif
404 asm volatile(
405 "ldrsh %[t4], [%[ws]]\n\t" /* t4 = tmp0 (ws[0]) */
406 "ldrsh %[t1], [%[ws], #4]\n\t" /* t1 = tmp2 (ws[2]) */
407 "add %[t4], %[t4], #16\n\t" /* add rounding to DC */
408 "add %[t4], %[t4], #4096\n\t" /* pre-add offset */
409 "ldrsh %[t2], [%[ws], #2]\n\t" /* t2 = z2 (ws[1]) */
410 "add %[t0], %[t4], %[t1]\n\t" /* t0 = tmp10 >> 13
411 (tmp0 + tmp2) */
412 "sub %[t1], %[t4], %[t1]\n\t" /* t1 = tmp12 >> 13
413 (tmp0 - tmp2) */
414 "ldrsh %[t3], [%[ws], #6]\n\t" /* t3 = z3 (ws[3] */
415 "add %[t4], %[t2], %[t3]\n\t" /* t4 = z2 + z3 */
416#if ARM_ARCH > 4
417 "smulbb %[t4], %[c1], %[t4]\n\t"
418 "smlatb %[t3], %[c2c3], %[t3], %[t4]\n\t"
419 "smlabb %[t2], %[c2c3], %[t2], %[t4]\n\t"
420#else
421 "add %[t5], %[t4], %[t4], lsl #3\n\t"
422 "rsb %[t4], %[t4], %[t5], lsl #4\n\t"
423 "rsb %[t4], %[t4], %[t4], lsl #5\n\t" /* t4 = z1 */
424 "mla %[t3], %[c2], %[t3], %[t4]\n\t"
425 "mla %[t2], %[c3], %[t2], %[t4]\n\t"
426#endif
427 "add %[t4], %[t2], %[t0], lsl #13\n\t" /* t4 = tmp10 + tmp2 */
428 "rsb %[t0], %[t2], %[t0], lsl #13\n\t" /* t0 = tmp10 - tmp2 */
429 "add %[t2], %[t3], %[t1], lsl #13\n\t" /* t2 = tmp12 + tmp0 */
430 "rsb %[t3], %[t3], %[t1], lsl #13\n\t" /* t3 = tmp12 - tmp0 */
431 "mov %[t4], %[t4], asr #18\n\t" /* descale results */
432 "mov %[t0], %[t0], asr #18\n\t"
433 "mov %[t2], %[t2], asr #18\n\t"
434 "mov %[t3], %[t3], asr #18\n\t"
435 "cmp %[t4], #255\n\t" /* range limit results */
436 "mvnhi %[t4], %[t4], asr #31\n\t"
437 "cmp %[t0], #255\n\t"
438 "mvnhi %[t0], %[t0], asr #31\n\t"
439 "cmp %[t2], #255\n\t"
440 "mvnhi %[t2], %[t2], asr #31\n\t"
441 "cmp %[t3], #255\n\t"
442 "mvnhi %[t3], %[t3], asr #31\n\t"
443 "cmp %[t4], #255\n\t"
444 "mvnhi %[t4], %[t4], asr #31\n\t"
445 "strb %[t4], [%[out]]\n\t"
446 "strb %[t0], [%[out], %[o3]]\n\t"
447 "strb %[t2], [%[out], %[o1]]\n\t"
448 "strb %[t3], [%[out], %[o2]]\n\t"
449 : [t0] "=&r" (t0),
450 [t1] "=&r" (t1),
451 [t2] "=&r" (t2),
452 [t3] "=&r" (t3),
453 [t4] "=&r" (t4)
454#if ARM_ARCH <= 4
455
456 ,[t5] "=&r" (t5)
457#endif
458 : [ws] "r" (ws),
459 [out] "r" (out),
460 [o1] "i" (JPEG_PIX_SZ),
461 [o2] "i" (JPEG_PIX_SZ*2),
462 [o3] "i" (JPEG_PIX_SZ*3),
463#if ARM_ARCH > 4
464 [c1] "r" (FIX_0_541196100),
465 [c2c3] "r" (((-FIX_1_847759065)<<16)|FIX_0_765366865)
466#else
467 [c2] "r" (-FIX_1_847759065),
468 [c3] "r" (FIX_0_765366865)
469#endif
470 );
471#else
472 int tmp0, tmp2, tmp10, tmp12; 357 int tmp0, tmp2, tmp10, tmp12;
473 int z1, z2, z3; 358 int z1, z2, z3;
474 /* Even part */ 359 /* Even part */
@@ -500,18 +385,27 @@ static void idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
500 DS_OUT)); 385 DS_OUT));
501 out[JPEG_PIX_SZ*2] = range_limit((int) RIGHT_SHIFT(tmp12 - tmp0, 386 out[JPEG_PIX_SZ*2] = range_limit((int) RIGHT_SHIFT(tmp12 - tmp0,
502 DS_OUT)); 387 DS_OUT));
503#endif
504 } 388 }
505} 389}
390#else
391extern void jpeg_idct4v(int16_t *ws, int16_t *end);
392extern void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
393#endif
506 394
507/* vertical-pass 8-point IDCT */ 395/* vertical-pass 8-point IDCT */
508static void idct8v(int16_t *ws, int16_t *end) 396static void jpeg_idct8v(int16_t *ws, int16_t *end)
509{ 397{
510 long tmp0, tmp1, tmp2, tmp3; 398 long tmp0, tmp1, tmp2, tmp3;
511 long tmp10, tmp11, tmp12, tmp13; 399 long tmp10, tmp11, tmp12, tmp13;
512 long z1, z2, z3, z4, z5; 400 long z1, z2, z3, z4, z5;
401#ifdef JPEG_IDCT_TRANSPOSE
402 int16_t *ws2 = ws + 64;
403 for (; ws < end; ws += 8, ws2++)
404 {
405#else
513 for (; ws < end; ws++) 406 for (; ws < end; ws++)
514 { 407 {
408#endif
515 /* Due to quantization, we will usually find that many of the input 409 /* Due to quantization, we will usually find that many of the input
516 * coefficients are zero, especially the AC terms. We can exploit this 410 * coefficients are zero, especially the AC terms. We can exploit this
517 * by short-circuiting the IDCT calculation for any column in which all 411 * by short-circuiting the IDCT calculation for any column in which all
@@ -520,30 +414,30 @@ static void idct8v(int16_t *ws, int16_t *end)
520 * With typical images and quantization tables, half or more of the 414 * With typical images and quantization tables, half or more of the
521 * column DCT calculations can be simplified this way. 415 * column DCT calculations can be simplified this way.
522 */ 416 */
523 if ((ws[8*1] | ws[8*2] | ws[8*3] 417 if ((ws[V_IN_ST*1] | ws[V_IN_ST*2] | ws[V_IN_ST*3]
524 | ws[8*4] | ws[8*5] | ws[8*6] | ws[8*7]) == 0) 418 | ws[V_IN_ST*4] | ws[V_IN_ST*5] | ws[V_IN_ST*6] | ws[V_IN_ST*7]) == 0)
525 { 419 {
526 /* AC terms all zero */ 420 /* AC terms all zero */
527 int dcval = ws[8*0] << PASS1_BITS; 421 int dcval = ws[V_IN_ST*0] << PASS1_BITS;
528 422
529 ws[8*0] = ws[8*1] = ws[8*2] = ws[8*3] = ws[8*4] 423 V_OUT(0) = V_OUT(1) = V_OUT(2) = V_OUT(3) = V_OUT(4) = V_OUT(5) =
530 = ws[8*5] = ws[8*6] = ws[8*7] = dcval; 424 V_OUT(6) = V_OUT(7) = dcval;
531 continue; 425 continue;
532 } 426 }
533 427
534 /* Even part: reverse the even part of the forward DCT. */ 428 /* Even part: reverse the even part of the forward DCT. */
535 /* The rotator is sqrt(2)*c(-6). */ 429 /* The rotator is sqrt(2)*c(-6). */
536 430
537 z2 = ws[8*2]; 431 z2 = ws[V_IN_ST*2];
538 z3 = ws[8*6]; 432 z3 = ws[V_IN_ST*6];
539 433
540 z1 = MULTIPLY16(z2 + z3, FIX_0_541196100); 434 z1 = MULTIPLY16(z2 + z3, FIX_0_541196100);
541 tmp2 = z1 + MULTIPLY16(z3, - FIX_1_847759065); 435 tmp2 = z1 + MULTIPLY16(z3, - FIX_1_847759065);
542 tmp3 = z1 + MULTIPLY16(z2, FIX_0_765366865); 436 tmp3 = z1 + MULTIPLY16(z2, FIX_0_765366865);
543 437
544 z2 = ws[8*0] << CONST_BITS; 438 z2 = ws[V_IN_ST*0] << CONST_BITS;
545 z2 += ONE << (CONST_BITS - PASS1_BITS - 1); 439 z2 += ONE << (CONST_BITS - PASS1_BITS - 1);
546 z3 = ws[8*4] << CONST_BITS; 440 z3 = ws[V_IN_ST*4] << CONST_BITS;
547 441
548 tmp0 = (z2 + z3); 442 tmp0 = (z2 + z3);
549 tmp1 = (z2 - z3); 443 tmp1 = (z2 - z3);
@@ -556,10 +450,10 @@ static void idct8v(int16_t *ws, int16_t *end)
556 /* Odd part per figure 8; the matrix is unitary and hence its 450 /* Odd part per figure 8; the matrix is unitary and hence its
557 transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. */ 451 transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. */
558 452
559 tmp0 = ws[8*7]; 453 tmp0 = ws[V_IN_ST*7];
560 tmp1 = ws[8*5]; 454 tmp1 = ws[V_IN_ST*5];
561 tmp2 = ws[8*3]; 455 tmp2 = ws[V_IN_ST*3];
562 tmp3 = ws[8*1]; 456 tmp3 = ws[V_IN_ST*1];
563 457
564 z1 = tmp0 + tmp3; 458 z1 = tmp0 + tmp3;
565 z2 = tmp1 + tmp2; 459 z2 = tmp1 + tmp2;
@@ -586,19 +480,19 @@ static void idct8v(int16_t *ws, int16_t *end)
586 480
587 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ 481 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
588 482
589 ws[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS); 483 V_OUT(0) = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
590 ws[8*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS); 484 V_OUT(7) = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
591 ws[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS); 485 V_OUT(1) = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
592 ws[8*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS); 486 V_OUT(6) = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
593 ws[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS); 487 V_OUT(2) = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
594 ws[8*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS); 488 V_OUT(5) = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
595 ws[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS); 489 V_OUT(3) = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
596 ws[8*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS); 490 V_OUT(4) = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
597 } 491 }
598} 492}
599 493
600/* horizontal-pass 8-point IDCT */ 494/* horizontal-pass 8-point IDCT */
601static void idct8h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep) 495static void jpeg_idct8h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
602{ 496{
603 long tmp0, tmp1, tmp2, tmp3; 497 long tmp0, tmp1, tmp2, tmp3;
604 long tmp10, tmp11, tmp12, tmp13; 498 long tmp10, tmp11, tmp12, tmp13;
@@ -709,20 +603,26 @@ static void idct8h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
709 603
710#ifdef HAVE_LCD_COLOR 604#ifdef HAVE_LCD_COLOR
711/* vertical-pass 16-point IDCT */ 605/* vertical-pass 16-point IDCT */
712static void idct16v(int16_t *ws, int16_t *end) 606static void jpeg_idct16v(int16_t *ws, int16_t *end)
713{ 607{
714 long tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13; 608 long tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
715 long tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27; 609 long tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
716 long z1, z2, z3, z4; 610 long z1, z2, z3, z4;
611#ifdef JPEG_IDCT_TRANSPOSE
612 int16_t *ws2 = ws + 64;
613 for (; ws < end; ws += 8, ws2++)
614 {
615#else
717 for (; ws < end; ws++) 616 for (; ws < end; ws++)
718 { 617 {
618#endif
719 /* Even part */ 619 /* Even part */
720 620
721 tmp0 = ws[8*0] << CONST_BITS; 621 tmp0 = ws[V_IN_ST*0] << CONST_BITS;
722 /* Add fudge factor here for final descale. */ 622 /* Add fudge factor here for final descale. */
723 tmp0 += 1 << (CONST_BITS-PASS1_BITS-1); 623 tmp0 += 1 << (CONST_BITS-PASS1_BITS-1);
724 624
725 z1 = ws[8*4]; 625 z1 = ws[V_IN_ST*4];
726 tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */ 626 tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */
727 tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */ 627 tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */
728 628
@@ -731,8 +631,8 @@ static void idct16v(int16_t *ws, int16_t *end)
731 tmp12 = tmp0 + tmp2; 631 tmp12 = tmp0 + tmp2;
732 tmp13 = tmp0 - tmp2; 632 tmp13 = tmp0 - tmp2;
733 633
734 z1 = ws[8*2]; 634 z1 = ws[V_IN_ST*2];
735 z2 = ws[8*6]; 635 z2 = ws[V_IN_ST*6];
736 z3 = z1 - z2; 636 z3 = z1 - z2;
737 z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */ 637 z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */
738 z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */ 638 z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */
@@ -757,10 +657,10 @@ static void idct16v(int16_t *ws, int16_t *end)
757 657
758 /* Odd part */ 658 /* Odd part */
759 659
760 z1 = ws[8*1]; 660 z1 = ws[V_IN_ST*1];
761 z2 = ws[8*3]; 661 z2 = ws[V_IN_ST*3];
762 z3 = ws[8*5]; 662 z3 = ws[V_IN_ST*5];
763 z4 = ws[8*7]; 663 z4 = ws[V_IN_ST*7];
764 664
765 tmp11 = z1 + z3; 665 tmp11 = z1 + z3;
766 666
@@ -795,27 +695,27 @@ static void idct16v(int16_t *ws, int16_t *end)
795 tmp11 += z2; 695 tmp11 += z2;
796 696
797 /* Final output stage */ 697 /* Final output stage */
798 ws[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp0, CONST_BITS-PASS1_BITS); 698 V_OUT(0) = (int) RIGHT_SHIFT(tmp20 + tmp0, CONST_BITS-PASS1_BITS);
799 ws[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0, CONST_BITS-PASS1_BITS); 699 V_OUT(15) = (int) RIGHT_SHIFT(tmp20 - tmp0, CONST_BITS-PASS1_BITS);
800 ws[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp1, CONST_BITS-PASS1_BITS); 700 V_OUT(1) = (int) RIGHT_SHIFT(tmp21 + tmp1, CONST_BITS-PASS1_BITS);
801 ws[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1, CONST_BITS-PASS1_BITS); 701 V_OUT(14) = (int) RIGHT_SHIFT(tmp21 - tmp1, CONST_BITS-PASS1_BITS);
802 ws[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp2, CONST_BITS-PASS1_BITS); 702 V_OUT(2) = (int) RIGHT_SHIFT(tmp22 + tmp2, CONST_BITS-PASS1_BITS);
803 ws[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2, CONST_BITS-PASS1_BITS); 703 V_OUT(13) = (int) RIGHT_SHIFT(tmp22 - tmp2, CONST_BITS-PASS1_BITS);
804 ws[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp3, CONST_BITS-PASS1_BITS); 704 V_OUT(3) = (int) RIGHT_SHIFT(tmp23 + tmp3, CONST_BITS-PASS1_BITS);
805 ws[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3, CONST_BITS-PASS1_BITS); 705 V_OUT(12) = (int) RIGHT_SHIFT(tmp23 - tmp3, CONST_BITS-PASS1_BITS);
806 ws[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS); 706 V_OUT(4) = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
807 ws[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS); 707 V_OUT(11) = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
808 ws[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS); 708 V_OUT(5) = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
809 ws[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS); 709 V_OUT(10) = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
810 ws[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS); 710 V_OUT(6) = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
811 ws[8*9] = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS); 711 V_OUT(9) = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
812 ws[8*7] = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS); 712 V_OUT(7) = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
813 ws[8*8] = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS); 713 V_OUT(8) = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
814 } 714 }
815} 715}
816 716
817/* horizontal-pass 16-point IDCT */ 717/* horizontal-pass 16-point IDCT */
818static void idct16h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep) 718static void jpeg_idct16h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
819{ 719{
820 long tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13; 720 long tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
821 long tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27; 721 long tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
@@ -946,12 +846,12 @@ struct idct_entry {
946}; 846};
947 847
948struct idct_entry idct_tbl[] = { 848struct idct_entry idct_tbl[] = {
949 { PASS1_BITS, NULL, idct1h }, 849 { PASS1_BITS, NULL, jpeg_idct1h },
950 { PASS1_BITS, idct2v, idct2h }, 850 { PASS1_BITS, jpeg_idct2v, jpeg_idct2h },
951 { 0, idct4v, idct4h }, 851 { 0, jpeg_idct4v, jpeg_idct4h },
952 { 0, idct8v, idct8h }, 852 { 0, jpeg_idct8v, jpeg_idct8h },
953#ifdef HAVE_LCD_COLOR 853#ifdef HAVE_LCD_COLOR
954 { 0, idct16v, idct16h }, 854 { 0, jpeg_idct16v, jpeg_idct16h },
955#endif 855#endif
956}; 856};
957 857
@@ -1468,21 +1368,27 @@ static void fix_huff_tbl(int* htbl, struct derived_tbl* dtbl)
1468} 1368}
1469 1369
1470 1370
1471/* zag[i] is the natural-order position of the i'th element of zigzag order. 1371/* zag[i] is the natural-order position of the i'th element of zigzag order. */
1472 * If the incoming data is corrupted, decode_mcu could attempt to
1473 * reference values beyond the end of the array. To avoid a wild store,
1474 * we put some extra zeroes after the real entries.
1475 */
1476static const unsigned char zag[] = 1372static const unsigned char zag[] =
1477{ 1373{
1478 0, 1, 8, 16, 9, 2, 3, 10, 1374#ifdef JPEG_IDCT_TRANSPOSE
1479 17, 24, 32, 25, 18, 11, 4, 5, 1375 0, 8, 1, 2, 9, 16, 24, 17,
1480 12, 19, 26, 33, 40, 48, 41, 34, 1376 10, 3, 4, 11, 18, 25, 32, 40,
1481 27, 20, 13, 6, 7, 14, 21, 28, 1377 33, 26, 19, 12, 5, 6, 13, 20,
1482 35, 42, 49, 56, 57, 50, 43, 36, 1378 27, 34, 41, 48, 56, 49, 42, 35,
1483 29, 22, 15, 23, 30, 37, 44, 51, 1379 28, 21, 14, 7, 15, 22, 29, 36,
1484 58, 59, 52, 45, 38, 31, 39, 46, 1380 43, 50, 57, 58, 51, 44, 37, 30,
1485 53, 60, 61, 54, 47, 55, 62, 63, 1381 23, 31, 38, 45, 52, 59, 60, 53,
1382 46, 39, 47, 54, 61, 62, 55, 63,
1383#endif
1384 0, 1, 8, 16, 9, 2, 3, 10,
1385 17, 24, 32, 25, 18, 11, 4, 5,
1386 12, 19, 26, 33, 40, 48, 41, 34,
1387 27, 20, 13, 6, 7, 14, 21, 28,
1388 35, 42, 49, 56, 57, 50, 43, 36,
1389 29, 22, 15, 23, 30, 37, 44, 51,
1390 58, 59, 52, 45, 38, 31, 39, 46,
1391 53, 60, 61, 54, 47, 55, 62, 63,
1486}; 1392};
1487 1393
1488/* zig[i] is the zig-zag order position of the i'th element of natural 1394/* zig[i] is the zig-zag order position of the i'th element of natural
@@ -1898,17 +1804,20 @@ static struct img_part *store_row_jpeg(void *jpeg_args)
1898 store_offs[p_jpeg->store_pos[1]] = JPEG_PIX_SZ << p_jpeg->h_scale[0]; 1804 store_offs[p_jpeg->store_pos[1]] = JPEG_PIX_SZ << p_jpeg->h_scale[0];
1899 store_offs[p_jpeg->store_pos[2]] = b_width << p_jpeg->v_scale[0]; 1805 store_offs[p_jpeg->store_pos[2]] = b_width << p_jpeg->v_scale[0];
1900 store_offs[p_jpeg->store_pos[3]] = store_offs[1] + store_offs[2]; 1806 store_offs[p_jpeg->store_pos[3]] = store_offs[1] + store_offs[2];
1901 1807 /* decoded DCT coefficients */
1902 int16_t block[128]; /* decoded DCT coefficients */ 1808 int16_t block[IDCT_WS_SIZE] __attribute__((aligned(8)));
1903 for (x = 0; x < p_jpeg->x_mbl; x++) 1809 for (x = 0; x < p_jpeg->x_mbl; x++)
1904 { 1810 {
1905 int blkn; 1811 int blkn;
1906 for (blkn = 0; blkn < p_jpeg->blocks; blkn++) 1812 for (blkn = 0; blkn < p_jpeg->blocks; blkn++)
1907 { 1813 {
1908 int k = 1; /* coefficient index */
1909 int s, r; /* huffman values */
1910 int ci = p_jpeg->mcu_membership[blkn]; /* component index */ 1814 int ci = p_jpeg->mcu_membership[blkn]; /* component index */
1911 int ti = p_jpeg->tab_membership[blkn]; /* table index */ 1815 int ti = p_jpeg->tab_membership[blkn]; /* table index */
1816#ifdef JPEG_IDCT_TRANSPOSE
1817 bool transpose = p_jpeg->v_scale[!!ci] > 2;
1818#endif
1819 int k = 1; /* coefficient index */
1820 int s, r; /* huffman values */
1912 struct derived_tbl* dctbl = &p_jpeg->dc_derived_tbls[ti]; 1821 struct derived_tbl* dctbl = &p_jpeg->dc_derived_tbls[ti];
1913 struct derived_tbl* actbl = &p_jpeg->ac_derived_tbls[ti]; 1822 struct derived_tbl* actbl = &p_jpeg->ac_derived_tbls[ti];
1914 1823
@@ -1948,7 +1857,11 @@ static struct img_part *store_row_jpeg(void *jpeg_args)
1948 r = get_bits(p_jpeg, s); 1857 r = get_bits(p_jpeg, s);
1949 r = HUFF_EXTEND(r, s); 1858 r = HUFF_EXTEND(r, s);
1950 r = MULTIPLY16(r, p_jpeg->quanttable[!!ci][k]); 1859 r = MULTIPLY16(r, p_jpeg->quanttable[!!ci][k]);
1860#ifdef JPEG_IDCT_TRANSPOSE
1861 block[zag[transpose ? k : k + 64]] = r ;
1862#else
1951 block[zag[k]] = r ; 1863 block[zag[k]] = r ;
1864#endif
1952 } 1865 }
1953 else 1866 else
1954 { 1867 {
@@ -1988,10 +1901,19 @@ block_end:
1988 int idct_rows = BIT_N(p_jpeg->v_scale[!!ci]); 1901 int idct_rows = BIT_N(p_jpeg->v_scale[!!ci]);
1989 unsigned char *b_out = out + (ci ? ci : store_offs[blkn]); 1902 unsigned char *b_out = out + (ci ? ci : store_offs[blkn]);
1990 if (idct_tbl[p_jpeg->v_scale[!!ci]].v_idct) 1903 if (idct_tbl[p_jpeg->v_scale[!!ci]].v_idct)
1904#ifdef JPEG_IDCT_TRANSPOSE
1905 idct_tbl[p_jpeg->v_scale[!!ci]].v_idct(block,
1906 transpose ? block + 8 * idct_cols
1907 : block + idct_cols);
1908 uint16_t * h_block = transpose ? block + 64 : block;
1909 idct_tbl[p_jpeg->h_scale[!!ci]].h_idct(h_block, b_out,
1910 h_block + idct_rows * 8, b_width);
1911#else
1991 idct_tbl[p_jpeg->v_scale[!!ci]].v_idct(block, 1912 idct_tbl[p_jpeg->v_scale[!!ci]].v_idct(block,
1992 block + idct_cols); 1913 block + idct_cols);
1993 idct_tbl[p_jpeg->h_scale[!!ci]].h_idct(block, b_out, 1914 idct_tbl[p_jpeg->h_scale[!!ci]].h_idct(block, b_out,
1994 block + idct_rows * 8, b_width); 1915 block + idct_rows * 8, b_width);
1916#endif
1995 } 1917 }
1996 } /* for blkn */ 1918 } /* for blkn */
1997 /* don't starve other threads while an MCU row decodes */ 1919 /* don't starve other threads while an MCU row decodes */
@@ -2048,7 +1970,6 @@ int read_jpeg_file(const char* filename,
2048{ 1970{
2049 int fd, ret; 1971 int fd, ret;
2050 fd = open(filename, O_RDONLY); 1972 fd = open(filename, O_RDONLY);
2051
2052 JDEBUGF("read_jpeg_file: filename: %s buffer len: %d cformat: %p\n", 1973 JDEBUGF("read_jpeg_file: filename: %s buffer len: %d cformat: %p\n",
2053 filename, maxsize, cformat); 1974 filename, maxsize, cformat);
2054 /* Exit if file opening failed */ 1975 /* Exit if file opening failed */
@@ -2181,14 +2102,22 @@ int read_jpeg_fd(int fd,
2181 int decode_h = BIT_N(p_jpeg->v_scale[0]) - 1; 2102 int decode_h = BIT_N(p_jpeg->v_scale[0]) - 1;
2182 src_dim.width = (p_jpeg->x_size << p_jpeg->h_scale[0]) >> 3; 2103 src_dim.width = (p_jpeg->x_size << p_jpeg->h_scale[0]) >> 3;
2183 src_dim.height = (p_jpeg->y_size << p_jpeg->v_scale[0]) >> 3; 2104 src_dim.height = (p_jpeg->y_size << p_jpeg->v_scale[0]) >> 3;
2184 p_jpeg->zero_need[0] = (decode_h << 3) + decode_w; 2105#ifdef JPEG_IDCT_TRANSPOSE
2185 p_jpeg->k_need[0] = zig[p_jpeg->zero_need[0]]; 2106 if (p_jpeg->v_scale[0] > 2)
2107 p_jpeg->zero_need[0] = (decode_w << 3) + decode_h;
2108 else
2109#endif
2110 p_jpeg->zero_need[0] = (decode_h << 3) + decode_w;
2111 p_jpeg->k_need[0] = zig[(decode_h << 3) + decode_w];
2186 JDEBUGF("need luma components to %d\n", p_jpeg->k_need[0]); 2112 JDEBUGF("need luma components to %d\n", p_jpeg->k_need[0]);
2187#ifdef HAVE_LCD_COLOR 2113#ifdef HAVE_LCD_COLOR
2188 decode_w = BIT_N(MIN(p_jpeg->h_scale[1],3)) - 1; 2114 decode_w = BIT_N(MIN(p_jpeg->h_scale[1],3)) - 1;
2189 decode_h = BIT_N(MIN(p_jpeg->v_scale[1],3)) - 1; 2115 decode_h = BIT_N(MIN(p_jpeg->v_scale[1],3)) - 1;
2190 p_jpeg->zero_need[1] = (decode_h << 3) + decode_w; 2116 if (p_jpeg->v_scale[1] > 2)
2191 p_jpeg->k_need[1] = zig[p_jpeg->zero_need[1]]; 2117 p_jpeg->zero_need[1] = (decode_w << 3) + decode_h;
2118 else
2119 p_jpeg->zero_need[1] = (decode_h << 3) + decode_w;
2120 p_jpeg->k_need[1] = zig[(decode_h << 3) + decode_w];
2192 JDEBUGF("need chroma components to %d\n", p_jpeg->k_need[1]); 2121 JDEBUGF("need chroma components to %d\n", p_jpeg->k_need[1]);
2193#endif 2122#endif
2194 if (cformat) 2123 if (cformat)