From 3152bfc39a8e1b85c4d0e82fbb75a10dab528ea6 Mon Sep 17 00:00:00 2001 From: Andrew Mahone Date: Sat, 27 Jun 2009 09:21:22 +0000 Subject: ARM assembly 8-point IDCT, both passes. No ARMv5/6 optimizations yet, aside from usat for final output. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21526 a1c6a512-1295-4272-9138-f99709370657 --- apps/recorder/jpeg_idct_arm.S | 265 ++++++++++++++++++++++++++++++++++++++++++ apps/recorder/jpeg_load.c | 17 +-- 2 files changed, 275 insertions(+), 7 deletions(-) diff --git a/apps/recorder/jpeg_idct_arm.S b/apps/recorder/jpeg_idct_arm.S index b9c94e5639..01b08c4b5a 100644 --- a/apps/recorder/jpeg_idct_arm.S +++ b/apps/recorder/jpeg_idct_arm.S @@ -35,6 +35,10 @@ .type jpeg_idct4v, %function .global jpeg_idct4h .type jpeg_idct4h, %function + .global jpeg_idct8v + .type jpeg_idct8v, %function + .global jpeg_idct8h + .type jpeg_idct8h, %function jpeg_idct1h: /* In the common case of one pass through the loop, the extra add should be @@ -414,3 +418,264 @@ jpeg_idct4h: ldmia sp!, { r4-r9, pc } #endif .size jpeg_idct4h, .-jpeg_idct4h + +jpeg_idct8v: /* vertical pass; r0 = ws, r1 = end (see the C prototype in jpeg_load.c) */ + stmdb sp!, { r4-r11, lr } + add r2, r0, #128 /* r2 = output pointer, starting 128 bytes past ws */ +1: + ldmia r0!, { r4-r7 } /* r4-r7 = d0..d7, two 16-bit coefficients per register */ + mov r8, r4, lsl #16 /* r8 = d0 << 16 */ + orrs r9, r6, r7 + orreqs r9, r5, r4, lsr #16 + bne 2f /* taken when any of d1..d7 is nonzero */ + mov r8, r8, asr #14 /* d1..d7 all zero: every output is d0 << 2 */ + strh r8, [r2] /* the 8 outputs are stored 16 bytes apart */ + strh r8, [r2, #16] + strh r8, [r2, #32] + strh r8, [r2, #48] + strh r8, [r2, #64] + strh r8, [r2, #80] + strh r8, [r2, #96] + strh r8, [r2, #112] + cmp r0, r1 + add r2, r2, #2 + bcc 1b /* loop while r0 < r1 (unsigned) */ + ldmia sp!, { r4-r11, pc } +2: + ldr r14, =4433 + ldr r12, =-15137 + mov r10, r5, lsl #16 + mov r11, r7, lsl #16 + mov r10, r10, asr #16 /* r10 = z2 = d2 */ + mov r11, r11, asr #16 /* r11 = z3 = d6 */ + add r8, r8, #8192 + add r9, r10, r11 + mov r8, r8, asr #3 /* r8 = z4 = (d0 << 13) + 1024 */ + mul r9, r14, r9 /* r9 = z1 = (z2 + z3) * 4433 */ + ldr r14, =6270 + mla r11, r12, r11, r9 /* r11 = tmp2 = z1 - z3 * 15137 */ + mla r10, r14, r10, r9 /* r10 = tmp3 = z1 + z2 * 6270 */ + mov r9, r6, lsl #16 /* r9 = z5 << 3 = d4 << 16 
*/ + add r12, r8, r9, asr #3 /* r12 = tmp0 = z4 + z5 */ + sub r14, r8, r9, asr #3 /* r14 = tmp1 = z4 - z5 */ + add r8, r12, r10 /* r8 = tmp10 = tmp0 + tmp3 */ + sub r9, r12, r10 /* r9 = tmp13 = tmp0 - tmp3 */ + add r10, r14, r11 /* r10 = tmp11 = tmp1 + tmp2 */ + sub r11, r14, r11 /* r11 = tmp12 = tmp1 - tmp2 */ + stmdb sp, { r8-r11 } /* tmp10 tmp13 tmp11 tmp12, parked just below sp (no writeback) */ + mov r4, r4, asr #16 /* r4 = tmp3 = d1 */ + mov r5, r5, asr #16 /* r5 = tmp2 = d3 */ + mov r6, r6, asr #16 /* r6 = tmp1 = d5 */ + mov r7, r7, asr #16 /* r7 = tmp0 = d7 */ + ldr r10, =9633 + ldr r11, =-16069 + add r12, r5, r7 /* r12 = z3 = tmp0 + tmp2 */ + add r14, r4, r6 /* r14 = z4 = tmp1 + tmp3 */ + add r9, r12, r14 /* r9 = z3 + z4 */ + mul r9, r10, r9 /* r9 = z5 = (z3 + z4) * 9633 */ + ldr r10, =-3196 + mla r12, r11, r12, r9 /* r12 = z3 = z5 - z3 * 16069 */ + ldr r11, =-7373 + mla r14, r10, r14, r9 /* r14 = z4 = z5 - z4 * 3196 */ + ldr r10, =2446 + add r9, r4, r7 /* r9 = tmp0 + tmp3 */ + mla r8, r11, r9, r12 /* r8 = z1 + z3 */ + mla r9, r11, r9, r14 /* r9 = z1 + z4 */ + ldr r11, =12299 + mla r7, r10, r7, r8 /* r7 = tmp0 = z1 + z3 + tmp0 * 2446 */ + ldr r10, =-20995 + mla r4, r11, r4, r9 /* r4 = tmp3 = z1 + z4 + tmp3 * 12299 */ + ldr r11, =25172 + add r9, r5, r6 /* r9 = tmp1 + tmp2 */ + mla r12, r10, r9, r12 /* r12 = z2 + z3 */ + mla r14, r10, r9, r14 /* r14 = z2 + z4 */ + ldr r10, =16819 + mla r5, r11, r5, r12 /* r5 = tmp2 = z2 + z3 + tmp2 * 25172 */ + mla r6, r10, r6, r14 /* r6 = tmp1 = z2 + z4 + tmp1 * 16819 */ + ldmdb sp, { r8-r11 } /* reload tmp10 tmp13 tmp11 tmp12 */ + add r12, r8, r4 /* o0 */ + sub r14, r8, r4 /* o7 */ + add r8, r9, r7 /* o3 */ + sub r9, r9, r7 /* o4 */ + add r4, r10, r5 /* o1 */ + sub r5, r10, r5 /* o6 */ + add r10, r11, r6 /* o2 */ + sub r11, r11, r6 /* o5 */ + /* output in order: r12 r4 r10 r8 r9 r11 r5 r14 */ + mov r12, r12, asr #11 /* descale each output by 11 bits */ + mov r4, r4, asr #11 + mov r10, r10, asr #11 + mov r8, r8, asr #11 + mov r9, r9, asr #11 + mov r11, r11, asr #11 + mov r5, r5, asr #11 + mov r14, r14, asr #11 + strh r12, [r2] + 
strh r4, [r2, #16] + strh r10, [r2, #32] + strh r8, [r2, #48] + strh r9, [r2, #64] + strh r11, [r2, #80] + strh r5, [r2, #96] + strh r14, [r2, #112] + cmp r0, r1 + add r2, r2, #2 + bcc 1b + ldmia sp!, { r4-r11, pc } + .size jpeg_idct8v, .-jpeg_idct8v + +jpeg_idct8h: /* horizontal pass; r0 = ws, r1 = out, r2 = end, r3 = rowstep (see the C prototype in jpeg_load.c) */ + stmdb sp!, { r4-r11, lr } +1: + ldmia r0!, { r4-r7 } /* r4-r7 = d0..d7, two 16-bit coefficients per register */ + ldr r14, =4112 /* 4112 = (128 << 5) + 16 */ + mov r8, r4, lsl #16 + add r8, r8, r14, lsl #16 /* r8 = (d0 + 4112) << 16 */ + orrs r9, r6, r7 + orreqs r9, r5, r4, lsr #16 + bne 2f /* taken when any of d1..d7 is nonzero */ + mov r8, r8, asr #21 /* d1..d7 all zero: every output is (d0 + 4112) >> 5 */ + cmp r8, #255 + mvnhi r8, r8, asr #31 /* outside 0..255: negative -> 0, too large -> low byte 255 */ +#ifdef HAVE_LCD_COLOR + strb r8, [r1] /* output bytes are stored 4 apart in this configuration */ + strb r8, [r1, #4] + strb r8, [r1, #8] + strb r8, [r1, #12] + strb r8, [r1, #16] + strb r8, [r1, #20] + strb r8, [r1, #24] + strb r8, [r1, #28] +#else + strb r8, [r1] + strb r8, [r1, #1] + strb r8, [r1, #2] + strb r8, [r1, #3] + strb r8, [r1, #4] + strb r8, [r1, #5] + strb r8, [r1, #6] + strb r8, [r1, #7] +#endif + add r1, r1, r3 /* advance out by rowstep */ + cmp r0, r2 + bcc 1b /* loop while r0 < r2 (unsigned) */ + ldmia sp!, { r4-r11, pc } +2: + ldr r14, =4433 + ldr r12, =-15137 + mov r10, r5, lsl #16 + mov r11, r7, lsl #16 + mov r10, r10, asr #16 /* r10 = z2 = d2 */ + mov r11, r11, asr #16 /* r11 = z3 = d6 */ + add r9, r10, r11 + mov r8, r8, asr #3 /* r8 = z4 = (d0 + 4112) << 13 */ + mul r9, r14, r9 /* r9 = z1 = (z2 + z3) * 4433 */ + ldr r14, =6270 + mla r11, r12, r11, r9 /* r11 = tmp2 = z1 - z3 * 15137 */ + mla r10, r14, r10, r9 /* r10 = tmp3 = z1 + z2 * 6270 */ + mov r9, r6, lsl #16 /* r9 = z5 << 3 = d4 << 16 */ + add r12, r8, r9, asr #3 /* r12 = tmp0 = z4 + z5 */ + sub r14, r8, r9, asr #3 /* r14 = tmp1 = z4 - z5 */ + add r8, r12, r10 /* r8 = tmp10 = tmp0 + tmp3 */ + sub r9, r12, r10 /* r9 = tmp13 = tmp0 - tmp3 */ + add r10, r14, r11 /* r10 = tmp11 = tmp1 + tmp2 */ + sub r11, r14, r11 /* r11 = tmp12 = tmp1 - tmp2 */ + stmdb sp, { r8-r11 } /* tmp10 tmp13 tmp11 tmp12, parked just below sp (no writeback) */ + mov r4, r4, asr #16 /* r4 = tmp3 = d1 */ + mov r5, r5, asr #16 /* r5 = tmp2 = d3 */ + mov r6, r6, asr #16 /* r6 = tmp1 = d5 */ + mov r7, r7, asr #16 /* r7 = tmp0 = d7 */ + ldr r10, =9633 + ldr r11, =-16069 + add r12, r5, 
r7 /* r12 = z3 = tmp0 + tmp2 */ + add r14, r4, r6 /* r14 = z4 = tmp1 + tmp3 */ + add r9, r12, r14 /* r9 = z3 + z4 */ + mul r9, r10, r9 /* r9 = z5 = (z3 + z4) * 9633 */ + ldr r10, =-3196 + mla r12, r11, r12, r9 /* r12 = z3 = z5 - z3 * 16069 */ + ldr r11, =-7373 + mla r14, r10, r14, r9 /* r14 = z4 = z5 - z4 * 3196 */ + ldr r10, =2446 + add r9, r4, r7 /* r9 = tmp0 + tmp3 */ + mla r8, r11, r9, r12 /* r8 = z1 + z3 */ + mla r9, r11, r9, r14 /* r9 = z1 + z4 */ + ldr r11, =12299 + mla r7, r10, r7, r8 /* r7 = tmp0 = z1 + z3 + tmp0 * 2446 */ + ldr r10, =-20995 + mla r4, r11, r4, r9 /* r4 = tmp3 = z1 + z4 + tmp3 * 12299 */ + ldr r11, =25172 + add r9, r5, r6 /* r9 = tmp1 + tmp2 */ + mla r12, r10, r9, r12 /* r12 = z2 + z3 */ + mla r14, r10, r9, r14 /* r14 = z2 + z4 */ + ldr r10, =16819 + mla r5, r11, r5, r12 /* r5 = tmp2 = z2 + z3 + tmp2 * 25172 */ + mla r6, r10, r6, r14 /* r6 = tmp1 = z2 + z4 + tmp1 * 16819 */ + ldmdb sp, { r8-r11 } /* reload tmp10 tmp13 tmp11 tmp12 */ + add r12, r8, r4 /* o0 */ + sub r14, r8, r4 /* o7 */ + add r8, r9, r7 /* o3 */ + sub r9, r9, r7 /* o4 */ + add r4, r10, r5 /* o1 */ + sub r5, r10, r5 /* o6 */ + add r10, r11, r6 /* o2 */ + sub r11, r11, r6 /* o5 */ + /* output in order: r12 r4 r10 r8 r9 r11 r5 r14 */ +#if ARM_ARCH < 6 + mov r12, r12, asr #18 /* descale by 18 bits */ + cmp r12, #255 + mvnhi r12, r12, asr #31 /* clamp: negative -> 0, too large -> low byte 255 */ + mov r4, r4, asr #18 + cmp r4, #255 + mvnhi r4, r4, asr #31 + mov r10, r10, asr #18 + cmp r10, #255 + mvnhi r10, r10, asr #31 + mov r8, r8, asr #18 + cmp r8, #255 + mvnhi r8, r8, asr #31 + mov r9, r9, asr #18 + cmp r9, #255 + mvnhi r9, r9, asr #31 + mov r11, r11, asr #18 + cmp r11, #255 + mvnhi r11, r11, asr #31 + mov r5, r5, asr #18 + cmp r5, #255 + mvnhi r5, r5, asr #31 + mov r14, r14, asr #18 + cmp r14, #255 + mvnhi r14, r14, asr #31 +#else + usat r12, #8, r12, asr #18 /* shift right by 18 and saturate to 0..255 */ + usat r4, #8, r4, asr #18 + usat r10, #8, r10, asr #18 + usat r8, #8, r8, asr #18 + usat r9, #8, r9, asr #18 + usat r11, #8, r11, asr #18 + usat r5, #8, r5, asr #18 + usat r14, #8, r14, asr #18 +#endif +#ifdef HAVE_LCD_COLOR + strb 
r12, [r1] + strb r4, [r1, #4] + strb r10, [r1, #8] + strb r8, [r1, #12] + strb r9, [r1, #16] + strb r11, [r1, #20] + strb r5, [r1, #24] + strb r14, [r1, #28] +#else + strb r12, [r1] + strb r4, [r1, #1] + strb r10, [r1, #2] + strb r8, [r1, #3] + strb r9, [r1, #4] + strb r11, [r1, #5] + strb r5, [r1, #6] + strb r14, [r1, #7] +#endif + add r1, r1, r3 /* advance out by rowstep */ + cmp r0, r2 + bcc 1b /* loop while r0 < r2 (unsigned) */ + ldmia sp!, { r4-r11, pc } + .size jpeg_idct8h, .-jpeg_idct8h diff --git a/apps/recorder/jpeg_load.c b/apps/recorder/jpeg_load.c index fa2df5b993..5ffa4a54a0 100644 --- a/apps/recorder/jpeg_load.c +++ b/apps/recorder/jpeg_load.c @@ -382,13 +382,6 @@ static void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowst DS_OUT)); } } -#else -extern void jpeg_idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep); -extern void jpeg_idct2v(int16_t *ws, int16_t *end); -extern void jpeg_idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep); -extern void jpeg_idct4v(int16_t *ws, int16_t *end); -extern void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep); -#endif /* vertical-pass 8-point IDCT */ static void jpeg_idct8v(int16_t *ws, int16_t *end) @@ -599,6 +592,16 @@ static void jpeg_idct8h(int16_t *ws, unsigned char *out, int16_t *end, int rowst } } +#else +extern void jpeg_idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep); +extern void jpeg_idct2v(int16_t *ws, int16_t *end); +extern void jpeg_idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep); +extern void jpeg_idct4v(int16_t *ws, int16_t *end); +extern void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep); +extern void jpeg_idct8v(int16_t *ws, int16_t *end); +extern void jpeg_idct8h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep); +#endif + #ifdef HAVE_LCD_COLOR /* vertical-pass 16-point IDCT */ static void jpeg_idct16v(int16_t *ws, int16_t *end) -- cgit v1.2.3