summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r--apps/recorder/jpeg_idct_arm.S149
-rw-r--r--apps/recorder/jpeg_load.c10
2 files changed, 143 insertions, 16 deletions
diff --git a/apps/recorder/jpeg_idct_arm.S b/apps/recorder/jpeg_idct_arm.S
index 2ef868e753..d46843ff12 100644
--- a/apps/recorder/jpeg_idct_arm.S
+++ b/apps/recorder/jpeg_idct_arm.S
@@ -25,11 +25,140 @@
25 25
26 .section .text 26 .section .text
27 .align 2 27 .align 2
28 .global jpeg_idct1h
29 .type jpeg_idct1h, %function
30 .global jpeg_idct2v
31 .type jpeg_idct2v, %function
32 .global jpeg_idct2h
33 .type jpeg_idct2h, %function
28 .global jpeg_idct4v 34 .global jpeg_idct4v
29 .type jpeg_idct4v, %function 35 .type jpeg_idct4v, %function
30 .global jpeg_idct4h 36 .global jpeg_idct4h
31 .type jpeg_idct4h, %function 37 .type jpeg_idct4h, %function
32 38
39jpeg_idct1h:
/* Horizontal-pass 1-point IDCT.
   r0 = input coefficients (int16_t *, advanced 16 bytes per iteration)
   r1 = output pixel pointer (one byte stored per iteration)
   r2 = end pointer for r0 (loop runs while r0 < r2)
   r3 = output row step in bytes
   Each sample is biased by 4096+16 = 4112, descaled by >>5, and saturated
   to the 0..255 byte range before being stored. */
40/* In the common case of one pass through the loop, the extra add should be
41 cheaper than saving registers to stack and loading the value 4112. */
421:
43 ldrsh r12, [r0]              @ load one signed 16-bit coefficient
44 add r12, r12, #4096          @ bias: +4096 ...
45 add r12, r12, #16            @ ... +16 (total 4112: level shift + rounding for >>5)
46#if ARM_ARCH < 6
47 mov r12, r12, asr #5         @ descale
48 cmp r12, #255                @ unsigned-hi also catches negatives ...
49 mvnhi r12, r12, asr #31      @ ... clamp: <0 -> 0x00, >255 -> 0xFF (low byte)
50#else
51 usat r12, #8, r12, asr #5    @ descale and saturate to 8 bits in one op
52#endif
53 strb r12, [r1]               @ store output pixel byte
54 add r0, r0, #16              @ advance input 16 bytes to the next row
55 add r1, r1, r3               @ advance output by the row step
56 cmp r0, r2
57 bcc 1b                       @ loop while r0 < end
58 bx lr
59 .size jpeg_idct1h, .-jpeg_idct1h
60
61jpeg_idct2v:
/* Vertical-pass 2-point IDCT (butterfly: row0' = row0 + row1,
   row1' = row0 - row1), computed in place on 16-bit values.
   r0 = workspace pointer (int16_t *), rows are 16 bytes apart
   r1 = end pointer for r0 (loop runs while r0 < r1)
   Two adjacent 16-bit columns are processed per 32-bit word. */
62#if ARM_ARCH < 6
63/* Use SWAR tricks to fake partitioned add and subtract. This is slightly faster
64 than loading two values in each register and using shifts and strh, and
65 requires fewer fixup operations than splitting the values, calculating, and
66 merging.
67*/
68 stmdb sp!, { r4, lr }
691:
70 ldr r2, [r0]                 @ r2 = two int16 samples from row 0
71 ldr r3, [r0, #16]            @ r3 = two int16 samples from row 1
72 eor r12, r2, r3              @ r12 bit 15 = carry-correction bit ...
73 and r12, r12, #0x8000        @ ... for the halfword boundary
74 bic r3, r3, #0x8000          @ clear bit 15 so add cannot carry into the
75 bic r4, r2, #0x8000          @ upper halfword
76 add r4, r4, r3               @ partitioned add (low halves, high halves)
77 eor r4, r4, r12              @ restore the true bit-15 sum
78 orr r2, r2, #0x8000          @ set guard bit so subtract cannot borrow across
79 sub r2, r2, r3               @ partitioned subtract
80 eor r2, r2, r12              @ fix bit 15 ...
81 eor r2, r2, #0x8000          @ ... and undo the guard bit
82 str r4, [r0]                 @ row 0 <- sum
83 str r2, [r0, #16]            @ row 1 <- difference
84 add r0, r0, #4               @ next pair of columns
85 cmp r0, r1
86 bcc 1b                       @ loop while r0 < end
87 ldmia sp!, { r4, pc }
88#else
89/* ARMv6 offers partitioned adds and subtracts, used here to unroll the loop
90 to two columns.
91*/
921:
93 ldr r2, [r0]                 @ two int16 samples from row 0
94 ldr r3, [r0, #16]            @ two int16 samples from row 1
95 sadd16 r12, r2, r3           @ partitioned 16-bit add
96 ssub16 r2, r2, r3            @ partitioned 16-bit subtract
97 str r12, [r0]                @ row 0 <- sum
98 str r2, [r0, #16]            @ row 1 <- difference
99 add r0, r0, #4               @ next pair of columns
100 cmp r0, r1
101 bcc 1b                      @ loop while r0 < end
102 bx lr
103#endif
104 .size jpeg_idct2v, .-jpeg_idct2v
105
106jpeg_idct2h:
/* Horizontal-pass 2-point IDCT with descale and output to bytes.
   r0 = input coefficients (int16_t *, advanced 16 bytes per row)
   r1 = output pixel pointer
   r2 = end pointer for r0 (loop runs while r0 < r2)
   r3 = output row step in bytes
   Per row: out[0] = clamp((a + 4112 + b) >> 5),
            out[next] = clamp((a + 4112 - b) >> 5),
   where "next" is 4 bytes away with HAVE_LCD_COLOR (assumes 4-byte pixel
   stride -- confirm against the output format) and 1 byte otherwise. */
107#if ARM_ARCH < 6
108/* Using LDR and shifts here would cost two more ops, and is no faster as
109 results cannot be stored merged.
110*/
111 stmdb sp!, { r4-r5, lr }
112 ldr r14, =4112               @ rounding + level-shift bias, loaded once
1131:
114 ldrsh r12, [r0]              @ a = first coefficient
115 ldrsh r4, [r0, #2]           @ b = second coefficient
116 add r12, r12, r14            @ a += 4112
117 add r5, r12, r4              @ r5 = a + b
118 sub r4, r12, r4              @ r4 = a - b
119 mov r5, r5, asr #5           @ descale both
120 mov r4, r4, asr #5
121 cmp r5, #255                 @ clamp r5 to 0..255 (unsigned-hi catches <0)
122 mvnhi r5, r5, asr #31
123 cmp r4, #255                 @ clamp r4 likewise
124 mvnhi r4, r4, asr #31
125#ifdef HAVE_LCD_COLOR
126 strb r5, [r1]
127 strb r4, [r1, #4]            @ second sample 4 bytes along
128#else
129 strb r5, [r1]
130 strb r4, [r1, #1]            @ second sample is the adjacent byte
131#endif
132 add r0, r0, #16              @ next input row
133 add r1, r1, r3               @ advance output by the row step
134 cmp r0, r2
135 bcc 1b                       @ loop while r0 < end
136 ldmia sp!, { r4-r5, pc }
137#else
138 stmdb sp!, { r4, lr }
139 ldr r14, =4112               @ word 0x00001010: bias lands in the low halfword
1401:
141 ldr r12, [r0]                @ low half = a, high half = b
142 sadd16 r12, r12, r14         @ low half = a + 4112 (high half unchanged)
143 saddsubx r12, r12, r12       @ high = b + (a+4112), low = (a+4112) - b
144 usat r4, #8, r12, asr #21    @ r4 = clamp((a + b + 4112) >> 5) from high half
145 sxth r12, r12                @ sign-extend the low-half difference
146 usat r12, #8, r12, asr #5    @ r12 = clamp((a - b + 4112) >> 5)
147#ifdef HAVE_LCD_COLOR
148 strb r4, [r1]
149 strb r12, [r1, #4]           @ second sample 4 bytes along
150#else
151 strb r4, [r1]
152 strb r12, [r1, #1]           @ second sample is the adjacent byte
153#endif
154 add r0, r0, #16              @ next input row
155 add r1, r1, r3               @ advance output by the row step
156 cmp r0, r2
157 bcc 1b                       @ loop while r0 < end
158 ldmia sp!, { r4, pc }
159#endif
160 .size jpeg_idct2h, .-jpeg_idct2h
161
33jpeg_idct4v: 162jpeg_idct4v:
34#if ARM_ARCH < 5 163#if ARM_ARCH < 5
35 stmdb sp!, { r4-r7, lr } 164 stmdb sp!, { r4-r7, lr }
@@ -60,8 +189,8 @@ jpeg_idct4v:
60 strh r6, [r0, #16] 189 strh r6, [r0, #16]
61 strh r2, [r0, #32] 190 strh r2, [r0, #32]
62 add r0, r0, #2 191 add r0, r0, #2
63 teq r0, r1 192 cmp r0, r1
64 bne 1b 193 bcc 1b
65 ldmia sp!, { r4-r7, pc } 194 ldmia sp!, { r4-r7, pc }
66#elif ARM_ARCH < 6 195#elif ARM_ARCH < 6
67 stmdb sp!, { r4-r8, lr } 196 stmdb sp!, { r4-r8, lr }
@@ -90,8 +219,8 @@ jpeg_idct4v:
90 strh r3, [r0, #16] 219 strh r3, [r0, #16]
91 strh r2, [r0, #32] 220 strh r2, [r0, #32]
92 add r0, r0, #2 221 add r0, r0, #2
93 teq r0, r1 222 cmp r0, r1
94 bne 1b 223 bcc 1b
95 ldmia sp!, { r4-r8, pc } 224 ldmia sp!, { r4-r8, pc }
96#else 225#else
97 stmdb sp!, { r4-r10, lr } 226 stmdb sp!, { r4-r10, lr }
@@ -192,8 +321,8 @@ jpeg_idct4h:
192#endif 321#endif
193 add r0, r0, #16 322 add r0, r0, #16
194 add r1, r1, r3 323 add r1, r1, r3
195 teq r0, r2 324 cmp r0, r2
196 bne 1b 325 bcc 1b
197 ldmia sp!, { r4-r10, pc } 326 ldmia sp!, { r4-r10, pc }
198#elif ARM_ARCH < 6 327#elif ARM_ARCH < 6
199 stmdb sp!, { r4-r10, lr } 328 stmdb sp!, { r4-r10, lr }
@@ -241,8 +370,8 @@ jpeg_idct4h:
241#endif 370#endif
242 add r0, r0, #16 371 add r0, r0, #16
243 add r1, r1, r3 372 add r1, r1, r3
244 teq r0, r2 373 cmp r0, r2
245 bne 1b 374 bcc 1b
246 ldmia sp!, { r4-r10, pc } 375 ldmia sp!, { r4-r10, pc }
247#else 376#else
248 stmdb sp!, { r4-r9, lr } 377 stmdb sp!, { r4-r9, lr }
@@ -280,8 +409,8 @@ jpeg_idct4h:
280#endif 409#endif
281 add r0, r0, #16 410 add r0, r0, #16
282 add r1, r1, r3 411 add r1, r1, r3
283 teq r0, r2 412 cmp r0, r2
284 bne 1b 413 bcc 1b
285 ldmia sp!, { r4-r9, pc } 414 ldmia sp!, { r4-r9, pc }
286#endif 415#endif
287 .size jpeg_idct4h, .-jpeg_idct4h 416 .size jpeg_idct4h, .-jpeg_idct4h
diff --git a/apps/recorder/jpeg_load.c b/apps/recorder/jpeg_load.c
index f2b3b4ba74..fa2df5b993 100644
--- a/apps/recorder/jpeg_load.c
+++ b/apps/recorder/jpeg_load.c
@@ -270,11 +270,7 @@ INLINE unsigned range_limit(int value)
270#define BUFAC 227 270#define BUFAC 227
271#define COMPONENT_SHIFT 15 271#define COMPONENT_SHIFT 15
272 272
273/* Some of the below have inline ASM optimizations of the loop contents. To 273#ifndef CPU_ARM
274 make comparison with the C versions easier, the C variable names are used
275 in comments whenever intermediate values are labeled.
276*/
277
278/* horizontal-pass 1-point IDCT */ 274/* horizontal-pass 1-point IDCT */
279static void jpeg_idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep) 275static void jpeg_idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
280{ 276{
@@ -312,7 +308,6 @@ static void jpeg_idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowst
312 } 308 }
313} 309}
314 310
315#ifndef CPU_ARM
316/* vertical-pass 4-point IDCT */ 311/* vertical-pass 4-point IDCT */
317static void jpeg_idct4v(int16_t *ws, int16_t *end) 312static void jpeg_idct4v(int16_t *ws, int16_t *end)
318{ 313{
@@ -388,6 +383,9 @@ static void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowst
388 } 383 }
389} 384}
390#else 385#else
386extern void jpeg_idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
387extern void jpeg_idct2v(int16_t *ws, int16_t *end);
388extern void jpeg_idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
391extern void jpeg_idct4v(int16_t *ws, int16_t *end); 389extern void jpeg_idct4v(int16_t *ws, int16_t *end);
392extern void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep); 390extern void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
393#endif 391#endif