diff options
author | Andrew Mahone <andrew.mahone@gmail.com> | 2009-06-19 08:26:05 +0000 |
---|---|---|
committer | Andrew Mahone <andrew.mahone@gmail.com> | 2009-06-19 08:26:05 +0000 |
commit | 498ad469c9a6cab6843bacb0126afee2219fa2e5 (patch) | |
tree | 754dc004a564d05d2f6c0c467ccd14d078e899d0 /apps/recorder/jpeg_idct_arm.S | |
parent | 895357be999ffdeefac83d0b9296c91171b05cd0 (diff) | |
download | rockbox-498ad469c9a6cab6843bacb0126afee2219fa2e5.tar.gz rockbox-498ad469c9a6cab6843bacb0126afee2219fa2e5.zip |
2-point and 1-point JPEG IDCT ARM assembly, remove comment in jpeg_load.c about inline asm, change loop condition to be a bit safer in case of bad values being passed.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21349 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/recorder/jpeg_idct_arm.S')
-rw-r--r-- | apps/recorder/jpeg_idct_arm.S | 149 |
1 files changed, 139 insertions, 10 deletions
diff --git a/apps/recorder/jpeg_idct_arm.S b/apps/recorder/jpeg_idct_arm.S index 2ef868e753..d46843ff12 100644 --- a/apps/recorder/jpeg_idct_arm.S +++ b/apps/recorder/jpeg_idct_arm.S | |||
@@ -25,11 +25,140 @@ | |||
25 | 25 | ||
26 | .section .text | 26 | .section .text |
27 | .align 2 | 27 | .align 2 |
28 | .global jpeg_idct1h | ||
29 | .type jpeg_idct1h, %function | ||
30 | .global jpeg_idct2v | ||
31 | .type jpeg_idct2v, %function | ||
32 | .global jpeg_idct2h | ||
33 | .type jpeg_idct2h, %function | ||
28 | .global jpeg_idct4v | 34 | .global jpeg_idct4v |
29 | .type jpeg_idct4v, %function | 35 | .type jpeg_idct4v, %function |
30 | .global jpeg_idct4h | 36 | .global jpeg_idct4h |
31 | .type jpeg_idct4h, %function | 37 | .type jpeg_idct4h, %function |
32 | 38 | ||
39 | jpeg_idct1h: | ||
40 | /* 1-point horizontal IDCT pass. Each iteration loads one signed halfword from [r0], adds 4112 (as 4096 + 16, i.e. 128 << 5 plus half of 1 << 5), shifts right by 5, clamps to 0..255 and stores the low byte at [r1]; r0 then advances by 16 and r1 by r3. The body always runs once and repeats while r0 < r2 (unsigned, bcc). Before ARMv6 the clamp is cmp/mvnhi: a value above 255 as unsigned is replaced by NOT(value asr 31), which is 0 for negatives and all ones (byte 0xff) otherwise; ARMv6 uses usat. Presumably r0 = coefficients, r1 = output pixels, r2 = end of input, r3 = output stride; confirm against the C prototype. Only r12 and the flags are used as scratch. In the common case of one pass through the loop, the extra add should be | ||
41 | cheaper than saving registers to the stack and loading the value 4112. */ | ||
42 | 1: | ||
43 | ldrsh r12, [r0] | ||
44 | add r12, r12, #4096 | ||
45 | add r12, r12, #16 | ||
46 | #if ARM_ARCH < 6 | ||
47 | mov r12, r12, asr #5 | ||
48 | cmp r12, #255 | ||
49 | mvnhi r12, r12, asr #31 | ||
50 | #else | ||
51 | usat r12, #8, r12, asr #5 | ||
52 | #endif | ||
53 | strb r12, [r1] | ||
54 | add r0, r0, #16 | ||
55 | add r1, r1, r3 | ||
56 | cmp r0, r2 | ||
57 | bcc 1b | ||
58 | bx lr | ||
59 | .size jpeg_idct1h, .-jpeg_idct1h | ||
60 | |||
61 | jpeg_idct2v: | ||
62 | #if ARM_ARCH < 6 | ||
63 | /* Use SWAR tricks to fake partitioned add and subtract. This is slightly faster | ||
64 | than loading two values in each register and using shifts and strh, and | ||
65 | requires fewer fixup operations than splitting the values, calculating, and | ||
66 | merging. Each word loaded from [r0] (a) and [r0, #16] (b) is treated as two 16-bit lanes. For the sum, bit 15 is cleared in both operands so no carry leaves the low lane; for the difference, bit 15 is set in a and cleared in b so no borrow reaches the high lane. The true bit 15 of each result is then restored by XOR with r12 = (a ^ b) & 0x8000 (plus a further XOR with 0x8000 for the difference). The sum is stored back to [r0] and the difference to [r0, #16]; r0 advances by 4 and the loop repeats while r0 < r1 (unsigned). Presumably r0 = coefficient buffer and r1 = end pointer; confirm against the C prototype. r4 is saved and restored on the stack; r2, r3 and r12 are scratch. | ||
67 | */ | ||
68 | stmdb sp!, { r4, lr } | ||
69 | 1: | ||
70 | ldr r2, [r0] | ||
71 | ldr r3, [r0, #16] | ||
72 | eor r12, r2, r3 | ||
73 | and r12, r12, #0x8000 | ||
74 | bic r3, r3, #0x8000 | ||
75 | bic r4, r2, #0x8000 | ||
76 | add r4, r4, r3 | ||
77 | eor r4, r4, r12 | ||
78 | orr r2, r2, #0x8000 | ||
79 | sub r2, r2, r3 | ||
80 | eor r2, r2, r12 | ||
81 | eor r2, r2, #0x8000 | ||
82 | str r4, [r0] | ||
83 | str r2, [r0, #16] | ||
84 | add r0, r0, #4 | ||
85 | cmp r0, r1 | ||
86 | bcc 1b | ||
87 | ldmia sp!, { r4, pc } | ||
88 | #else | ||
89 | /* ARMv6 offers partitioned adds and subtracts, used here to unroll the loop | ||
90 | to two columns. sadd16 and ssub16 form the per-halfword sum and difference of the words at [r0] and [r0, #16]; the sum is written back to [r0] and the difference to [r0, #16]. r0 advances by 4 while r0 < r1 (unsigned). No stack is used on this path. | ||
91 | */ | ||
92 | 1: | ||
93 | ldr r2, [r0] | ||
94 | ldr r3, [r0, #16] | ||
95 | sadd16 r12, r2, r3 | ||
96 | ssub16 r2, r2, r3 | ||
97 | str r12, [r0] | ||
98 | str r2, [r0, #16] | ||
99 | add r0, r0, #4 | ||
100 | cmp r0, r1 | ||
101 | bcc 1b | ||
102 | bx lr | ||
103 | #endif | ||
104 | .size jpeg_idct2v, .-jpeg_idct2v | ||
105 | |||
106 | jpeg_idct2h: | ||
107 | #if ARM_ARCH < 6 | ||
108 | /* 2-point horizontal IDCT pass. Each iteration loads two signed halfwords a = [r0] and b = [r0, #2], forms (a + 4112 + b) >> 5 and (a + 4112 - b) >> 5, clamps each to 0..255 (cmp/mvnhi: out-of-range values become 0 if negative, else all ones) and stores the sum byte at [r1] and the difference byte at [r1, #4] when HAVE_LCD_COLOR is defined, otherwise at [r1, #1]. r0 then advances by 16 and r1 by r3, repeating while r0 < r2 (unsigned). r14 holds the constant 4112 for the whole loop, which is why lr is saved. The ARMv6 path after the #else adds 4112 to the low halfword with sadd16, forms sum and difference in one saddsubx, and clamps with usat. Presumably r0 = coefficients, r1 = output pixels, r2 = end of input, r3 = output stride; confirm against the C prototype. Using LDR and shifts here would cost two more ops, and is no faster as | ||
109 | results cannot be stored merged. | ||
110 | */ | ||
111 | stmdb sp!, { r4-r5, lr } | ||
112 | ldr r14, =4112 | ||
113 | 1: | ||
114 | ldrsh r12, [r0] | ||
115 | ldrsh r4, [r0, #2] | ||
116 | add r12, r12, r14 | ||
117 | add r5, r12, r4 | ||
118 | sub r4, r12, r4 | ||
119 | mov r5, r5, asr #5 | ||
120 | mov r4, r4, asr #5 | ||
121 | cmp r5, #255 | ||
122 | mvnhi r5, r5, asr #31 | ||
123 | cmp r4, #255 | ||
124 | mvnhi r4, r4, asr #31 | ||
125 | #ifdef HAVE_LCD_COLOR | ||
126 | strb r5, [r1] | ||
127 | strb r4, [r1, #4] | ||
128 | #else | ||
129 | strb r5, [r1] | ||
130 | strb r4, [r1, #1] | ||
131 | #endif | ||
132 | add r0, r0, #16 | ||
133 | add r1, r1, r3 | ||
134 | cmp r0, r2 | ||
135 | bcc 1b | ||
136 | ldmia sp!, { r4-r5, pc } | ||
137 | #else | ||
138 | stmdb sp!, { r4, lr } | ||
139 | ldr r14, =4112 | ||
140 | 1: | ||
141 | ldr r12, [r0] | ||
142 | sadd16 r12, r12, r14 | ||
143 | saddsubx r12, r12, r12 | ||
144 | usat r4, #8, r12, asr #21 | ||
145 | sxth r12, r12 | ||
146 | usat r12, #8, r12, asr #5 | ||
147 | #ifdef HAVE_LCD_COLOR | ||
148 | strb r4, [r1] | ||
149 | strb r12, [r1, #4] | ||
150 | #else | ||
151 | strb r4, [r1] | ||
152 | strb r12, [r1, #1] | ||
153 | #endif | ||
154 | add r0, r0, #16 | ||
155 | add r1, r1, r3 | ||
156 | cmp r0, r2 | ||
157 | bcc 1b | ||
158 | ldmia sp!, { r4, pc } | ||
159 | #endif | ||
160 | .size jpeg_idct2h, .-jpeg_idct2h | ||
161 | |||
33 | jpeg_idct4v: | 162 | jpeg_idct4v: |
34 | #if ARM_ARCH < 5 | 163 | #if ARM_ARCH < 5 |
35 | stmdb sp!, { r4-r7, lr } | 164 | stmdb sp!, { r4-r7, lr } |
@@ -60,8 +189,8 @@ jpeg_idct4v: | |||
60 | strh r6, [r0, #16] | 189 | strh r6, [r0, #16] |
61 | strh r2, [r0, #32] | 190 | strh r2, [r0, #32] |
62 | add r0, r0, #2 | 191 | add r0, r0, #2 |
63 | teq r0, r1 | 192 | cmp r0, r1 |
64 | bne 1b | 193 | bcc 1b |
65 | ldmia sp!, { r4-r7, pc } | 194 | ldmia sp!, { r4-r7, pc } |
66 | #elif ARM_ARCH < 6 | 195 | #elif ARM_ARCH < 6 |
67 | stmdb sp!, { r4-r8, lr } | 196 | stmdb sp!, { r4-r8, lr } |
@@ -90,8 +219,8 @@ jpeg_idct4v: | |||
90 | strh r3, [r0, #16] | 219 | strh r3, [r0, #16] |
91 | strh r2, [r0, #32] | 220 | strh r2, [r0, #32] |
92 | add r0, r0, #2 | 221 | add r0, r0, #2 |
93 | teq r0, r1 | 222 | cmp r0, r1 |
94 | bne 1b | 223 | bcc 1b |
95 | ldmia sp!, { r4-r8, pc } | 224 | ldmia sp!, { r4-r8, pc } |
96 | #else | 225 | #else |
97 | stmdb sp!, { r4-r10, lr } | 226 | stmdb sp!, { r4-r10, lr } |
@@ -192,8 +321,8 @@ jpeg_idct4h: | |||
192 | #endif | 321 | #endif |
193 | add r0, r0, #16 | 322 | add r0, r0, #16 |
194 | add r1, r1, r3 | 323 | add r1, r1, r3 |
195 | teq r0, r2 | 324 | cmp r0, r2 |
196 | bne 1b | 325 | bcc 1b |
197 | ldmia sp!, { r4-r10, pc } | 326 | ldmia sp!, { r4-r10, pc } |
198 | #elif ARM_ARCH < 6 | 327 | #elif ARM_ARCH < 6 |
199 | stmdb sp!, { r4-r10, lr } | 328 | stmdb sp!, { r4-r10, lr } |
@@ -241,8 +370,8 @@ jpeg_idct4h: | |||
241 | #endif | 370 | #endif |
242 | add r0, r0, #16 | 371 | add r0, r0, #16 |
243 | add r1, r1, r3 | 372 | add r1, r1, r3 |
244 | teq r0, r2 | 373 | cmp r0, r2 |
245 | bne 1b | 374 | bcc 1b |
246 | ldmia sp!, { r4-r10, pc } | 375 | ldmia sp!, { r4-r10, pc } |
247 | #else | 376 | #else |
248 | stmdb sp!, { r4-r9, lr } | 377 | stmdb sp!, { r4-r9, lr } |
@@ -280,8 +409,8 @@ jpeg_idct4h: | |||
280 | #endif | 409 | #endif |
281 | add r0, r0, #16 | 410 | add r0, r0, #16 |
282 | add r1, r1, r3 | 411 | add r1, r1, r3 |
283 | teq r0, r2 | 412 | cmp r0, r2 |
284 | bne 1b | 413 | bcc 1b |
285 | ldmia sp!, { r4-r9, pc } | 414 | ldmia sp!, { r4-r9, pc } |
286 | #endif | 415 | #endif |
287 | .size jpeg_idct4h, .-jpeg_idct4h | 416 | .size jpeg_idct4h, .-jpeg_idct4h |