author     Jens Arnold <amiconn@rockbox.org>  2008-11-28 23:50:22 +0000
committer  Jens Arnold <amiconn@rockbox.org>  2008-11-28 23:50:22 +0000
commit     88270f7622891810bafc9404fc5dc6a7496f3f10 (patch)
tree       40215e6c74af4f6f6b4c8f56801c810ba672becf /apps/codecs/demac/libdemac
parent     d158a6d0c9d2bcfe27b1bacff6c41bad3e1cbc22 (diff)
download   rockbox-88270f7622891810bafc9404fc5dc6a7496f3f10.tar.gz
           rockbox-88270f7622891810bafc9404fc5dc6a7496f3f10.zip
Resurrect the ARM7 16-bit packed vector addition/subtraction for ARMv5, giving a nice speedup for the higher compression levels (tested on Cowon D2).
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19260 a1c6a512-1295-4272-9138-f99709370657
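The ADDHALFREGS/SUBHALFREGS macros in the patch below emulate packed 16-bit lane arithmetic inside a 32-bit register: ARMv5E has no UADD16/USUB16-style SIMD instructions (those arrived with ARMv6), so the lanes have to be combined with shifts and masks. For orientation only, and not part of the commit, a minimal C sketch of the lane-wise operation the straight-halves macros compute (function names are illustrative, not from the patch):

    #include <stdint.h>

    /* Illustrative only: lane-wise 16-bit add/sub within one 32-bit word,
     * i.e. what the patch's ADDHALFREGS / SUBHALFREGS asm macros compute.
     * Carries and borrows never cross the halfword boundary. */
    static uint32_t add_halves(uint32_t a, uint32_t b)
    {
        uint32_t lo = (a + b) & 0x0000ffffu;            /* low lane, carry discarded  */
        uint32_t hi = ((a >> 16) + (b >> 16)) << 16;    /* high lane                  */
        return hi | lo;
    }

    static uint32_t sub_halves(uint32_t a, uint32_t b)
    {
        uint32_t lo = (a - b) & 0x0000ffffu;            /* low lane, borrow discarded */
        uint32_t hi = ((a >> 16) - (b >> 16)) << 16;    /* high lane                  */
        return hi | lo;
    }

Keeping the carry out of the neighbouring lane is exactly why the assembly needs the shift/or dance instead of a plain 32-bit add.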
Diffstat (limited to 'apps/codecs/demac/libdemac')
-rw-r--r--  apps/codecs/demac/libdemac/vector_math16_armv5te.h  244
1 file changed, 166 insertions, 78 deletions
diff --git a/apps/codecs/demac/libdemac/vector_math16_armv5te.h b/apps/codecs/demac/libdemac/vector_math16_armv5te.h
index 826aaa3f80..4f2c203f5e 100644
--- a/apps/codecs/demac/libdemac/vector_math16_armv5te.h
+++ b/apps/codecs/demac/libdemac/vector_math16_armv5te.h
@@ -24,92 +24,180 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 
 */
 
+/* This version fetches data as 32 bit words, and *requires* v1 to be
+ * 32 bit aligned, otherwise it will result either in a data abort, or
+ * incorrect results (if ARM aligncheck is disabled). */
 static inline void vector_add(int16_t* v1, int16_t* v2)
 {
-#if ORDER > 32
-    int order = (ORDER >> 5);
-    while (order--)
-#endif
-    {
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
 #if ORDER > 16
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-        *v1++ += *v2++;
-#endif
-    }
+    int cnt = ORDER>>4;
+#endif
+
+#define ADDHALFREGS(sum, s1)                            /* Adds register    */ \
+        "mov     " #s1  ", " #s1  ", ror #16       \n"  /* halves straight. */ \
+        "add     r8     , " #s1  ", " #sum ", lsl #16 \n" /* Clobbers 's1'  */ \
+        "add     " #sum ", " #s1  ", " #sum ", lsr #16 \n" /* and r8.       */ \
+        "mov     " #sum ", " #sum ", lsl #16       \n"                         \
+        "orr     " #sum ", " #sum ", r8     , lsr #16 \n"
+
+#define ADDHALFXREGS(sum, s1, s2)                       /* Adds register    */ \
+        "add     " #s1  ", " #s1  ", " #sum ", lsl #16 \n" /* halves across. */ \
+        "add     " #sum ", " #s2  ", " #sum ", lsr #16 \n" /* Clobbers 's1'. */ \
+        "mov     " #sum ", " #sum ", lsl #16       \n"                         \
+        "orr     " #sum ", " #sum ", " #s1  ", lsr #16 \n"
+
+    asm volatile (
+        "tst     %[v2], #2              \n"
+        "beq     20f                    \n"
+
+    "10:                                \n"
+        "ldrh    r4, [%[v2]], #2        \n"
+        "mov     r4, r4, lsl #16        \n"
+    "1:                                 \n"
+        "ldmia   %[v1],  {r0-r3}        \n"
+        "ldmia   %[v2]!, {r5-r8}        \n"
+        ADDHALFXREGS(r0, r4, r5)
+        ADDHALFXREGS(r1, r5, r6)
+        ADDHALFXREGS(r2, r6, r7)
+        ADDHALFXREGS(r3, r7, r8)
+        "stmia   %[v1]!, {r0-r3}        \n"
+        "mov     r4, r8                 \n"
+        "ldmia   %[v1],  {r0-r3}        \n"
+        "ldmia   %[v2]!, {r5-r8}        \n"
+        ADDHALFXREGS(r0, r4, r5)
+        ADDHALFXREGS(r1, r5, r6)
+        ADDHALFXREGS(r2, r6, r7)
+        ADDHALFXREGS(r3, r7, r8)
+        "stmia   %[v1]!, {r0-r3}        \n"
+#if ORDER > 16
+        "mov     r4, r8                 \n"
+        "subs    %[cnt], %[cnt], #1     \n"
+        "bne     1b                     \n"
+#endif
+        "b       99f                    \n"
+
+    "20:                                \n"
+    "1:                                 \n"
+        "ldmia   %[v1],  {r0-r3}        \n"
+        "ldmia   %[v2]!, {r4-r7}        \n"
+        ADDHALFREGS(r0, r4)
+        ADDHALFREGS(r1, r5)
+        ADDHALFREGS(r2, r6)
+        ADDHALFREGS(r3, r7)
+        "stmia   %[v1]!, {r0-r3}        \n"
+        "ldmia   %[v1],  {r0-r3}        \n"
+        "ldmia   %[v2]!, {r4-r7}        \n"
+        ADDHALFREGS(r0, r4)
+        ADDHALFREGS(r1, r5)
+        ADDHALFREGS(r2, r6)
+        ADDHALFREGS(r3, r7)
+        "stmia   %[v1]!, {r0-r3}        \n"
+#if ORDER > 16
+        "subs    %[cnt], %[cnt], #1     \n"
+        "bne     1b                     \n"
+#endif
+
+    "99:                                \n"
+        : /* outputs */
+#if ORDER > 16
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [v2] "+r"(v2)
+        : /* inputs */
+        : /* clobbers */
+        "r0", "r1", "r2", "r3", "r4",
+        "r5", "r6", "r7", "r8", "memory"
+    );
 }
 
+/* This version fetches data as 32 bit words, and *requires* v1 to be
+ * 32 bit aligned, otherwise it will result either in a data abort, or
+ * incorrect results (if ARM aligncheck is disabled). */
 static inline void vector_sub(int16_t* v1, int16_t* v2)
 {
-#if ORDER > 32
-    int order = (ORDER >> 5);
-    while (order--)
-#endif
-    {
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
 #if ORDER > 16
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-        *v1++ -= *v2++;
-#endif
-    }
+    int cnt = ORDER>>4;
+#endif
+
+#define SUBHALFREGS(dif, s1)                            /* Subtracts register */ \
+        "sub     r8  , " #dif ", " #s1 "           \n"  /* halves straight.   */ \
+        "and     r8  , r8  , r9                    \n"  /* Needs r9 = 0x0000ffff, */ \
+        "mov     " #dif ", " #dif ", lsr #16       \n"  /* clobbers r8.       */ \
+        "sub     " #dif ", " #dif ", " #s1 ", lsr #16 \n"                         \
+        "orr     " #dif ", r8  , " #dif ", lsl #16 \n"
+
+#define SUBHALFXREGS(dif, s1, s2)                       /* Subtracts register */ \
+        "sub     " #s1 ", " #dif ", " #s1 ", lsr #16 \n" /* halves across.     */ \
+        "and     " #s1 ", " #s1 ", r9              \n"  /* Needs r9 = 0x0000ffff, */ \
+        "rsb     " #dif ", " #s2 ", " #dif ", lsr #16 \n" /* clobbers 's1'.    */ \
+        "orr     " #dif ", " #s1 ", " #dif ", lsl #16 \n"
+
+    asm volatile (
+        "mov     r9, #0xff              \n"
+        "orr     r9, r9, #0xff00        \n"
+        "tst     %[v2], #2              \n"
+        "beq     20f                    \n"
+
+    "10:                                \n"
+        "ldrh    r4, [%[v2]], #2        \n"
+        "mov     r4, r4, lsl #16        \n"
+    "1:                                 \n"
+        "ldmia   %[v1],  {r0-r3}        \n"
+        "ldmia   %[v2]!, {r5-r8}        \n"
+        SUBHALFXREGS(r0, r4, r5)
+        SUBHALFXREGS(r1, r5, r6)
+        SUBHALFXREGS(r2, r6, r7)
+        SUBHALFXREGS(r3, r7, r8)
+        "stmia   %[v1]!, {r0-r3}        \n"
+        "mov     r4, r8                 \n"
+        "ldmia   %[v1],  {r0-r3}        \n"
+        "ldmia   %[v2]!, {r5-r8}        \n"
+        SUBHALFXREGS(r0, r4, r5)
+        SUBHALFXREGS(r1, r5, r6)
+        SUBHALFXREGS(r2, r6, r7)
+        SUBHALFXREGS(r3, r7, r8)
+        "stmia   %[v1]!, {r0-r3}        \n"
+#if ORDER > 16
+        "mov     r4, r8                 \n"
+        "subs    %[cnt], %[cnt], #1     \n"
+        "bne     1b                     \n"
+#endif
+        "b       99f                    \n"
+
+    "20:                                \n"
+    "1:                                 \n"
+        "ldmia   %[v1],  {r0-r3}        \n"
+        "ldmia   %[v2]!, {r4-r7}        \n"
+        SUBHALFREGS(r0, r4)
+        SUBHALFREGS(r1, r5)
+        SUBHALFREGS(r2, r6)
+        SUBHALFREGS(r3, r7)
+        "stmia   %[v1]!, {r0-r3}        \n"
+        "ldmia   %[v1],  {r0-r3}        \n"
+        "ldmia   %[v2]!, {r4-r7}        \n"
+        SUBHALFREGS(r0, r4)
+        SUBHALFREGS(r1, r5)
+        SUBHALFREGS(r2, r6)
+        SUBHALFREGS(r3, r7)
+        "stmia   %[v1]!, {r0-r3}        \n"
+#if ORDER > 16
+        "subs    %[cnt], %[cnt], #1     \n"
+        "bne     1b                     \n"
+#endif
+
+    "99:                                \n"
+        : /* outputs */
+#if ORDER > 16
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [v2] "+r"(v2)
+        : /* inputs */
+        : /* clobbers */
+        "r0", "r1", "r2", "r3", "r4", "r5",
+        "r6", "r7", "r8", "r9", "memory"
+    );
 }
 
 /* This version fetches data as 32 bit words, and *requires* v1 to be
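Each asm block above opens with "tst %[v2], #2": the 20: path handles a 32-bit-aligned v2 with the straight-halves macros, while the 10: path pre-loads one halfword so that every later 32-bit load from v2 straddles two output words; the *XREGS macros then combine halves "across" word boundaries, passing the leftover halfword from one load block to the next in r4. A purely illustrative C sketch of one such cross step (names are mine, not from the patch), assuming little-endian halfword order:

    #include <stdint.h>

    /* Illustrative only: one "cross" step in the spirit of ADDHALFXREGS.
     * 'carry_hw' is the leftover v2 halfword from the previous word (the
     * role r4/r5..r8 play in the asm); the high half of the current v2
     * word becomes the leftover for the next step. */
    static uint32_t add_halves_cross(uint32_t v1_word, uint32_t v2_word,
                                     uint16_t *carry_hw)
    {
        uint32_t lo = ((v1_word & 0xffffu) + *carry_hw) & 0xffffu;          /* low lane  */
        uint32_t hi = (((v1_word >> 16) + (v2_word & 0xffffu)) & 0xffffu) << 16; /* high lane */
        *carry_hw = (uint16_t)(v2_word >> 16);   /* pairs with the next v1 word */
        return hi | lo;
    }

vector_sub additionally keeps the constant 0x0000ffff in r9 so the borrow that propagates out of the low lane can be masked off before the two lanes are recombined.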