diff options
author | Jens Arnold <amiconn@rockbox.org> | 2008-11-28 23:50:22 +0000 |
---|---|---|
committer | Jens Arnold <amiconn@rockbox.org> | 2008-11-28 23:50:22 +0000 |
commit | 88270f7622891810bafc9404fc5dc6a7496f3f10 (patch) | |
tree | 40215e6c74af4f6f6b4c8f56801c810ba672becf /apps/codecs | |
parent | d158a6d0c9d2bcfe27b1bacff6c41bad3e1cbc22 (diff) | |
download | rockbox-88270f7622891810bafc9404fc5dc6a7496f3f10.tar.gz rockbox-88270f7622891810bafc9404fc5dc6a7496f3f10.zip |
Resurrect the ARM7 16-bit packed vector addition/subtraction for ARMv5, giving a nice speedup for the higher compression levels (tested on Cowon D2).
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19260 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/codecs')
-rw-r--r-- | apps/codecs/demac/libdemac/vector_math16_armv5te.h | 244 |
1 file changed, 166 insertions, 78 deletions
diff --git a/apps/codecs/demac/libdemac/vector_math16_armv5te.h b/apps/codecs/demac/libdemac/vector_math16_armv5te.h index 826aaa3f80..4f2c203f5e 100644 --- a/apps/codecs/demac/libdemac/vector_math16_armv5te.h +++ b/apps/codecs/demac/libdemac/vector_math16_armv5te.h | |||
@@ -24,92 +24,180 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA | |||
24 | 24 | ||
25 | */ | 25 | */ |
26 | 26 | ||
27 | /* This version fetches data as 32 bit words, and *requires* v1 to be | ||
28 | * 32 bit aligned, otherwise it will result either in a data abort, or | ||
29 | * incorrect results (if ARM aligncheck is disabled). */ | ||
27 | static inline void vector_add(int16_t* v1, int16_t* v2) | 30 | static inline void vector_add(int16_t* v1, int16_t* v2) |
28 | { | 31 | { |
29 | #if ORDER > 32 | ||
30 | int order = (ORDER >> 5); | ||
31 | while (order--) | ||
32 | #endif | ||
33 | { | ||
34 | *v1++ += *v2++; | ||
35 | *v1++ += *v2++; | ||
36 | *v1++ += *v2++; | ||
37 | *v1++ += *v2++; | ||
38 | *v1++ += *v2++; | ||
39 | *v1++ += *v2++; | ||
40 | *v1++ += *v2++; | ||
41 | *v1++ += *v2++; | ||
42 | *v1++ += *v2++; | ||
43 | *v1++ += *v2++; | ||
44 | *v1++ += *v2++; | ||
45 | *v1++ += *v2++; | ||
46 | *v1++ += *v2++; | ||
47 | *v1++ += *v2++; | ||
48 | *v1++ += *v2++; | ||
49 | *v1++ += *v2++; | ||
50 | #if ORDER > 16 | 32 | #if ORDER > 16 |
51 | *v1++ += *v2++; | 33 | int cnt = ORDER>>4; |
52 | *v1++ += *v2++; | 34 | #endif |
53 | *v1++ += *v2++; | 35 | |
54 | *v1++ += *v2++; | 36 | #define ADDHALFREGS(sum, s1) /* Adds register */ \ |
55 | *v1++ += *v2++; | 37 | "mov " #s1 ", " #s1 ", ror #16 \n" /* halves straight. */ \ |
56 | *v1++ += *v2++; | 38 | "add r8 , " #s1 ", " #sum ", lsl #16 \n" /* Clobbers 's1' */ \ |
57 | *v1++ += *v2++; | 39 | "add " #sum ", " #s1 ", " #sum ", lsr #16 \n" /* and r8. */ \ |
58 | *v1++ += *v2++; | 40 | "mov " #sum ", " #sum ", lsl #16 \n" \ |
59 | *v1++ += *v2++; | 41 | "orr " #sum ", " #sum ", r8 , lsr #16 \n" |
60 | *v1++ += *v2++; | 42 | |
61 | *v1++ += *v2++; | 43 | #define ADDHALFXREGS(sum, s1, s2) /* Adds register */ \ |
62 | *v1++ += *v2++; | 44 | "add " #s1 ", " #s1 ", " #sum ", lsl #16 \n" /* halves across. */ \ |
63 | *v1++ += *v2++; | 45 | "add " #sum ", " #s2 ", " #sum ", lsr #16 \n" /* Clobbers 's1'. */ \ |
64 | *v1++ += *v2++; | 46 | "mov " #sum ", " #sum ", lsl #16 \n" \ |
65 | *v1++ += *v2++; | 47 | "orr " #sum ", " #sum ", " #s1 ", lsr #16 \n" |
66 | *v1++ += *v2++; | 48 | |
67 | #endif | 49 | asm volatile ( |
68 | } | 50 | "tst %[v2], #2 \n" |
51 | "beq 20f \n" | ||
52 | |||
53 | "10: \n" | ||
54 | "ldrh r4, [%[v2]], #2 \n" | ||
55 | "mov r4, r4, lsl #16 \n" | ||
56 | "1: \n" | ||
57 | "ldmia %[v1], {r0-r3} \n" | ||
58 | "ldmia %[v2]!, {r5-r8} \n" | ||
59 | ADDHALFXREGS(r0, r4, r5) | ||
60 | ADDHALFXREGS(r1, r5, r6) | ||
61 | ADDHALFXREGS(r2, r6, r7) | ||
62 | ADDHALFXREGS(r3, r7, r8) | ||
63 | "stmia %[v1]!, {r0-r3} \n" | ||
64 | "mov r4, r8 \n" | ||
65 | "ldmia %[v1], {r0-r3} \n" | ||
66 | "ldmia %[v2]!, {r5-r8} \n" | ||
67 | ADDHALFXREGS(r0, r4, r5) | ||
68 | ADDHALFXREGS(r1, r5, r6) | ||
69 | ADDHALFXREGS(r2, r6, r7) | ||
70 | ADDHALFXREGS(r3, r7, r8) | ||
71 | "stmia %[v1]!, {r0-r3} \n" | ||
72 | #if ORDER > 16 | ||
73 | "mov r4, r8 \n" | ||
74 | "subs %[cnt], %[cnt], #1 \n" | ||
75 | "bne 1b \n" | ||
76 | #endif | ||
77 | "b 99f \n" | ||
78 | |||
79 | "20: \n" | ||
80 | "1: \n" | ||
81 | "ldmia %[v1], {r0-r3} \n" | ||
82 | "ldmia %[v2]!, {r4-r7} \n" | ||
83 | ADDHALFREGS(r0, r4) | ||
84 | ADDHALFREGS(r1, r5) | ||
85 | ADDHALFREGS(r2, r6) | ||
86 | ADDHALFREGS(r3, r7) | ||
87 | "stmia %[v1]!, {r0-r3} \n" | ||
88 | "ldmia %[v1], {r0-r3} \n" | ||
89 | "ldmia %[v2]!, {r4-r7} \n" | ||
90 | ADDHALFREGS(r0, r4) | ||
91 | ADDHALFREGS(r1, r5) | ||
92 | ADDHALFREGS(r2, r6) | ||
93 | ADDHALFREGS(r3, r7) | ||
94 | "stmia %[v1]!, {r0-r3} \n" | ||
95 | #if ORDER > 16 | ||
96 | "subs %[cnt], %[cnt], #1 \n" | ||
97 | "bne 1b \n" | ||
98 | #endif | ||
99 | |||
100 | "99: \n" | ||
101 | : /* outputs */ | ||
102 | #if ORDER > 16 | ||
103 | [cnt]"+r"(cnt), | ||
104 | #endif | ||
105 | [v1] "+r"(v1), | ||
106 | [v2] "+r"(v2) | ||
107 | : /* inputs */ | ||
108 | : /* clobbers */ | ||
109 | "r0", "r1", "r2", "r3", "r4", | ||
110 | "r5", "r6", "r7", "r8", "memory" | ||
111 | ); | ||
69 | } | 112 | } |
70 | 113 | ||
114 | /* This version fetches data as 32 bit words, and *requires* v1 to be | ||
115 | * 32 bit aligned, otherwise it will result either in a data abort, or | ||
116 | * incorrect results (if ARM aligncheck is disabled). */ | ||
71 | static inline void vector_sub(int16_t* v1, int16_t* v2) | 117 | static inline void vector_sub(int16_t* v1, int16_t* v2) |
72 | { | 118 | { |
73 | #if ORDER > 32 | ||
74 | int order = (ORDER >> 5); | ||
75 | while (order--) | ||
76 | #endif | ||
77 | { | ||
78 | *v1++ -= *v2++; | ||
79 | *v1++ -= *v2++; | ||
80 | *v1++ -= *v2++; | ||
81 | *v1++ -= *v2++; | ||
82 | *v1++ -= *v2++; | ||
83 | *v1++ -= *v2++; | ||
84 | *v1++ -= *v2++; | ||
85 | *v1++ -= *v2++; | ||
86 | *v1++ -= *v2++; | ||
87 | *v1++ -= *v2++; | ||
88 | *v1++ -= *v2++; | ||
89 | *v1++ -= *v2++; | ||
90 | *v1++ -= *v2++; | ||
91 | *v1++ -= *v2++; | ||
92 | *v1++ -= *v2++; | ||
93 | *v1++ -= *v2++; | ||
94 | #if ORDER > 16 | 119 | #if ORDER > 16 |
95 | *v1++ -= *v2++; | 120 | int cnt = ORDER>>4; |
96 | *v1++ -= *v2++; | 121 | #endif |
97 | *v1++ -= *v2++; | 122 | |
98 | *v1++ -= *v2++; | 123 | #define SUBHALFREGS(dif, s1) /* Subtracts register */ \ |
99 | *v1++ -= *v2++; | 124 | "sub r8 , " #dif ", " #s1 "\n" /* halves straight. */ \ |
100 | *v1++ -= *v2++; | 125 | "and r8 , r8 , r9 \n" /* Needs r9 = 0x0000ffff, */ \ |
101 | *v1++ -= *v2++; | 126 | "mov " #dif ", " #dif ", lsr #16 \n" /* clobbers r8. */ \ |
102 | *v1++ -= *v2++; | 127 | "sub " #dif ", " #dif ", " #s1 ", lsr #16 \n" \ |
103 | *v1++ -= *v2++; | 128 | "orr " #dif ", r8 , " #dif ", lsl #16 \n" |
104 | *v1++ -= *v2++; | 129 | |
105 | *v1++ -= *v2++; | 130 | #define SUBHALFXREGS(dif, s1, s2) /* Subtracts register */ \ |
106 | *v1++ -= *v2++; | 131 | "sub " #s1 ", " #dif ", " #s1 ", lsr #16 \n" /* halves across. */ \ |
107 | *v1++ -= *v2++; | 132 | "and " #s1 ", " #s1 ", r9 \n" /* Needs r9 = 0x0000ffff, */ \ |
108 | *v1++ -= *v2++; | 133 | "rsb " #dif ", " #s2 ", " #dif ", lsr #16 \n" /* clobbers 's1'. */ \ |
109 | *v1++ -= *v2++; | 134 | "orr " #dif ", " #s1 ", " #dif ", lsl #16 \n" |
110 | *v1++ -= *v2++; | 135 | |
111 | #endif | 136 | asm volatile ( |
112 | } | 137 | "mov r9, #0xff \n" |
138 | "orr r9, r9, #0xff00 \n" | ||
139 | "tst %[v2], #2 \n" | ||
140 | "beq 20f \n" | ||
141 | |||
142 | "10: \n" | ||
143 | "ldrh r4, [%[v2]], #2 \n" | ||
144 | "mov r4, r4, lsl #16 \n" | ||
145 | "1: \n" | ||
146 | "ldmia %[v1], {r0-r3} \n" | ||
147 | "ldmia %[v2]!, {r5-r8} \n" | ||
148 | SUBHALFXREGS(r0, r4, r5) | ||
149 | SUBHALFXREGS(r1, r5, r6) | ||
150 | SUBHALFXREGS(r2, r6, r7) | ||
151 | SUBHALFXREGS(r3, r7, r8) | ||
152 | "stmia %[v1]!, {r0-r3} \n" | ||
153 | "mov r4, r8 \n" | ||
154 | "ldmia %[v1], {r0-r3} \n" | ||
155 | "ldmia %[v2]!, {r5-r8} \n" | ||
156 | SUBHALFXREGS(r0, r4, r5) | ||
157 | SUBHALFXREGS(r1, r5, r6) | ||
158 | SUBHALFXREGS(r2, r6, r7) | ||
159 | SUBHALFXREGS(r3, r7, r8) | ||
160 | "stmia %[v1]!, {r0-r3} \n" | ||
161 | #if ORDER > 16 | ||
162 | "mov r4, r8 \n" | ||
163 | "subs %[cnt], %[cnt], #1 \n" | ||
164 | "bne 1b \n" | ||
165 | #endif | ||
166 | "b 99f \n" | ||
167 | |||
168 | "20: \n" | ||
169 | "1: \n" | ||
170 | "ldmia %[v1], {r0-r3} \n" | ||
171 | "ldmia %[v2]!, {r4-r7} \n" | ||
172 | SUBHALFREGS(r0, r4) | ||
173 | SUBHALFREGS(r1, r5) | ||
174 | SUBHALFREGS(r2, r6) | ||
175 | SUBHALFREGS(r3, r7) | ||
176 | "stmia %[v1]!, {r0-r3} \n" | ||
177 | "ldmia %[v1], {r0-r3} \n" | ||
178 | "ldmia %[v2]!, {r4-r7} \n" | ||
179 | SUBHALFREGS(r0, r4) | ||
180 | SUBHALFREGS(r1, r5) | ||
181 | SUBHALFREGS(r2, r6) | ||
182 | SUBHALFREGS(r3, r7) | ||
183 | "stmia %[v1]!, {r0-r3} \n" | ||
184 | #if ORDER > 16 | ||
185 | "subs %[cnt], %[cnt], #1 \n" | ||
186 | "bne 1b \n" | ||
187 | #endif | ||
188 | |||
189 | "99: \n" | ||
190 | : /* outputs */ | ||
191 | #if ORDER > 16 | ||
192 | [cnt]"+r"(cnt), | ||
193 | #endif | ||
194 | [v1] "+r"(v1), | ||
195 | [v2] "+r"(v2) | ||
196 | : /* inputs */ | ||
197 | : /* clobbers */ | ||
198 | "r0", "r1", "r2", "r3", "r4", "r5", | ||
199 | "r6", "r7", "r8", "r9", "memory" | ||
200 | ); | ||
113 | } | 201 | } |
114 | 202 | ||
115 | /* This version fetches data as 32 bit words, and *requires* v1 to be | 203 | /* This version fetches data as 32 bit words, and *requires* v1 to be |