summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJens Arnold <amiconn@rockbox.org>2008-11-19 00:34:48 +0000
committerJens Arnold <amiconn@rockbox.org>2008-11-19 00:34:48 +0000
commit77934cbc961a69e7d18588276f0e64a692854125 (patch)
treeffad34c6e3ae65466bdce8fc0f998404bbadee57
parent73b3f5417fb53579600b2645cfc227f614793f4f (diff)
downloadrockbox-77934cbc961a69e7d18588276f0e64a692854125.tar.gz
rockbox-77934cbc961a69e7d18588276f0e64a692854125.zip
Compile-time choice between 16-bit and 32-bit integers for the filters. 32-bit filters are faster on ARMv4 (with assembler code), so use them there. Nice speedup on PP and Gigabeat F/X.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19140 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--apps/codecs/demac/libdemac/decoder.c13
-rw-r--r--apps/codecs/demac/libdemac/demac_config.h27
-rw-r--r--apps/codecs/demac/libdemac/filter.c41
-rw-r--r--apps/codecs/demac/libdemac/filter.h12
-rw-r--r--apps/codecs/demac/libdemac/vector_math16_arm7.h293
-rw-r--r--apps/codecs/demac/libdemac/vector_math32_armv4.h210
-rw-r--r--apps/codecs/demac/libdemac/vector_math_generic.h (renamed from apps/codecs/demac/libdemac/vector_math16.h)10
7 files changed, 280 insertions, 326 deletions
diff --git a/apps/codecs/demac/libdemac/decoder.c b/apps/codecs/demac/libdemac/decoder.c
index 540db47636..31bcb28b72 100644
--- a/apps/codecs/demac/libdemac/decoder.c
+++ b/apps/codecs/demac/libdemac/decoder.c
@@ -33,15 +33,16 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
33 33
34/* Statically allocate the filter buffers */ 34/* Statically allocate the filter buffers */
35 35
36static int16_t filterbuf32[(32*3 + FILTER_HISTORY_SIZE) * 2] /* 2432 bytes */ 36static filter_int filterbuf32[(32*3 + FILTER_HISTORY_SIZE) * 2]
37 IBSS_ATTR __attribute__((aligned(16))); 37 IBSS_ATTR __attribute__((aligned(16))); /* 2432/4864 bytes */
38static int16_t filterbuf256[(256*3 + FILTER_HISTORY_SIZE) * 2] /* 5120 bytes */ 38static filter_int filterbuf256[(256*3 + FILTER_HISTORY_SIZE) * 2]
39 IBSS_ATTR __attribute__((aligned(16))); 39 IBSS_ATTR __attribute__((aligned(16))); /* 5120/10240 bytes */
40 40
41/* This is only needed for "insane" files, and no current Rockbox targets 41/* This is only needed for "insane" files, and no current Rockbox targets
42 can hope to decode them in realtime, although the Gigabeat S comes close. */ 42 can hope to decode them in realtime, although the Gigabeat S comes close. */
43static int16_t filterbuf1280[(1280*3 + FILTER_HISTORY_SIZE) * 2] /* 17408 bytes */ 43static filter_int filterbuf1280[(1280*3 + FILTER_HISTORY_SIZE) * 2]
44 IBSS_ATTR_DEMAC_INSANEBUF __attribute__((aligned(16))); 44 IBSS_ATTR_DEMAC_INSANEBUF __attribute__((aligned(16)));
45 /* 17408 or 34816 bytes */
45 46
46void init_frame_decoder(struct ape_ctx_t* ape_ctx, 47void init_frame_decoder(struct ape_ctx_t* ape_ctx,
47 unsigned char* inbuffer, int* firstbyte, 48 unsigned char* inbuffer, int* firstbyte,
diff --git a/apps/codecs/demac/libdemac/demac_config.h b/apps/codecs/demac/libdemac/demac_config.h
index 93fda76e25..86c2d24919 100644
--- a/apps/codecs/demac/libdemac/demac_config.h
+++ b/apps/codecs/demac/libdemac/demac_config.h
@@ -39,12 +39,21 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
39 39
40#define APE_OUTPUT_DEPTH 29 40#define APE_OUTPUT_DEPTH 29
41 41
42/* On PP5002 code should go into IRAM. Otherwise put the insane 42/* On ARMv4, using 32 bit ints for the filters is faster. */
43 * filter buffer into IRAM as long as there is no better use. */ 43#if defined(CPU_ARM) && (ARM_ARCH == 4)
44#define FILTER_BITS 32
45#endif
46
44#if CONFIG_CPU == PP5002 47#if CONFIG_CPU == PP5002
48/* Code in IRAM for speed, not enough IRAM for the insane filter buffer. */
45#define ICODE_SECTION_DEMAC_ARM .icode 49#define ICODE_SECTION_DEMAC_ARM .icode
46#define ICODE_ATTR_DEMAC ICODE_ATTR 50#define ICODE_ATTR_DEMAC ICODE_ATTR
47#define IBSS_ATTR_DEMAC_INSANEBUF 51#define IBSS_ATTR_DEMAC_INSANEBUF
52#elif CONFIG_CPU == PP5020
53/* Not enough IRAM for the insane filter buffer. */
54#define ICODE_SECTION_DEMAC_ARM .text
55#define ICODE_ATTR_DEMAC
56#define IBSS_ATTR_DEMAC_INSANEBUF
48#else 57#else
49#define ICODE_SECTION_DEMAC_ARM .text 58#define ICODE_SECTION_DEMAC_ARM .text
50#define ICODE_ATTR_DEMAC 59#define ICODE_ATTR_DEMAC
@@ -75,6 +84,20 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
75 84
76#ifndef PREDICTOR_HISTORY_SIZE 85#ifndef PREDICTOR_HISTORY_SIZE
77#define PREDICTOR_HISTORY_SIZE 512 86#define PREDICTOR_HISTORY_SIZE 512
87#endif
88
89#ifndef FILTER_BITS
90#define FILTER_BITS 16
91#endif
92
93
94#ifndef __ASSEMBLER__
95#include <inttypes.h>
96#if FILTER_BITS == 32
97typedef int32_t filter_int;
98#elif FILTER_BITS == 16
99typedef int16_t filter_int;
100#endif
78#endif 101#endif
79 102
80#endif /* _DEMAC_CONFIG_H */ 103#endif /* _DEMAC_CONFIG_H */
diff --git a/apps/codecs/demac/libdemac/filter.c b/apps/codecs/demac/libdemac/filter.c
index b47a37a041..5601fffcd4 100644
--- a/apps/codecs/demac/libdemac/filter.c
+++ b/apps/codecs/demac/libdemac/filter.c
@@ -28,27 +28,38 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
28#include "demac.h" 28#include "demac.h"
29#include "filter.h" 29#include "filter.h"
30#include "demac_config.h" 30#include "demac_config.h"
31
32#if FILTER_BITS == 32
33
34#if defined(CPU_ARM) && (ARM_ARCH == 4)
35#include "vector_math32_armv4.h"
36#else
37#include "vector_math_generic.h"
38#endif
39
40#else /* FILTER_BITS == 16 */
31 41
32#ifdef CPU_COLDFIRE 42#ifdef CPU_COLDFIRE
33#include "vector_math16_cf.h" 43#include "vector_math16_cf.h"
34#elif ARM_ARCH >= 6 44#elif defined(CPU_ARM) && (ARM_ARCH >= 6)
35#include "vector_math16_armv6.h" 45#include "vector_math16_armv6.h"
36#elif ARM_ARCH >= 5 /* Assume all our ARMv5 targets are ARMv5te(j) */ 46#elif defined(CPU_ARM) && (ARM_ARCH >= 5)
47/* Assume all our ARMv5 targets are ARMv5te(j) */
37#include "vector_math16_armv5te.h" 48#include "vector_math16_armv5te.h"
38#elif defined CPU_ARM7TDMI
39#include "vector_math16_arm7.h"
40#else 49#else
41#include "vector_math16.h" 50#include "vector_math_generic.h"
42#endif 51#endif
43 52
53#endif /* FILTER_BITS */
54
44struct filter_t { 55struct filter_t {
45 int16_t* coeffs; /* ORDER entries */ 56 filter_int* coeffs; /* ORDER entries */
46 57
47 /* We store all the filter delays in a single buffer */ 58 /* We store all the filter delays in a single buffer */
48 int16_t* history_end; 59 filter_int* history_end;
49 60
50 int16_t* delay; 61 filter_int* delay;
51 int16_t* adaptcoeffs; 62 filter_int* adaptcoeffs;
52 63
53 int avg; 64 int avg;
54}; 65};
@@ -89,7 +100,7 @@ struct filter_t {
89#if defined(CPU_ARM) && (ARM_ARCH >= 6) 100#if defined(CPU_ARM) && (ARM_ARCH >= 6)
90#define SATURATE(x) ({int __res; asm("ssat %0, #16, %1" : "=r"(__res) : "r"(x)); __res; }) 101#define SATURATE(x) ({int __res; asm("ssat %0, #16, %1" : "=r"(__res) : "r"(x)); __res; })
91#else 102#else
92#define SATURATE(x) (int16_t)(((x) == (int16_t)(x)) ? (x) : ((x) >> 31) ^ 0x7FFF); 103#define SATURATE(x) (((x) == (int16_t)(x)) ? (x) : ((x) >> 31) ^ 0x7FFF);
93#endif 104#endif
94 105
95/* Apply the filter with state f to count entries in data[] */ 106/* Apply the filter with state f to count entries in data[] */
@@ -145,7 +156,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3980(struct filter_t* f,
145 /* Have we filled the history buffer? */ 156 /* Have we filled the history buffer? */
146 if (f->delay == f->history_end) { 157 if (f->delay == f->history_end) {
147 memmove(f->coeffs + ORDER, f->delay - (ORDER*2), 158 memmove(f->coeffs + ORDER, f->delay - (ORDER*2),
148 (ORDER*2) * sizeof(int16_t)); 159 (ORDER*2) * sizeof(filter_int));
149 f->adaptcoeffs = f->coeffs + ORDER*2; 160 f->adaptcoeffs = f->coeffs + ORDER*2;
150 f->delay = f->coeffs + ORDER*3; 161 f->delay = f->coeffs + ORDER*3;
151 } 162 }
@@ -190,7 +201,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3970(struct filter_t* f,
190 /* Have we filled the history buffer? */ 201 /* Have we filled the history buffer? */
191 if (f->delay == f->history_end) { 202 if (f->delay == f->history_end) {
192 memmove(f->coeffs + ORDER, f->delay - (ORDER*2), 203 memmove(f->coeffs + ORDER, f->delay - (ORDER*2),
193 (ORDER*2) * sizeof(int16_t)); 204 (ORDER*2) * sizeof(filter_int));
194 f->adaptcoeffs = f->coeffs + ORDER*2; 205 f->adaptcoeffs = f->coeffs + ORDER*2;
195 f->delay = f->coeffs + ORDER*3; 206 f->delay = f->coeffs + ORDER*3;
196 } 207 }
@@ -200,7 +211,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3970(struct filter_t* f,
200static struct filter_t filter0 IBSS_ATTR; 211static struct filter_t filter0 IBSS_ATTR;
201static struct filter_t filter1 IBSS_ATTR; 212static struct filter_t filter1 IBSS_ATTR;
202 213
203static void do_init_filter(struct filter_t* f, int16_t* buf) 214static void do_init_filter(struct filter_t* f, filter_int* buf)
204{ 215{
205 f->coeffs = buf; 216 f->coeffs = buf;
206 f->history_end = buf + ORDER*3 + FILTER_HISTORY_SIZE; 217 f->history_end = buf + ORDER*3 + FILTER_HISTORY_SIZE;
@@ -210,13 +221,13 @@ static void do_init_filter(struct filter_t* f, int16_t* buf)
210 f->delay = f->coeffs + ORDER*3; 221 f->delay = f->coeffs + ORDER*3;
211 222
212 /* Zero coefficients and history buffer */ 223 /* Zero coefficients and history buffer */
213 memset(f->coeffs, 0, ORDER*3 * sizeof(int16_t)); 224 memset(f->coeffs, 0, ORDER*3 * sizeof(filter_int));
214 225
215 /* Zero the running average */ 226 /* Zero the running average */
216 f->avg = 0; 227 f->avg = 0;
217} 228}
218 229
219void INIT_FILTER(int16_t* buf) 230void INIT_FILTER(filter_int* buf)
220{ 231{
221 do_init_filter(&filter0, buf); 232 do_init_filter(&filter0, buf);
222 do_init_filter(&filter1, buf + ORDER*3 + FILTER_HISTORY_SIZE); 233 do_init_filter(&filter1, buf + ORDER*3 + FILTER_HISTORY_SIZE);
diff --git a/apps/codecs/demac/libdemac/filter.h b/apps/codecs/demac/libdemac/filter.h
index acbb155b29..bbe51d4572 100644
--- a/apps/codecs/demac/libdemac/filter.h
+++ b/apps/codecs/demac/libdemac/filter.h
@@ -25,21 +25,21 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
25#ifndef _APE_FILTER_H 25#ifndef _APE_FILTER_H
26#define _APE_FILTER_H 26#define _APE_FILTER_H
27 27
28#include <inttypes.h> 28#include "demac_config.h"
29 29
30void init_filter_16_11(int16_t* buf); 30void init_filter_16_11(filter_int* buf);
31int apply_filter_16_11(int fileversion, int32_t* decoded0, int32_t* decoded1, int count); 31int apply_filter_16_11(int fileversion, int32_t* decoded0, int32_t* decoded1, int count);
32 32
33void init_filter_64_11(int16_t* buf); 33void init_filter_64_11(filter_int* buf);
34int apply_filter_64_11(int fileversion, int32_t* decoded0, int32_t* decoded1, int count); 34int apply_filter_64_11(int fileversion, int32_t* decoded0, int32_t* decoded1, int count);
35 35
36void init_filter_32_10(int16_t* buf); 36void init_filter_32_10(filter_int* buf);
37int apply_filter_32_10(int fileversion, int32_t* decoded0, int32_t* decoded1, int count); 37int apply_filter_32_10(int fileversion, int32_t* decoded0, int32_t* decoded1, int count);
38 38
39void init_filter_256_13(int16_t* buf); 39void init_filter_256_13(filter_int* buf);
40int apply_filter_256_13(int fileversion, int32_t* decoded0, int32_t* decoded1, int count); 40int apply_filter_256_13(int fileversion, int32_t* decoded0, int32_t* decoded1, int count);
41 41
42void init_filter_1280_15(int16_t* buf); 42void init_filter_1280_15(filter_int* buf);
43int apply_filter_1280_15(int fileversion, int32_t* decoded0, int32_t* decoded1, int count); 43int apply_filter_1280_15(int fileversion, int32_t* decoded0, int32_t* decoded1, int count);
44 44
45#endif 45#endif
diff --git a/apps/codecs/demac/libdemac/vector_math16_arm7.h b/apps/codecs/demac/libdemac/vector_math16_arm7.h
deleted file mode 100644
index 653bb1f53f..0000000000
--- a/apps/codecs/demac/libdemac/vector_math16_arm7.h
+++ /dev/null
@@ -1,293 +0,0 @@
1/*
2
3libdemac - A Monkey's Audio decoder
4
5$Id$
6
7Copyright (C) Dave Chapman 2007
8
9ARM7 vector math copyright (C) 2007 Jens Arnold
10
11This program is free software; you can redistribute it and/or modify
12it under the terms of the GNU General Public License as published by
13the Free Software Foundation; either version 2 of the License, or
14(at your option) any later version.
15
16This program is distributed in the hope that it will be useful,
17but WITHOUT ANY WARRANTY; without even the implied warranty of
18MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19GNU General Public License for more details.
20
21You should have received a copy of the GNU General Public License
22along with this program; if not, write to the Free Software
23Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
24
25*/
26
27/* This version fetches data as 32 bit words, and *requires* v1 to be
28 * 32 bit aligned, otherwise it will result either in a data abort, or
29 * incorrect results (if ARM aligncheck is disabled). */
30static inline void vector_add(int16_t* v1, int16_t* v2)
31{
32#if ORDER > 16
33 int cnt = ORDER>>4;
34#endif
35
36#define ADDHALFREGS(sum, s1) /* Adds register */ \
37 "mov " #s1 ", " #s1 ", ror #16 \n" /* halves straight. */ \
38 "add r8 , " #s1 ", " #sum ", lsl #16 \n" /* Clobbers 's1' */ \
39 "add " #sum ", " #s1 ", " #sum ", lsr #16 \n" /* and r8. */ \
40 "mov " #sum ", " #sum ", lsl #16 \n" \
41 "orr " #sum ", " #sum ", r8 , lsr #16 \n"
42
43#define ADDHALFXREGS(sum, s1, s2) /* Adds register */ \
44 "add " #s1 ", " #s1 ", " #sum ", lsl #16 \n" /* halves across. */ \
45 "add " #sum ", " #s2 ", " #sum ", lsr #16 \n" /* Clobbers 's1'. */ \
46 "mov " #sum ", " #sum ", lsl #16 \n" \
47 "orr " #sum ", " #sum ", " #s1 ", lsr #16 \n"
48
49 asm volatile (
50 "tst %[v2], #2 \n"
51 "beq 20f \n"
52
53 "10: \n"
54 "ldrh r4, [%[v2]], #2 \n"
55 "mov r4, r4, lsl #16 \n"
56 "1: \n"
57 "ldmia %[v1], {r0-r3} \n"
58 "ldmia %[v2]!, {r5-r8} \n"
59 ADDHALFXREGS(r0, r4, r5)
60 ADDHALFXREGS(r1, r5, r6)
61 ADDHALFXREGS(r2, r6, r7)
62 ADDHALFXREGS(r3, r7, r8)
63 "stmia %[v1]!, {r0-r3} \n"
64 "mov r4, r8 \n"
65 "ldmia %[v1], {r0-r3} \n"
66 "ldmia %[v2]!, {r5-r8} \n"
67 ADDHALFXREGS(r0, r4, r5)
68 ADDHALFXREGS(r1, r5, r6)
69 ADDHALFXREGS(r2, r6, r7)
70 ADDHALFXREGS(r3, r7, r8)
71 "stmia %[v1]!, {r0-r3} \n"
72#if ORDER > 16
73 "mov r4, r8 \n"
74 "subs %[cnt], %[cnt], #1 \n"
75 "bne 1b \n"
76#endif
77 "b 99f \n"
78
79 "20: \n"
80 "1: \n"
81 "ldmia %[v1], {r0-r3} \n"
82 "ldmia %[v2]!, {r4-r7} \n"
83 ADDHALFREGS(r0, r4)
84 ADDHALFREGS(r1, r5)
85 ADDHALFREGS(r2, r6)
86 ADDHALFREGS(r3, r7)
87 "stmia %[v1]!, {r0-r3} \n"
88 "ldmia %[v1], {r0-r3} \n"
89 "ldmia %[v2]!, {r4-r7} \n"
90 ADDHALFREGS(r0, r4)
91 ADDHALFREGS(r1, r5)
92 ADDHALFREGS(r2, r6)
93 ADDHALFREGS(r3, r7)
94 "stmia %[v1]!, {r0-r3} \n"
95#if ORDER > 16
96 "subs %[cnt], %[cnt], #1 \n"
97 "bne 1b \n"
98#endif
99
100 "99: \n"
101 : /* outputs */
102#if ORDER > 16
103 [cnt]"+r"(cnt),
104#endif
105 [v1] "+r"(v1),
106 [v2] "+r"(v2)
107 : /* inputs */
108 : /* clobbers */
109 "r0", "r1", "r2", "r3", "r4",
110 "r5", "r6", "r7", "r8", "memory"
111 );
112}
113
114/* This version fetches data as 32 bit words, and *requires* v1 to be
115 * 32 bit aligned, otherwise it will result either in a data abort, or
116 * incorrect results (if ARM aligncheck is disabled). */
117static inline void vector_sub(int16_t* v1, int16_t* v2)
118{
119#if ORDER > 16
120 int cnt = ORDER>>4;
121#endif
122
123#define SUBHALFREGS(dif, s1) /* Subtracts register */ \
124 "sub r8 , " #dif ", " #s1 "\n" /* halves straight. */ \
125 "and r8 , r8 , r9 \n" /* Needs r9 = 0x0000ffff, */ \
126 "mov " #dif ", " #dif ", lsr #16 \n" /* clobbers r8. */ \
127 "sub " #dif ", " #dif ", " #s1 ", lsr #16 \n" \
128 "orr " #dif ", r8 , " #dif ", lsl #16 \n"
129
130#define SUBHALFXREGS(dif, s1, s2) /* Subtracts register */ \
131 "sub " #s1 ", " #dif ", " #s1 ", lsr #16 \n" /* halves across. */ \
132 "and " #s1 ", " #s1 ", r9 \n" /* Needs r9 = 0x0000ffff, */ \
133 "rsb " #dif ", " #s2 ", " #dif ", lsr #16 \n" /* clobbers 's1'. */ \
134 "orr " #dif ", " #s1 ", " #dif ", lsl #16 \n"
135
136 asm volatile (
137 "mov r9, #0xff \n"
138 "orr r9, r9, #0xff00 \n"
139 "tst %[v2], #2 \n"
140 "beq 20f \n"
141
142 "10: \n"
143 "ldrh r4, [%[v2]], #2 \n"
144 "mov r4, r4, lsl #16 \n"
145 "1: \n"
146 "ldmia %[v1], {r0-r3} \n"
147 "ldmia %[v2]!, {r5-r8} \n"
148 SUBHALFXREGS(r0, r4, r5)
149 SUBHALFXREGS(r1, r5, r6)
150 SUBHALFXREGS(r2, r6, r7)
151 SUBHALFXREGS(r3, r7, r8)
152 "stmia %[v1]!, {r0-r3} \n"
153 "mov r4, r8 \n"
154 "ldmia %[v1], {r0-r3} \n"
155 "ldmia %[v2]!, {r5-r8} \n"
156 SUBHALFXREGS(r0, r4, r5)
157 SUBHALFXREGS(r1, r5, r6)
158 SUBHALFXREGS(r2, r6, r7)
159 SUBHALFXREGS(r3, r7, r8)
160 "stmia %[v1]!, {r0-r3} \n"
161#if ORDER > 16
162 "mov r4, r8 \n"
163 "subs %[cnt], %[cnt], #1 \n"
164 "bne 1b \n"
165#endif
166 "b 99f \n"
167
168 "20: \n"
169 "1: \n"
170 "ldmia %[v1], {r0-r3} \n"
171 "ldmia %[v2]!, {r4-r7} \n"
172 SUBHALFREGS(r0, r4)
173 SUBHALFREGS(r1, r5)
174 SUBHALFREGS(r2, r6)
175 SUBHALFREGS(r3, r7)
176 "stmia %[v1]!, {r0-r3} \n"
177 "ldmia %[v1], {r0-r3} \n"
178 "ldmia %[v2]!, {r4-r7} \n"
179 SUBHALFREGS(r0, r4)
180 SUBHALFREGS(r1, r5)
181 SUBHALFREGS(r2, r6)
182 SUBHALFREGS(r3, r7)
183 "stmia %[v1]!, {r0-r3} \n"
184#if ORDER > 16
185 "subs %[cnt], %[cnt], #1 \n"
186 "bne 1b \n"
187#endif
188
189 "99: \n"
190 : /* outputs */
191#if ORDER > 16
192 [cnt]"+r"(cnt),
193#endif
194 [v1] "+r"(v1),
195 [v2] "+r"(v2)
196 : /* inputs */
197 : /* clobbers */
198 "r0", "r1", "r2", "r3", "r4", "r5",
199 "r6", "r7", "r8", "r9", "memory"
200 );
201}
202
203/* This version fetches data as 32 bit words, and *requires* v1 to be
204 * 32 bit aligned, otherwise it will result either in a data abort, or
205 * incorrect results (if ARM aligncheck is disabled). It is optimised
206 * for ARM7TDMI. Using it for ARM9 or higher results in worse performance
207 * than the C version. */
208static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
209{
210 int res = 0;
211#if ORDER > 16
212 int cnt = ORDER>>4;
213#endif
214
215#define MLABLOCK2(f1, f2) \
216 "mov r8, " #f1 ", lsl #16 \n" \
217 "mov r8, r8 , asr #16 \n" \
218 "mov r9, " #f2 ", lsl #16 \n" \
219 "mov r9, r9 , asr #16 \n" \
220 "mla %[res], r9, r8, %[res] \n" \
221 "mov r8, " #f1 ", asr #16 \n" \
222 "mov r9, " #f2 ", asr #16 \n" \
223 "mla %[res], r9, r8, %[res] \n"
224
225#define MLABLOCK2_U2(f1, f2) \
226 "mov r8, " #f1 ", lsl #16 \n" \
227 "mov r8, r8 , asr #16 \n" \
228 "mla %[res], r9, r8, %[res] \n" \
229 "mov r8, " #f1 ", asr #16 \n" \
230 "mov r9, " #f2 ", lsl #16 \n" \
231 "mov r9, r9 , asr #16 \n" \
232 "mla %[res], r9, r8, %[res] \n" \
233 "mov r9, " #f2 ", asr #16 \n"
234
235 asm volatile (
236 "tst %[v2], #2 \n"
237 "beq 20f \n"
238
239 "10: \n"
240 "ldrsh r9, [%[v2]], #2 \n"
241 "1: \n"
242 "ldmia %[v1]!, {r0-r3} \n"
243 "ldmia %[v2]!, {r4-r7} \n"
244 MLABLOCK2_U2(r0, r4)
245 MLABLOCK2_U2(r1, r5)
246 MLABLOCK2_U2(r2, r6)
247 MLABLOCK2_U2(r3, r7)
248 "ldmia %[v1]!, {r0-r3} \n"
249 "ldmia %[v2]!, {r4-r7} \n"
250 MLABLOCK2_U2(r0, r4)
251 MLABLOCK2_U2(r1, r5)
252 MLABLOCK2_U2(r2, r6)
253 MLABLOCK2_U2(r3, r7)
254#if ORDER > 16
255 "subs %[cnt], %[cnt], #1 \n"
256 "bne 1b \n"
257#endif
258 "b 99f \n"
259
260 "20: \n"
261 "1: \n"
262 "ldmia %[v1]!, {r0-r3} \n"
263 "ldmia %[v2]!, {r4-r7} \n"
264 MLABLOCK2(r0, r4)
265 MLABLOCK2(r1, r5)
266 MLABLOCK2(r2, r6)
267 MLABLOCK2(r3, r7)
268 "ldmia %[v1]!, {r0-r3} \n"
269 "ldmia %[v2]!, {r4-r7} \n"
270 MLABLOCK2(r0, r4)
271 MLABLOCK2(r1, r5)
272 MLABLOCK2(r2, r6)
273 MLABLOCK2(r3, r7)
274#if ORDER > 16
275 "subs %[cnt], %[cnt], #1 \n"
276 "bne 1b \n"
277#endif
278
279 "99: \n"
280 : /* outputs */
281#if ORDER > 16
282 [cnt]"+r"(cnt),
283#endif
284 [v1] "+r"(v1),
285 [v2] "+r"(v2),
286 [res]"+r"(res)
287 : /* inputs */
288 : /* clobbers */
289 "r0", "r1", "r2", "r3", "r4",
290 "r5", "r6", "r7", "r8", "r9"
291 );
292 return res;
293}
diff --git a/apps/codecs/demac/libdemac/vector_math32_armv4.h b/apps/codecs/demac/libdemac/vector_math32_armv4.h
new file mode 100644
index 0000000000..b729bd3a0a
--- /dev/null
+++ b/apps/codecs/demac/libdemac/vector_math32_armv4.h
@@ -0,0 +1,210 @@
1/*
2
3libdemac - A Monkey's Audio decoder
4
5$Id$
6
7Copyright (C) Dave Chapman 2007
8
9ARMv4 vector math copyright (C) 2008 Jens Arnold
10
11This program is free software; you can redistribute it and/or modify
12it under the terms of the GNU General Public License as published by
13the Free Software Foundation; either version 2 of the License, or
14(at your option) any later version.
15
16This program is distributed in the hope that it will be useful,
17but WITHOUT ANY WARRANTY; without even the implied warranty of
18MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19GNU General Public License for more details.
20
21You should have received a copy of the GNU General Public License
22along with this program; if not, write to the Free Software
23Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
24
25*/
26
27static inline void vector_add(int32_t* v1, int32_t* v2)
28{
29#if ORDER > 32
30 int cnt = ORDER>>5;
31#endif
32
33#define ADDBLOCK4 \
34 "ldmia %[v1], {r0-r3} \n" \
35 "ldmia %[v2]!, {r4-r7} \n" \
36 "add r0, r0, r4 \n" \
37 "add r1, r1, r5 \n" \
38 "add r2, r2, r6 \n" \
39 "add r3, r3, r7 \n" \
40 "stmia %[v1]!, {r0-r3} \n"
41
42 asm volatile (
43 "1: \n"
44 ADDBLOCK4
45 ADDBLOCK4
46 ADDBLOCK4
47 ADDBLOCK4
48#if ORDER > 16
49 ADDBLOCK4
50 ADDBLOCK4
51 ADDBLOCK4
52 ADDBLOCK4
53#endif
54#if ORDER > 32
55 "subs %[cnt], %[cnt], #1 \n"
56 "bne 1b \n"
57#endif
58 : /* outputs */
59#if ORDER > 32
60 [cnt]"+r"(cnt),
61#endif
62 [v1] "+r"(v1),
63 [v2] "+r"(v2)
64 : /* inputs */
65 : /* clobbers */
66 "r0", "r1", "r2", "r3", "r4",
67 "r5", "r6", "r7", "memory"
68 );
69}
70
71static inline void vector_sub(int32_t* v1, int32_t* v2)
72{
73#if ORDER > 32
74 int cnt = ORDER>>5;
75#endif
76
77#define SUBBLOCK4 \
78 "ldmia %[v1], {r0-r3} \n" \
79 "ldmia %[v2]!, {r4-r7} \n" \
80 "sub r0, r0, r4 \n" \
81 "sub r1, r1, r5 \n" \
82 "sub r2, r2, r6 \n" \
83 "sub r3, r3, r7 \n" \
84 "stmia %[v1]!, {r0-r3} \n"
85
86 asm volatile (
87 "1: \n"
88 SUBBLOCK4
89 SUBBLOCK4
90 SUBBLOCK4
91 SUBBLOCK4
92#if ORDER > 16
93 SUBBLOCK4
94 SUBBLOCK4
95 SUBBLOCK4
96 SUBBLOCK4
97#endif
98#if ORDER > 32
99 "subs %[cnt], %[cnt], #1 \n"
100 "bne 1b \n"
101#endif
102 : /* outputs */
103#if ORDER > 32
104 [cnt]"+r"(cnt),
105#endif
106 [v1] "+r"(v1),
107 [v2] "+r"(v2)
108 : /* inputs */
109 : /* clobbers */
110 "r0", "r1", "r2", "r3", "r4",
111 "r5", "r6", "r7", "memory"
112 );
113}
114
115static inline int32_t scalarproduct(int32_t* v1, int32_t* v2)
116{
117 int res = 0;
118#if ORDER > 32
119 int cnt = ORDER>>5;
120#endif
121
122 asm volatile (
123#if ORDER > 16
124 "ldmia %[v2]!, {r6-r7} \n"
125 "1: \n"
126 "ldmia %[v1]!, {r0,r1,r3-r5} \n"
127 "mla %[res], r6, r0, %[res] \n"
128 "mla %[res], r7, r1, %[res] \n"
129 "ldmia %[v2]!, {r0-r2,r6-r8} \n"
130 "mla %[res], r0, r3, %[res] \n"
131 "mla %[res], r1, r4, %[res] \n"
132 "mla %[res], r2, r5, %[res] \n"
133 "ldmia %[v1]!, {r0-r4} \n"
134 "mla %[res], r6, r0, %[res] \n"
135 "mla %[res], r7, r1, %[res] \n"
136 "mla %[res], r8, r2, %[res] \n"
137 "ldmia %[v2]!, {r0,r1,r6-r8} \n"
138 "mla %[res], r0, r3, %[res] \n"
139 "mla %[res], r1, r4, %[res] \n"
140 "ldmia %[v1]!, {r0-r5} \n"
141 "mla %[res], r6, r0, %[res] \n"
142 "mla %[res], r7, r1, %[res] \n"
143 "mla %[res], r8, r2, %[res] \n"
144 "ldmia %[v2]!, {r0-r2,r6,r7} \n"
145 "mla %[res], r0, r3, %[res] \n"
146 "mla %[res], r1, r4, %[res] \n"
147 "mla %[res], r2, r5, %[res] \n"
148 "ldmia %[v1]!, {r0,r1,r3-r5} \n"
149 "mla %[res], r6, r0, %[res] \n"
150 "mla %[res], r7, r1, %[res] \n"
151 "ldmia %[v2]!, {r0-r2,r6-r8} \n"
152 "mla %[res], r0, r3, %[res] \n"
153 "mla %[res], r1, r4, %[res] \n"
154 "mla %[res], r2, r5, %[res] \n"
155 "ldmia %[v1]!, {r0-r4} \n"
156 "mla %[res], r6, r0, %[res] \n"
157 "mla %[res], r7, r1, %[res] \n"
158 "mla %[res], r8, r2, %[res] \n"
159 "ldmia %[v2]!, {r0,r1,r6-r8} \n"
160 "mla %[res], r0, r3, %[res] \n"
161 "mla %[res], r1, r4, %[res] \n"
162 "ldmia %[v1]!, {r0-r5} \n"
163 "mla %[res], r6, r0, %[res] \n"
164 "mla %[res], r7, r1, %[res] \n"
165 "mla %[res], r8, r2, %[res] \n"
166#if ORDER > 32
167 "ldmia %[v2]!, {r0-r2,r6,r7} \n"
168#else
169 "ldmia %[v2]!, {r0-r2} \n"
170#endif
171 "mla %[res], r0, r3, %[res] \n"
172 "mla %[res], r1, r4, %[res] \n"
173 "mla %[res], r2, r5, %[res] \n"
174#if ORDER > 32
175 "subs %[cnt], %[cnt], #1 \n"
176 "bne 1b \n"
177#endif
178
179#else /* ORDER <= 16 */
180
181#define MLABLOCK4 \
182 "ldmia %[v1]!, {r0-r3} \n" \
183 "ldmia %[v2]!, {r4-r7} \n" \
184 "mla %[res], r4, r0, %[res] \n" \
185 "mla %[res], r5, r1, %[res] \n" \
186 "mla %[res], r6, r2, %[res] \n" \
187 "mla %[res], r7, r3, %[res] \n"
188
189 MLABLOCK4
190 MLABLOCK4
191 MLABLOCK4
192 MLABLOCK4
193#endif /* ORDER <= 16 */
194 : /* outputs */
195#if ORDER > 32
196 [cnt]"+r"(cnt),
197#endif
198 [v1] "+r"(v1),
199 [v2] "+r"(v2),
200 [res]"+r"(res)
201 : /* inputs */
202 : /* clobbers */
203 "r0", "r1", "r2", "r3",
204 "r4", "r5", "r6", "r7"
205#if ORDER > 16
206 ,"r8"
207#endif
208 );
209 return res;
210}
diff --git a/apps/codecs/demac/libdemac/vector_math16.h b/apps/codecs/demac/libdemac/vector_math_generic.h
index 5d82abe930..7b61db77be 100644
--- a/apps/codecs/demac/libdemac/vector_math16.h
+++ b/apps/codecs/demac/libdemac/vector_math_generic.h
@@ -2,7 +2,7 @@
2 2
3libdemac - A Monkey's Audio decoder 3libdemac - A Monkey's Audio decoder
4 4
5$Id:$ 5$Id$
6 6
7Copyright (C) Dave Chapman 2007 7Copyright (C) Dave Chapman 2007
8 8
@@ -22,7 +22,9 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
22 22
23*/ 23*/
24 24
25static inline void vector_add(int16_t* v1, int16_t* v2) 25#include "demac_config.h"
26
27static inline void vector_add(filter_int* v1, filter_int* v2)
26{ 28{
27#if ORDER > 32 29#if ORDER > 32
28 int order = (ORDER >> 5); 30 int order = (ORDER >> 5);
@@ -66,7 +68,7 @@ static inline void vector_add(int16_t* v1, int16_t* v2)
66 } 68 }
67} 69}
68 70
69static inline void vector_sub(int16_t* v1, int16_t* v2) 71static inline void vector_sub(filter_int* v1, filter_int* v2)
70{ 72{
71#if ORDER > 32 73#if ORDER > 32
72 int order = (ORDER >> 5); 74 int order = (ORDER >> 5);
@@ -110,7 +112,7 @@ static inline void vector_sub(int16_t* v1, int16_t* v2)
110 } 112 }
111} 113}
112 114
113static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) 115static inline int32_t scalarproduct(filter_int* v1, filter_int* v2)
114{ 116{
115 int res = 0; 117 int res = 0;
116 118