diff options
author | Jens Arnold <amiconn@rockbox.org> | 2007-10-18 22:37:33 +0000 |
---|---|---|
committer | Jens Arnold <amiconn@rockbox.org> | 2007-10-18 22:37:33 +0000 |
commit | 2640bdb262d07bf910a6ff614834d73713bdf4a4 (patch) | |
tree | b42bb69edb6410f91f2e30a2f29d838b17b56e57 /apps | |
parent | 4a19ce39f8205f025b38dbc4297d591d033c8c2a (diff) | |
download | rockbox-2640bdb262d07bf910a6ff614834d73713bdf4a4.tar.gz rockbox-2640bdb262d07bf910a6ff614834d73713bdf4a4.zip |
APE codec: Assembler optimised vector math routines for coldfire. -c2000 is now usable at 130% realtime (was 107%), -c3000 is near realtime (93%, was 64%). -c1000 doesn't change.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15194 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps')
-rw-r--r-- | apps/codecs/demac/libdemac/decoder.c | 11 | ||||
-rw-r--r-- | apps/codecs/demac/libdemac/filter.c | 13 | ||||
-rw-r--r-- | apps/codecs/demac/libdemac/vector_math16_cf.h | 180 |
3 files changed, 200 insertions, 4 deletions
diff --git a/apps/codecs/demac/libdemac/decoder.c b/apps/codecs/demac/libdemac/decoder.c index 4f4a583d00..326e893ec4 100644 --- a/apps/codecs/demac/libdemac/decoder.c +++ b/apps/codecs/demac/libdemac/decoder.c | |||
@@ -32,12 +32,15 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA | |||
32 | 32 | ||
33 | /* Statically allocate the filter buffers */ | 33 | /* Statically allocate the filter buffers */ |
34 | 34 | ||
35 | static int16_t filterbuf32[(32*3 + HISTORY_SIZE) * 2] IBSS_ATTR; /* 4480 bytes */ | 35 | static int16_t filterbuf32[(32*3 + HISTORY_SIZE) * 2] /* 4480 bytes */ |
36 | static int16_t filterbuf256[(256*3 + HISTORY_SIZE) * 2] IBSS_ATTR; /* 5120 bytes */ | 36 | IBSS_ATTR __attribute__((aligned(16))); |
37 | static int16_t filterbuf256[(256*3 + HISTORY_SIZE) * 2] /* 5120 bytes */ | ||
38 | IBSS_ATTR __attribute__((aligned(16))); | ||
37 | 39 | ||
38 | /* This is only needed for "insane" files, and no Rockbox targets can | 40 | /* This is only needed for "insane" files, and no Rockbox targets can |
39 | hope to decode them in realtime anyway. */ | 41 | hope to decode them in realtime anyway. */ |
40 | static int16_t filterbuf1280[(1280*3 + HISTORY_SIZE) * 2]; /* 17408 bytes */ | 42 | static int16_t filterbuf1280[(1280*3 + HISTORY_SIZE) * 2] /* 17408 bytes */ |
43 | __attribute__((aligned(16))); | ||
41 | 44 | ||
42 | void init_frame_decoder(struct ape_ctx_t* ape_ctx, | 45 | void init_frame_decoder(struct ape_ctx_t* ape_ctx, |
43 | unsigned char* inbuffer, int* firstbyte, | 46 | unsigned char* inbuffer, int* firstbyte, |
@@ -163,7 +166,7 @@ int decode_chunk(struct ape_ctx_t* ape_ctx, | |||
163 | } | 166 | } |
164 | 167 | ||
165 | /* Now apply the predictor decoding */ | 168 | /* Now apply the predictor decoding */ |
166 | predictor_decode_stereo(&ape_ctx->predictor,decoded0,decoded1,count); | 169 | predictor_decode_stereo(&ape_ctx->predictor,decoded0,decoded1,count); |
167 | 170 | ||
168 | if (ape_ctx->bps == 8) { | 171 | if (ape_ctx->bps == 8) { |
169 | /* TODO: Handle 8-bit streams */ | 172 | /* TODO: Handle 8-bit streams */ |
diff --git a/apps/codecs/demac/libdemac/filter.c b/apps/codecs/demac/libdemac/filter.c index 131c152590..ac12959241 100644 --- a/apps/codecs/demac/libdemac/filter.c +++ b/apps/codecs/demac/libdemac/filter.c | |||
@@ -25,10 +25,15 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA | |||
25 | #include <string.h> | 25 | #include <string.h> |
26 | #include <inttypes.h> | 26 | #include <inttypes.h> |
27 | 27 | ||
28 | #include "codecs.h" /* for Rockbox CPU definitions etc */ | ||
28 | #include "demac.h" | 29 | #include "demac.h" |
29 | #include "filter.h" | 30 | #include "filter.h" |
30 | 31 | ||
32 | #ifdef CPU_COLDFIRE | ||
33 | #include "vector_math16_cf.h" | ||
34 | #else | ||
31 | #include "vector_math16.h" | 35 | #include "vector_math16.h" |
36 | #endif | ||
32 | 37 | ||
33 | struct filter_t { | 38 | struct filter_t { |
34 | int16_t* coeffs; /* ORDER entries */ | 39 | int16_t* coeffs; /* ORDER entries */ |
@@ -84,6 +89,10 @@ static inline void do_apply_filter_3980(struct filter_t* f, int32_t* data, int c | |||
84 | int res; | 89 | int res; |
85 | int absres; | 90 | int absres; |
86 | 91 | ||
92 | #ifdef PREPARE_SCALARPRODUCT | ||
93 | PREPARE_SCALARPRODUCT | ||
94 | #endif | ||
95 | |||
87 | while(count--) | 96 | while(count--) |
88 | { | 97 | { |
89 | res = FP_TO_INT(scalarproduct(f->delay - ORDER, f->coeffs)); | 98 | res = FP_TO_INT(scalarproduct(f->delay - ORDER, f->coeffs)); |
@@ -135,6 +144,10 @@ static inline void do_apply_filter_3980(struct filter_t* f, int32_t* data, int c | |||
135 | static inline void do_apply_filter_3970(struct filter_t* f, int32_t* data, int count) | 144 | static inline void do_apply_filter_3970(struct filter_t* f, int32_t* data, int count) |
136 | { | 145 | { |
137 | int res; | 146 | int res; |
147 | |||
148 | #ifdef PREPARE_SCALARPRODUCT | ||
149 | PREPARE_SCALARPRODUCT | ||
150 | #endif | ||
138 | 151 | ||
139 | while(count--) | 152 | while(count--) |
140 | { | 153 | { |
diff --git a/apps/codecs/demac/libdemac/vector_math16_cf.h b/apps/codecs/demac/libdemac/vector_math16_cf.h new file mode 100644 index 0000000000..85258c97a5 --- /dev/null +++ b/apps/codecs/demac/libdemac/vector_math16_cf.h | |||
@@ -0,0 +1,180 @@ | |||
1 | /* | ||
2 | |||
3 | libdemac - A Monkey's Audio decoder | ||
4 | |||
5 | $Id$ | ||
6 | |||
7 | Copyright (C) Dave Chapman 2007 | ||
8 | |||
9 | Coldfire vector math copyright (C) 2007 Jens Arnold | ||
10 | |||
11 | This program is free software; you can redistribute it and/or modify | ||
12 | it under the terms of the GNU General Public License as published by | ||
13 | the Free Software Foundation; either version 2 of the License, or | ||
14 | (at your option) any later version. | ||
15 | |||
16 | This program is distributed in the hope that it will be useful, | ||
17 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | GNU General Public License for more details. | ||
20 | |||
21 | You should have received a copy of the GNU General Public License | ||
22 | along with this program; if not, write to the Free Software | ||
23 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA | ||
24 | |||
25 | */ | ||
26 | |||
/*
 * In-place element-wise addition of 16-bit vectors: v1[i] += v2[i].
 *
 * One pass of the asm body handles 16 int16_t elements (two groups of four
 * longwords, each longword carrying two 16-bit lanes). When ORDER > 16 the
 * body is repeated ORDER>>4 times, so ORDER elements are processed in total;
 * otherwise the body runs exactly once.
 * NOTE(review): ORDER>>4 truncates, so this presumably assumes ORDER is a
 * multiple of 16 -- confirm against the values the includer defines.
 *
 * v1: destination and first operand; overwritten with the sums.
 * v2: second operand; only read.
 *
 * Both pointers are advanced inside the asm ("+a" operands), but since they
 * are by-value parameters the caller's pointers are not affected.
 */
static inline void vector_add(int16_t* v1, int16_t* v2)
{
/*
 * Adds the two 16-bit halves of 's1' to the matching halves of 'sum' without
 * letting a carry out of the low half reach the high half:
 *   - the full 32-bit add into 's1' yields the correct low half,
 *   - a copy of 's1' with its low word cleared is added to 'sum', which
 *     yields the correct high half and leaves the low word of 'sum' untouched,
 *   - the low half is then copied over from 's1'.
 * Each lane therefore wraps independently on overflow.
 */
#define ADDHALFREGS(s1, sum) /* 's1' can be an A or D reg */ \
    "move.l " #s1 ", %%d4 \n" /* 'sum' must be a D reg */ \
    "add.l " #sum ", " #s1 "\n" /* 's1' and %%d4 are clobbered! */ \
    "clr.w %%d4 \n" \
    "add.l %%d4 , " #sum "\n" \
    "move.w " #s1 ", " #sum "\n"

    asm volatile (
#if ORDER > 16
        /* %d5 is the pass counter; only needed when more than one pass runs */
        "moveq.l %[cnt], %%d5 \n"
    "1: \n"
#endif
        /* first 8 elements: v1 into data regs, v2 into address regs */
        "movem.l (%[v1]), %%d0-%%d3 \n"
        "movem.l (%[v2]), %%a0-%%a3 \n"

        ADDHALFREGS(%%a0, %%d0)
        ADDHALFREGS(%%a1, %%d1)
        ADDHALFREGS(%%a2, %%d2)
        ADDHALFREGS(%%a3, %%d3)

        /* store the sums, then step both pointers to the next 8 elements */
        "movem.l %%d0-%%d3, (%[v1]) \n"
        "lea.l (16, %[v1]), %[v1] \n"
        "movem.l (%[v1]), %%d0-%%d3 \n"
        "lea.l (16, %[v2]), %[v2] \n"
        "movem.l (%[v2]), %%a0-%%a3 \n"

        ADDHALFREGS(%%a0, %%d0)
        ADDHALFREGS(%%a1, %%d1)
        ADDHALFREGS(%%a2, %%d2)
        ADDHALFREGS(%%a3, %%d3)

        "movem.l %%d0-%%d3, (%[v1]) \n"
#if ORDER > 16
        /* advance to the next 16-element chunk and loop until %d5 hits zero */
        "lea.l (16, %[v1]), %[v1] \n"
        "lea.l (16, %[v2]), %[v2] \n"
        "subq.l #1, %%d5 \n"
        "bne.s 1b \n"
#endif
        : /* outputs */
        [v1]"+a"(v1),
        [v2]"+a"(v2)
        : /* inputs */
        [cnt]"n"(ORDER>>4)
        : /* clobbers */
        /* "memory": the asm both reads and writes the vectors behind the
           compiler's back */
        "d0", "d1", "d2", "d3", "d4", "d5",
        "a0", "a1", "a2", "a3", "memory"
    );
}
77 | |||
/*
 * In-place element-wise subtraction of 16-bit vectors: v1[i] -= v2[i].
 *
 * One pass of the asm body handles 16 int16_t elements (two groups of four
 * longwords, each longword carrying two 16-bit lanes). When ORDER > 16 the
 * body is repeated ORDER>>4 times, so ORDER elements are processed in total;
 * otherwise the body runs exactly once.
 * NOTE(review): ORDER>>4 truncates, so this presumably assumes ORDER is a
 * multiple of 16 -- confirm against the values the includer defines.
 *
 * v1: destination and minuend; overwritten with the differences.
 * v2: subtrahend; only read.
 *
 * Both pointers are advanced inside the asm ("+a" operands), but since they
 * are by-value parameters the caller's pointers are not affected.
 */
static inline void vector_sub(int16_t* v1, int16_t* v2)
{
/*
 * Computes dif = min - sub separately for the two 16-bit halves, so that a
 * borrow out of the low half never reaches the high half:
 *   - the full 32-bit subtract into 'min' yields the correct low half,
 *   - 'sub' with its low word cleared is subtracted from a copy of 'min',
 *     which yields the correct high half,
 *   - the low half is then copied over from 'min'.
 * Each lane therefore wraps independently on overflow.
 */
#define SUBHALFREGS(min, sub, dif) /* 'min' can be an A or D reg */ \
    "move.l " #min ", " #dif "\n" /* 'sub' and 'dif' must be D regs */ \
    "sub.l " #sub ", " #min "\n" /* 'min' and 'sub' are clobbered! */ \
    "clr.w " #sub "\n" \
    "sub.l " #sub ", " #dif "\n" \
    "move.w " #min ", " #dif "\n"

    asm volatile (
#if ORDER > 16
        /* %d5 is the pass counter; only needed when more than one pass runs */
        "moveq.l %[cnt], %%d5 \n"
    "1: \n"
#endif
        /* v1 goes to address regs, v2 to %d1-%d4. Results land in %d0-%d3:
           each result register after the first reuses the subtrahend register
           of the previous lane, which has already been consumed by then. */
        "movem.l (%[v1]), %%a0-%%a3 \n"
        "movem.l (%[v2]), %%d1-%%d4 \n"

        SUBHALFREGS(%%a0, %%d1, %%d0)
        SUBHALFREGS(%%a1, %%d2, %%d1)
        SUBHALFREGS(%%a2, %%d3, %%d2)
        SUBHALFREGS(%%a3, %%d4, %%d3)

        /* store the differences, then step both pointers to the next 8
           elements */
        "movem.l %%d0-%%d3, (%[v1]) \n"
        "lea.l (16, %[v1]), %[v1] \n"
        "movem.l (%[v1]), %%a0-%%a3 \n"
        "lea.l (16, %[v2]), %[v2] \n"
        "movem.l (%[v2]), %%d1-%%d4 \n"

        SUBHALFREGS(%%a0, %%d1, %%d0)
        SUBHALFREGS(%%a1, %%d2, %%d1)
        SUBHALFREGS(%%a2, %%d3, %%d2)
        SUBHALFREGS(%%a3, %%d4, %%d3)

        "movem.l %%d0-%%d3, (%[v1]) \n"
#if ORDER > 16
        /* advance to the next 16-element chunk and loop until %d5 hits zero */
        "lea.l (16, %[v1]), %[v1] \n"
        "lea.l (16, %[v2]), %[v2] \n"
        "subq.l #1, %%d5 \n"
        "bne.s 1b \n"
#endif
        : /* outputs */
        [v1]"+a"(v1),
        [v2]"+a"(v2)
        : /* inputs */
        [cnt]"n"(ORDER>>4)
        : /* clobbers */
        /* "memory": the asm both reads and writes the vectors behind the
           compiler's back */
        "d0", "d1", "d2", "d3", "d4", "d5",
        "a0", "a1", "a2", "a3", "memory"
    );
}
128 | |||
129 | #define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */ | ||
130 | |||
131 | /* Needs EMAC in signed integer mode! */ | ||
132 | static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) | ||
133 | { | ||
134 | int res = 0; | ||
135 | |||
136 | #define MACBLOCK4 \ | ||
137 | "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" \ | ||
138 | "mac.w %%d0l, %%d1l, (%[v2])+, %%d3, %%acc0\n" \ | ||
139 | "mac.w %%d2u, %%d3u, (%[v1])+, %%d0, %%acc0\n" \ | ||
140 | "mac.w %%d2l, %%d3l, (%[v2])+, %%d1, %%acc0\n" | ||
141 | |||
142 | asm volatile ( | ||
143 | #if ORDER > 32 | ||
144 | "moveq.l %[cnt], %[res] \n" | ||
145 | #endif | ||
146 | "move.l (%[v1])+, %%d0 \n" | ||
147 | "move.l (%[v2])+, %%d1 \n" | ||
148 | "1: \n" | ||
149 | #if ORDER > 16 | ||
150 | MACBLOCK4 | ||
151 | MACBLOCK4 | ||
152 | MACBLOCK4 | ||
153 | MACBLOCK4 | ||
154 | #endif | ||
155 | MACBLOCK4 | ||
156 | MACBLOCK4 | ||
157 | MACBLOCK4 | ||
158 | "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" | ||
159 | "mac.w %%d0l, %%d1l, (%[v2])+, %%d3, %%acc0\n" | ||
160 | #if ORDER > 32 | ||
161 | "mac.w %%d2u, %%d3u, (%[v1])+, %%d0, %%acc0\n" | ||
162 | "mac.w %%d2l, %%d3l, (%[v2])+, %%d1, %%acc0\n" | ||
163 | "subq.l #1, %[res] \n" | ||
164 | "bne.w 1b \n" | ||
165 | #else | ||
166 | "mac.w %%d2u, %%d3u, %%acc0 \n" | ||
167 | "mac.w %%d2l, %%d3l, %%acc0 \n" | ||
168 | #endif | ||
169 | "movclr.l %%acc0, %[res] \n" | ||
170 | : /* outputs */ | ||
171 | [v1]"+a"(v1), | ||
172 | [v2]"+a"(v2), | ||
173 | [res]"=&d"(res) | ||
174 | : /* inputs */ | ||
175 | [cnt]"n"(ORDER>>5) | ||
176 | : /* clobbers */ | ||
177 | "d0", "d1", "d2", "d3" | ||
178 | ); | ||
179 | return res; | ||
180 | } | ||