summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Giacomelli <giac2000@hotmail.com>2010-11-10 18:29:29 +0000
committerMichael Giacomelli <giac2000@hotmail.com>2010-11-10 18:29:29 +0000
commitf51189fa4d16c138c951775a237326b807fdda89 (patch)
treeb1eb6613f15f02da344fb9cabe40ea9ba03ecf23
parent33af0dec28cf31be0ce7195b90546861efcce76f (diff)
downloadrockbox-f51189fa4d16c138c951775a237326b807fdda89.tar.gz
rockbox-f51189fa4d16c138c951775a237326b807fdda89.zip
Commit FS#11709 by me. Introduces ARMv5E optimized iQMF for atrac3 based on packed multiply instructions. Additionally, improves scheduling on arm9 and arm11 and forces cache alignment of buffers on all targets. Accuracy is slightly reduced, but still greater than 16 bit. Clip+ CPU clock required for LP2 files decreases by 13MHz and ARM11 by 18MHz. No performance or accuracy changes on armv4 or non-arm.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@28549 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--apps/codecs/atrac3_rm.c2
-rw-r--r--apps/codecs/libatrac/SOURCES3
-rw-r--r--apps/codecs/libatrac/atrac3.c42
-rw-r--r--apps/codecs/libatrac/atrac3.h4
-rw-r--r--apps/codecs/libatrac/atrac3_armv5e.S163
5 files changed, 205 insertions, 9 deletions
diff --git a/apps/codecs/atrac3_rm.c b/apps/codecs/atrac3_rm.c
index 6c559ec868..bad9831a25 100644
--- a/apps/codecs/atrac3_rm.c
+++ b/apps/codecs/atrac3_rm.c
@@ -30,7 +30,7 @@ CODEC_HEADER
30 30
31static RMContext rmctx; 31static RMContext rmctx;
32static RMPacket pkt; 32static RMPacket pkt;
33static ATRAC3Context q IBSS_ATTR; 33static ATRAC3Context q IBSS_ATTR __attribute__ ((aligned (32)));
34 34
35static void init_rm(RMContext *rmctx) 35static void init_rm(RMContext *rmctx)
36{ 36{
diff --git a/apps/codecs/libatrac/SOURCES b/apps/codecs/libatrac/SOURCES
index 3eaf4c9c3b..85f011cb87 100644
--- a/apps/codecs/libatrac/SOURCES
+++ b/apps/codecs/libatrac/SOURCES
@@ -1,5 +1,8 @@
1atrac3.c 1atrac3.c
2#if defined(CPU_ARM) 2#if defined(CPU_ARM)
3atrac3_arm.S 3atrac3_arm.S
4#if (ARM_ARCH >= 5)
5atrac3_armv5e.S
6#endif
4#endif 7#endif
5 8
diff --git a/apps/codecs/libatrac/atrac3.c b/apps/codecs/libatrac/atrac3.c
index b93b058bb2..f6085fa2fa 100644
--- a/apps/codecs/libatrac/atrac3.c
+++ b/apps/codecs/libatrac/atrac3.c
@@ -55,7 +55,11 @@
55#define FFSWAP(type,a,b) do{type SWAP_tmp= b; b= a; a= SWAP_tmp;}while(0) 55#define FFSWAP(type,a,b) do{type SWAP_tmp= b; b= a; a= SWAP_tmp;}while(0)
56 56
57static VLC spectral_coeff_tab[7]; 57static VLC spectral_coeff_tab[7];
58#if defined(CPU_ARM) && (ARM_ARCH >= 5) /*ARMv5e+ uses 32x16 multiplication*/
59static int16_t qmf_window[48] IBSS_ATTR __attribute__ ((aligned (32)));
60#else
58static int32_t qmf_window[48] IBSS_ATTR; 61static int32_t qmf_window[48] IBSS_ATTR;
62#endif
59static int32_t atrac3_spectrum [2][1024] IBSS_ATTR __attribute__((aligned(16))); 63static int32_t atrac3_spectrum [2][1024] IBSS_ATTR __attribute__((aligned(16)));
60static int32_t atrac3_IMDCT_buf[2][ 512] IBSS_ATTR __attribute__((aligned(16))); 64static int32_t atrac3_IMDCT_buf[2][ 512] IBSS_ATTR __attribute__((aligned(16)));
61static int32_t atrac3_prevFrame[2][1024] IBSS_ATTR; 65static int32_t atrac3_prevFrame[2][1024] IBSS_ATTR;
@@ -118,12 +122,30 @@ static channel_unit channel_units[2] IBSS_ATTR_LARGE_IRAM;
118 * } 122 * }
119 */ 123 */
120 124
121#if defined(CPU_ARM) 125#if defined(CPU_ARM) && (ARM_ARCH >= 5)
122 extern void 126 extern void
123 atrac3_iqmf_dewindowing(int32_t *out, 127 atrac3_iqmf_dewindowing_armv5e(int32_t *out,
124 int32_t *in, 128 int32_t *in,
125 int32_t *win, 129 int16_t *win,
126 unsigned int nIn); 130 unsigned int nIn);
131 static inline void /* ARMv5E+ build: generic dewindowing entry point used by the decoder */
132 atrac3_iqmf_dewindowing(int32_t *out, /* destination samples */
133 int32_t *in, /* input samples */
134 int16_t *win, /* 16-bit window coefficients (int16_t qmf_window on ARMv5E+) */
135 unsigned int nIn) /* outer loop count passed through unchanged */
136 {
137 atrac3_iqmf_dewindowing_armv5e(out, in, win, nIn); /* forward to the packed-multiply assembly kernel; leaving this disabled makes the wrapper a no-op */
138
139 }
140
141
142#elif defined(CPU_ARM)
143 extern void /* generic ARM (pre-ARMv5E) dewindowing; presumably implemented in atrac3_arm.S, which SOURCES builds for CPU_ARM */
144 atrac3_iqmf_dewindowing(int32_t *out,
145 int32_t *in,
146 int32_t *win, /* 32-bit coefficients: this branch builds qmf_window as int32_t[48], not int16_t */
147 unsigned int nIn);
148
127#elif defined (CPU_COLDFIRE) 149#elif defined (CPU_COLDFIRE)
128 #define MULTIPLY_ADD_BLOCK \ 150 #define MULTIPLY_ADD_BLOCK \
129 "movem.l (%[win]), %%d0-%%d7 \n\t" \ 151 "movem.l (%[win]), %%d0-%%d7 \n\t" \
@@ -206,7 +228,9 @@ static channel_unit channel_units[2] IBSS_ATTR_LARGE_IRAM;
206 228
207 out[0] = s2; 229 out[0] = s2;
208 out[1] = s1; 230 out[1] = s1;
231
209 } 232 }
233
210 } 234 }
211#endif 235#endif
212 236
@@ -244,6 +268,7 @@ atrac3_imdct_windowing(int32_t *buffer,
244 268
245static void iqmf (int32_t *inlo, int32_t *inhi, unsigned int nIn, int32_t *pOut, int32_t *delayBuf, int32_t *temp) 269static void iqmf (int32_t *inlo, int32_t *inhi, unsigned int nIn, int32_t *pOut, int32_t *delayBuf, int32_t *temp)
246{ 270{
271
247 /* Restore the delay buffer */ 272 /* Restore the delay buffer */
248 memcpy(temp, delayBuf, 46*sizeof(int32_t)); 273 memcpy(temp, delayBuf, 46*sizeof(int32_t));
249 274
@@ -274,6 +299,7 @@ static void IMLT(int32_t *pInput, int32_t *pOutput)
274 299
275 /* Windowing. */ 300 /* Windowing. */
276 atrac3_imdct_windowing(pOutput, window_lookup); 301 atrac3_imdct_windowing(pOutput, window_lookup);
302
277} 303}
278 304
279 305
@@ -320,9 +346,13 @@ static void init_atrac3_transforms(void)
320 /* Generate the QMF window. */ 346 /* Generate the QMF window. */
321 for (i=0 ; i<24; i++) { 347 for (i=0 ; i<24; i++) {
322 s = qmf_48tap_half_fix[i] << 1; 348 s = qmf_48tap_half_fix[i] << 1;
323 qmf_window[i] = s; 349 #if defined(CPU_ARM) && (ARM_ARCH >= 5)
324 qmf_window[47 - i] = s; 350 qmf_window[i] = qmf_window[47-i] = (int16_t)((s+(1<<15))>>16);
351 #else
352 qmf_window[i] = qmf_window[47-i] = s;
353 #endif
325 } 354 }
355
326} 356}
327 357
328 358
@@ -1229,7 +1259,7 @@ int atrac3_decode_init(ATRAC3Context *q, struct mp3entry *id3)
1229 vlcs_initialized = 1; 1259 vlcs_initialized = 1;
1230 1260
1231 } 1261 }
1232 1262
1233 init_atrac3_transforms(); 1263 init_atrac3_transforms();
1234 1264
1235 /* init the joint-stereo decoding data */ 1265 /* init the joint-stereo decoding data */
diff --git a/apps/codecs/libatrac/atrac3.h b/apps/codecs/libatrac/atrac3.h
index 74dd992e1b..afe582ab72 100644
--- a/apps/codecs/libatrac/atrac3.h
+++ b/apps/codecs/libatrac/atrac3.h
@@ -67,6 +67,7 @@ typedef struct {
67} channel_unit; 67} channel_unit;
68 68
69typedef struct { 69typedef struct {
70 int32_t outSamples[2048];
70 GetBitContext gb; 71 GetBitContext gb;
71 //@{ 72 //@{
72 /** stream data */ 73 /** stream data */
@@ -90,8 +91,7 @@ typedef struct {
90 int weighting_delay[6]; 91 int weighting_delay[6];
91 //@} 92 //@}
92 //@{ 93 //@{
93 /** data buffers */ 94 /** data buffers */
94 int32_t outSamples[2048];
95 uint8_t decoded_bytes_buffer[1024]; 95 uint8_t decoded_bytes_buffer[1024];
96 int32_t tempBuf[1070]; 96 int32_t tempBuf[1070];
97 //@} 97 //@}
diff --git a/apps/codecs/libatrac/atrac3_armv5e.S b/apps/codecs/libatrac/atrac3_armv5e.S
new file mode 100644
index 0000000000..1add5faef5
--- /dev/null
+++ b/apps/codecs/libatrac/atrac3_armv5e.S
@@ -0,0 +1,163 @@
1/***************************************************************************
2 * __________ __ ___.
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 * \/ \/ \/ \/ \/
8 * $Id:
9 *
10 * Copyright (C) 2010 by Michael Giacomelli
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
16 *
17 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
18 * KIND, either express or implied.
19 *
20 ****************************************************************************/
21
22#include "config.h"
23
24 .section .text, "ax", %progbits
25
26
27/****************************************************************************
28 * atrac3_iqmf_dewindowing_armv5e(int32_t *out,
29 * int32_t *in,
30 * int16_t *win,
31 * unsigned int nIn);
32 *
33 * Dewindowing step within iqmf of atrac3 synthesis using 16 bit filter
34 * coefficients and armv5e packed multiply instructions. Uses 2.5 cycles
35 * per filter coefficient (ideal). Benchmarked 3.54 per coefficient (Clip+).
36 *
37 * Reference implementation:
38 *
39 * for (j = nIn; j != 0; j--) {
40 * s1 = fixmul32(in[0], win[0]);
41 * s2 = fixmul32(in[1], win[1]);
42 * for (i = 2; i < 48; i += 2) {
43 * s1 += fixmul32(in[i ], win[i ]);
44 * s2 += fixmul32(in[i+1], win[i+1]);
45 * }
46 * out[0] = s2 << 1;
47 * out[1] = s1 << 1;
48 * in += 2;
49 * out += 2;
50 * }
51 * Note: r12 is a scratch register and can be used without being saved and restored.
52 ****************************************************************************/
53 .align 2
54 .global atrac3_iqmf_dewindowing_armv5e
55 .type atrac3_iqmf_dewindowing_armv5e, %function
56
57atrac3_iqmf_dewindowing_armv5e:
58 /* r0 = dest (post-incremented by the two-word store each iteration) */
59 /* r1 = input samples */
60 /* r2 = window coefficients (16-bit, two packed per loaded word) */
61 /* r3 = counter (nIn, outer loop iterations) */
62 stmfd sp!, {r4-r11, lr} /* save non-scratch registers */
63
64.iqmf_dewindow_outer_loop: /* outer loop 0...counter-1 */
65 /* 0.. 7 */
66 ldmia r2!, {r4, r5, r8, r9} /* load win[0..7] */
67 ldmia r1!, {r6, r7, r10, r11} /* load in[0..3] to avoid stall on arm11 */
68 smulwb lr, r6, r4 /* s1 = in[0] * win[0] */
69 smulwt r12, r7, r4 /* s2 = in[1] * win[1] */
70 smlawb lr, r10, r5, lr /* s1 += in[i ] * win[i ] >> 16 */
71 smlawt r12, r11,r5, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
72
73 ldmia r1!, {r6, r7, r10, r11} /* load in[i...i+3] */
74 smlawb lr, r6, r8, lr /* s1 += in[i ] * win[i ] >> 16 */
75 smlawt r12, r7, r8, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
76 smlawb lr, r10, r9, lr /* s1 += in[i ] * win[i ] >> 16 */
77 smlawt r12, r11, r9, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
78
79 /* 8..15 */
80 ldmia r2!, {r4, r5, r8, r9} /* load win[8..15] */
81 ldmia r1!, {r6, r7, r10, r11} /* load in[i...i+3] */
82 smlawb lr, r6, r4, lr /* s1 += in[i ] * win[i ] >> 16 */
83 smlawt r12, r7, r4, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
84 smlawb lr, r10, r5, lr /* s1 += in[i ] * win[i ] >> 16 */
85 smlawt r12, r11,r5, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
86
87 ldmia r1!, {r6, r7, r10, r11} /* load in[i...i+3] */
88 smlawb lr, r6, r8, lr /* s1 += in[i ] * win[i ] >> 16 */
89 smlawt r12, r7, r8, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
90 smlawb lr, r10, r9, lr /* s1 += in[i ] * win[i ] >> 16 */
91 smlawt r12, r11,r9, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
92
93 /* 16..23 */
94 ldmia r2!, {r4, r5, r8, r9} /* load win[16..23] */
95 ldmia r1!, {r6, r7, r10, r11} /* load in[i...i+3] */
96 smlawb lr, r6, r4, lr /* s1 += in[i ] * win[i ] >> 16 */
97 smlawt r12, r7, r4, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
98 smlawb lr, r10, r5, lr /* s1 += in[i ] * win[i ] >> 16 */
99 smlawt r12, r11,r5, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
100
101 ldmia r1!, {r6, r7, r10, r11} /* load in[i...i+3] */
102 smlawb lr, r6, r8, lr /* s1 += in[i ] * win[i ] >> 16 */
103 smlawt r12, r7, r8, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
104 smlawb lr, r10, r9, lr /* s1 += in[i ] * win[i ] >> 16 */
105 smlawt r12, r11,r9, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
106
107 /* 24..31 */
108 ldmia r2!, {r4, r5, r8, r9} /* load win[24..31] */
109 ldmia r1!, {r6, r7, r10, r11} /* load in[i...i+3] */
110 smlawb lr, r6, r4, lr /* s1 += in[i ] * win[i ] >> 16 */
111 smlawt r12, r7, r4, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
112 smlawb lr, r10, r5, lr /* s1 += in[i ] * win[i ] >> 16 */
113 smlawt r12, r11,r5, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
114
115 ldmia r1!, {r6, r7, r10, r11} /* load in[i...i+3] */
116 smlawb lr, r6, r8, lr /* s1 += in[i ] * win[i ] >> 16 */
117 smlawt r12, r7, r8, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
118 smlawb lr, r10, r9, lr /* s1 += in[i ] * win[i ] >> 16 */
119 smlawt r12, r11,r9, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
120
121 /* 32..39 */
122 ldmia r2!, {r4, r5, r8, r9} /* load win[32..39] */
123 ldmia r1!, {r6, r7, r10, r11} /* load in[i...i+3] */
124 smlawb lr, r6, r4, lr /* s1 += in[i ] * win[i ] >> 16 */
125 smlawt r12, r7, r4, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
126 smlawb lr, r10, r5, lr /* s1 += in[i ] * win[i ] >> 16 */
127 smlawt r12, r11,r5, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
128
129 ldmia r1!, {r6, r7, r10, r11} /* load in[i...i+3] */
130 smlawb lr, r6, r8, lr /* s1 += in[i ] * win[i ] >> 16 */
131 smlawt r12, r7, r8, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
132 smlawb lr, r10, r9, lr /* s1 += in[i ] * win[i ] >> 16 */
133 smlawt r12, r11,r9, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
134
135 /* 40..47 */
136 ldmia r2!, {r4, r5, r8, r9} /* load win[40..47] */
137 ldmia r1!, {r6, r7, r10, r11} /* load in[i...i+3] */
138 smlawb lr, r6, r4, lr /* s1 += in[i ] * win[i ] >> 16 */
139 smlawt r12, r7, r4, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
140 smlawb lr, r10, r5, lr /* s1 += in[i ] * win[i ] >> 16 */
141 smlawt r12, r11,r5, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
142
143 ldmia r1!, {r6, r7, r10, r11} /* load in[i...i+3] */
144 smlawb lr, r6, r8, lr /* s1 += in[i ] * win[i ] >> 16 */
145 smlawt r12, r7, r8, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
146 smlawb lr, r10, r9, lr /* s1 += in[i ] * win[i ] >> 16 */
147 smlawt r12, r11,r9, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
148
149
150 mov lr , lr , lsl #1 /* s1 <<= 1 */
151 mov r12, r12, lsl #1 /* s2 <<= 1 */
152
153 stmia r0!, {r12, lr} /* store result out[0]=s2, out[1]=s1 */
154 sub r1, r1, #184 /* 48 words (192 bytes) were read; roll back 46 entries = 184 bytes so in advances by 2 */
155 sub r2, r2, #96 /* roll back 48 entries * 2 bytes = 96 bytes = win[0] */
156
157 subs r3, r3, #1 /* outer loop -= 1 */
158 bgt .iqmf_dewindow_outer_loop
159
160 ldmpc regs=r4-r11 /* restore registers */
161
162.atrac3_iqmf_dewindowing_armv5e_end:
163 .size atrac3_iqmf_dewindowing_armv5e,.atrac3_iqmf_dewindowing_armv5e_end-atrac3_iqmf_dewindowing_armv5e