summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Giacomelli <giac2000@hotmail.com>2013-01-01 02:35:15 +0100
committerMichael Giacomelli <giac2000@hotmail.com>2013-01-21 01:51:38 +0100
commita2ab22efbf93981f9a86b6b06dc6d3c2f1167728 (patch)
treef8f587c908de91d972df251821907f6538c083be
parent0c87e02631d954b5b8b0ec584bd60db77b60427e (diff)
downloadrockbox-a2ab22efbf93981f9a86b6b06dc6d3c2f1167728.tar.gz
rockbox-a2ab22efbf93981f9a86b6b06dc6d3c2f1167728.zip
ARMv5 optimized complex multiply function for libopus.
Speeds up decoding of 128k Opus files by 1.2 MHz on AMSv2. The rounding error is 1 bit, because KissFFT uses a 15-bit shift rather than a 16-bit shift. Also change an LDMIA in the ARMv4 code to LDM, as the pointer should not be incremented.
Change-Id: I626a207c6a056a1984e33cfe89415c35d0caed93
Reviewed-on: http://gerrit.rockbox.org/377
Reviewed-by: Michael Giacomelli <giac2000@hotmail.com>
Tested-by: Michael Giacomelli <giac2000@hotmail.com>
-rw-r--r--lib/rbcodec/codecs/libopus/celt/_kiss_fft_guts.h26
1 files changed, 25 insertions, 1 deletions
diff --git a/lib/rbcodec/codecs/libopus/celt/_kiss_fft_guts.h b/lib/rbcodec/codecs/libopus/celt/_kiss_fft_guts.h
index b1fe8fbeb7..63e2548843 100644
--- a/lib/rbcodec/codecs/libopus/celt/_kiss_fft_guts.h
+++ b/lib/rbcodec/codecs/libopus/celt/_kiss_fft_guts.h
@@ -81,10 +81,13 @@
81 : "d0", "d1", "d2", "d3", "cc"); \ 81 : "d0", "d1", "d2", "d3", "cc"); \
82 } 82 }
83#elif defined(CPU_ARM) 83#elif defined(CPU_ARM)
84#if (ARM_ARCH < 5)
85
86
84# define C_MULC(m,a,b) \ 87# define C_MULC(m,a,b) \
85 { \ 88 { \
86 asm volatile( \ 89 asm volatile( \
87 "ldmia %[ap], {r0,r1} \n\t" \ 90 "ldm %[ap], {r0,r1} \n\t" \
88 "ldrsh r2, [%[bp], #0] \n\t" \ 91 "ldrsh r2, [%[bp], #0] \n\t" \
89 "ldrsh r3, [%[bp], #2] \n\t" \ 92 "ldrsh r3, [%[bp], #2] \n\t" \
90 \ 93 \
@@ -103,6 +106,27 @@
103 : "r0", "r1", "r2", "r3", "r4"); \ 106 : "r0", "r1", "r2", "r3", "r4"); \
104} 107}
105#else 108#else
/*
 * ARMv5 variant of C_MULC: same result as the ARMv4 version, but built on
 * the packed 32x16 multiplies (SMULWy / SMLAWy).
 *
 *   m.r = a.r*b.r + a.i*b.i
 *   m.i = a.i*b.r - a.r*b.i
 *
 * Operand access:
 *   a - read as two consecutive 32-bit words (r0 = a.r, r1 = a.i) via LDM.
 *   b - read with ONE 32-bit LDR and used as two packed 16-bit halves:
 *       bottom half = b.r, top half = b.i.
 *       NOTE(review): this presumes b is two adjacent 16-bit fields, is
 *       word-aligned, and that the target is little-endian -- confirm
 *       against the twiddle type definition.
 *
 * Scaling: SMULWy/SMLAWy keep the upper 32 bits of the 32x16 product
 * (a 16-bit shift); the "lsl #1" afterwards brings the result to the 15-bit
 * shift used by the other implementations, so the lowest bit of each result
 * is always zero (the 1-bit rounding difference).
 *
 * The asm dereferences a and b through the pointers in %[ap]/%[bp].  The
 * unnamed "m" inputs exist only to tell the compiler that those objects are
 * read by the asm, so their current values must be in memory beforehand;
 * "asm volatile" alone does not imply that.
 *
 * Scratch registers: r0, r1, r2, r4.
 */
# define C_MULC(m,a,b) \
    { \
    asm volatile( \
              "ldm %[ap], {r0,r1}          \n\t" /*r0=a.r, r1=a.i*/ \
              "ldr r2, [%[bp], #0]         \n\t" /*r2=b.i:b.r packed*/ \
              \
              "smulwb r4, r0, r2           \n\t" /*r4=a.r*b.r*/ \
              "smlawt %[mr], r1, r2, r4    \n\t" /*m.r=r4+a.i*b.i*/ \
              "mov %[mr], %[mr], lsl #1    \n\t" /*Q15 not Q16*/ \
              \
              "smulwb r1, r1, r2           \n\t" /*r1=a.i*b.r*/ \
              "smulwt r4, r0, r2           \n\t" /*r4=a.r*b.i*/ \
              "sub %[mi], r1, r4           \n\t" /*m.i=a.i*b.r-a.r*b.i*/ \
              "mov %[mi], %[mi], lsl #1    \n\t" /*Q15 not Q16*/ \
              : [mr] "=r" ((m).r), [mi] "=r" ((m).i) \
              : [ap] "r" (&(a)), [bp] "r" (&(b)), \
                "m" (a), "m" (b) /*asm reads *ap and *bp*/ \
              : "r0", "r1", "r2", "r4"); \
    }
128#endif /*ARMv5 code*/
129#else
/*
 * Portable C fallback for C_MULC.  Per the expressions below:
 *   m.r = a.r*b.r + a.i*b.i
 *   m.i = a.i*b.r - a.r*b.i
 * i.e. a multiplied by the complex conjugate of b, with the products and
 * the add/subtract delegated to S_MUL, ADD32 and SUB32.
 */
# define C_MULC(m,a,b) \
    do { \
        (m).r = ADD32(S_MUL((a).r, (b).r), S_MUL((a).i, (b).i)); \
        (m).i = SUB32(S_MUL((a).i, (b).r), S_MUL((a).r, (b).i)); \
    } while (0)