summary refs log tree commit diff
path: root/apps/codecs/libmusepack
diff options
context:
space:
mode:
Diffstat (limited to 'apps/codecs/libmusepack')
-rw-r--r-- apps/codecs/libmusepack/synth_filter.c 13
-rwxr-xr-x apps/codecs/libmusepack/synth_filter_arm.S 123
2 files changed, 73 insertions, 63 deletions
diff --git a/apps/codecs/libmusepack/synth_filter.c b/apps/codecs/libmusepack/synth_filter.c
index ae94741368..9c8d27eed2 100644
--- a/apps/codecs/libmusepack/synth_filter.c
+++ b/apps/codecs/libmusepack/synth_filter.c
@@ -54,9 +54,16 @@
54 54
55 // in this configuration a post-shift by >>1 is needed after synthesis 55 // in this configuration a post-shift by >>1 is needed after synthesis
56 #else 56 #else
57 // saturate to +/- 2^31 (= value << (31-17)), D-values are +/- 2^17 57 #if defined(CPU_ARM)
58 #define D(value) (value << (14)) 58 // do not up-scale D-values to achieve higher speed in smull/mlal
59 59 // operations. saves ~14/8 = 1.75 cycles per multiplication
60 #define D(value) (value)
61
62 // in this configuration a post-shift by >>16 is needed after synthesis
63 #else
64 // saturate to +/- 2^31 (= value << (31-17)), D-values are +/- 2^17
65 #define D(value) (value << (14))
66 #endif
60 // do not perform pre-shift 67 // do not perform pre-shift
61 #define MPC_V_PRESHIFT(X) (X) 68 #define MPC_V_PRESHIFT(X) (X)
62 #endif 69 #endif
diff --git a/apps/codecs/libmusepack/synth_filter_arm.S b/apps/codecs/libmusepack/synth_filter_arm.S
index 7b2d2dfd23..8bc6bd3c5c 100755
--- a/apps/codecs/libmusepack/synth_filter_arm.S
+++ b/apps/codecs/libmusepack/synth_filter_arm.S
@@ -26,7 +26,8 @@
26 * 26 *
27 * 2nd step within synthesis filter. Does the dewindowing. 27 * 2nd step within synthesis filter. Does the dewindowing.
28 * 32=32x32 multiplies (OPTIMIZE_FOR_SPEED) 28 * 32=32x32 multiplies (OPTIMIZE_FOR_SPEED)
29 * Uses pre-shifted V[] and D[] values. 29 * Uses pre-shifted V[] and D[] values. D[] will always be the second operand
30 * of mul/mla to achieve higher speed as D[] has lower amplitude than V[].
30 ****************************************************************************/ 31 ****************************************************************************/
31#if defined(OPTIMIZE_FOR_SPEED) 32#if defined(OPTIMIZE_FOR_SPEED)
32 .align 2 33 .align 2
@@ -42,40 +43,40 @@ mpc_decoder_windowing_D:
42 43
43 mov lr, #32 44 mov lr, #32
44.loop32: 45.loop32:
45 ldmia r2!, { r3-r10 } /* load first 8 window coefficients */ 46 ldmia r2!, { r3-r10 } /* load D[00..07] */
46 ldr r11, [r1] /* 0 */ 47 ldr r11, [r1] /* 0 */
47 mul r12, r3, r11 48 mul r12, r11, r3
48 ldr r11, [r1, #96*4] /* 1 */ 49 ldr r11, [r1, #96*4] /* 1 */
49 mla r12, r4, r11, r12 50 mla r12, r11, r4, r12
50 ldr r11, [r1, #128*4] /* 2 */ 51 ldr r11, [r1, #128*4] /* 2 */
51 mla r12, r5, r11, r12 52 mla r12, r11, r5, r12
52 ldr r11, [r1, #224*4] /* 3 */ 53 ldr r11, [r1, #224*4] /* 3 */
53 mla r12, r6, r11, r12 54 mla r12, r11, r6, r12
54 ldr r11, [r1, #256*4] /* 4 */ 55 ldr r11, [r1, #256*4] /* 4 */
55 mla r12, r7, r11, r12 56 mla r12, r11, r7, r12
56 ldr r11, [r1, #352*4] /* 5 */ 57 ldr r11, [r1, #352*4] /* 5 */
57 mla r12, r8, r11, r12 58 mla r12, r11, r8, r12
58 ldr r11, [r1, #384*4] /* 6 */ 59 ldr r11, [r1, #384*4] /* 6 */
59 mla r12, r9, r11, r12 60 mla r12, r11, r9, r12
60 ldr r11, [r1, #480*4] /* 7 */ 61 ldr r11, [r1, #480*4] /* 7 */
61 mla r12, r10, r11, r12 62 mla r12, r11, r10, r12
62 ldmia r2!, { r3-r10 } /* load last 8 window coefficients */ 63 ldmia r2!, { r3-r10 } /* load D[08..15] */
63 ldr r11, [r1, #512*4] /* 8 */ 64 ldr r11, [r1, #512*4] /* 8 */
64 mla r12, r3, r11, r12 65 mla r12, r11, r3, r12
65 ldr r11, [r1, #608*4] /* 9 */ 66 ldr r11, [r1, #608*4] /* 9 */
66 mla r12, r4, r11, r12 67 mla r12, r11, r4, r12
67 ldr r11, [r1, #640*4] /* 10 */ 68 ldr r11, [r1, #640*4] /* 10 */
68 mla r12, r5, r11, r12 69 mla r12, r11, r5, r12
69 ldr r11, [r1, #736*4] /* 11 */ 70 ldr r11, [r1, #736*4] /* 11 */
70 mla r12, r6, r11, r12 71 mla r12, r11, r6, r12
71 ldr r11, [r1, #768*4] /* 12 */ 72 ldr r11, [r1, #768*4] /* 12 */
72 mla r12, r7, r11, r12 73 mla r12, r11, r7, r12
73 ldr r11, [r1, #864*4] /* 13 */ 74 ldr r11, [r1, #864*4] /* 13 */
74 mla r12, r8, r11, r12 75 mla r12, r11, r8, r12
75 ldr r11, [r1, #896*4] /* 14 */ 76 ldr r11, [r1, #896*4] /* 14 */
76 mla r12, r9, r11, r12 77 mla r12, r11, r9, r12
77 ldr r11, [r1, #992*4] /* 15 */ 78 ldr r11, [r1, #992*4] /* 15 */
78 mla r12, r10, r11, r12 79 mla r12, r11, r10, r12
79 mov r12, r12, asr #1 /* post shift to compensate for pre-shifting */ 80 mov r12, r12, asr #1 /* post shift to compensate for pre-shifting */
80 str r12, [r0], #4 /* store Data */ 81 str r12, [r0], #4 /* store Data */
81 add r1, r1, #4 /* V++ */ 82 add r1, r1, #4 /* V++ */
@@ -92,9 +93,8 @@ mpc_decoder_windowing_D:
92 * 93 *
93 * 2nd step within synthesis filter. Does the dewindowing. 94 * 2nd step within synthesis filter. Does the dewindowing.
94 * 64=32x32 multiplies 95 * 64=32x32 multiplies
95 * Drops lo-part of 64bit multiply results and will therefore lose 1 bit 96 * Uses un-shifted D[]-values. D[] will always be the second operand of
96 * accuracy. The decoder output is binary identical as this imprecision is 97 * smull/smlal to achieve higher speed as D[] has lower amplitude than V[].
97 * far below the output's 16bit resolution.
98 ****************************************************************************/ 98 ****************************************************************************/
99 .align 2 99 .align 2
100 .global mpc_decoder_windowing_D 100 .global mpc_decoder_windowing_D
@@ -105,52 +105,55 @@ mpc_decoder_windowing_D:
105 /* r2 = D[] */ 105 /* r2 = D[] */
106 /* lr = counter */ 106 /* lr = counter */
107 107
108 stmfd sp!, {r4-r12, lr} 108 stmfd sp!, {r4-r9, lr}
109 109
110 mov lr, #32 110 mov lr, #32
111.loop32: 111.loop32:
112 ldmia r2!, { r3-r10 } /* load first 8 window coefficients */ 112 ldmia r2!, { r3-r6 } /* load D[00..03] */
113 ldr r11, [r1] /* 0 */ 113 ldr r7, [r1] /* 0 */
114 smull r11, r12, r3, r11 114 smull r8, r9, r7, r3
115 ldr r11, [r1, #96*4] /* 1 */ 115 ldr r7, [r1, #96*4] /* 1 */
116 smlal r11, r12, r4, r11 116 smlal r8, r9, r7, r4
117 ldr r11, [r1, #128*4] /* 2 */ 117 ldr r7, [r1, #128*4] /* 2 */
118 smlal r11, r12, r5, r11 118 smlal r8, r9, r7, r5
119 ldr r11, [r1, #224*4] /* 3 */ 119 ldr r7, [r1, #224*4] /* 3 */
120 smlal r11, r12, r6, r11 120 smlal r8, r9, r7, r6
121 ldr r11, [r1, #256*4] /* 4 */ 121 ldmia r2!, { r3-r6 } /* load D[04..07] */
122 smlal r11, r12, r7, r11 122 ldr r7, [r1, #256*4] /* 4 */
123 ldr r11, [r1, #352*4] /* 5 */ 123 smlal r8, r9, r7, r3
124 smlal r11, r12, r8, r11 124 ldr r7, [r1, #352*4] /* 5 */
125 ldr r11, [r1, #384*4] /* 6 */ 125 smlal r8, r9, r7, r4
126 smlal r11, r12, r9, r11 126 ldr r7, [r1, #384*4] /* 6 */
127 ldr r11, [r1, #480*4] /* 7 */ 127 smlal r8, r9, r7, r5
128 smlal r11, r12, r10, r11 128 ldr r7, [r1, #480*4] /* 7 */
129 ldmia r2!, { r3-r10 } /* load last 8 window coefficients */ 129 smlal r8, r9, r7, r6
130 ldr r11, [r1, #512*4] /* 8 */ 130 ldmia r2!, { r3-r6 } /* load D[08..11] */
131 smlal r11, r12, r3, r11 131 ldr r7, [r1, #512*4] /* 8 */
132 ldr r11, [r1, #608*4] /* 9 */ 132 smlal r8, r9, r7, r3
133 smlal r11, r12, r4, r11 133 ldr r7, [r1, #608*4] /* 9 */
134 ldr r11, [r1, #640*4] /* 10 */ 134 smlal r8, r9, r7, r4
135 smlal r11, r12, r5, r11 135 ldr r7, [r1, #640*4] /* 10 */
136 ldr r11, [r1, #736*4] /* 11 */ 136 smlal r8, r9, r7, r5
137 smlal r11, r12, r6, r11 137 ldr r7, [r1, #736*4] /* 11 */
138 ldr r11, [r1, #768*4] /* 12 */ 138 smlal r8, r9, r7, r6
139 smlal r11, r12, r7, r11 139 ldmia r2!, { r3-r6 } /* load D[12..15] */
140 ldr r11, [r1, #864*4] /* 13 */ 140 ldr r7, [r1, #768*4] /* 12 */
141 smlal r11, r12, r8, r11 141 smlal r8, r9, r7, r3
142 ldr r11, [r1, #896*4] /* 14 */ 142 ldr r7, [r1, #864*4] /* 13 */
143 smlal r11, r12, r9, r11 143 smlal r8, r9, r7, r4
144 ldr r11, [r1, #992*4] /* 15 */ 144 ldr r7, [r1, #896*4] /* 14 */
145 smlal r11, r12, r10, r11 145 smlal r8, r9, r7, r5
146 mov r4, r12, lsl #2 /* get result from hi-part, lose 2 bits */ 146 ldr r7, [r1, #992*4] /* 15 */
147 str r4, [r0], #4 /* store Data */ 147 smlal r8, r9, r7, r6
148 mov r8, r8, lsr #16
149 orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16) */
150 str r8, [r0], #4 /* store Data */
148 add r1, r1, #4 /* V++ */ 151 add r1, r1, #4 /* V++ */
149 152
150 subs lr, lr, #1 153 subs lr, lr, #1
151 bgt .loop32 154 bgt .loop32
152 155
153 ldmfd sp!, {r4-r12, pc} 156 ldmfd sp!, {r4-r9, pc}
154.mpc_dewindowing_end: 157.mpc_dewindowing_end:
155 .size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D 158 .size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D
156#endif 159#endif