diff options
Diffstat (limited to 'apps')
-rw-r--r-- | apps/codecs/libmusepack/synth_filter.c | 13 | ||||
-rwxr-xr-x | apps/codecs/libmusepack/synth_filter_arm.S | 123 |
2 files changed, 73 insertions, 63 deletions
diff --git a/apps/codecs/libmusepack/synth_filter.c b/apps/codecs/libmusepack/synth_filter.c index ae94741368..9c8d27eed2 100644 --- a/apps/codecs/libmusepack/synth_filter.c +++ b/apps/codecs/libmusepack/synth_filter.c | |||
@@ -54,9 +54,16 @@ | |||
54 | 54 | ||
55 | // in this configuration a post-shift by >>1 is needed after synthesis | 55 | // in this configuration a post-shift by >>1 is needed after synthesis |
56 | #else | 56 | #else |
57 | // saturate to +/- 2^31 (= value << (31-17)), D-values are +/- 2^17 | 57 | #if defined(CPU_ARM) |
58 | #define D(value) (value << (14)) | 58 | // do not up-scale D-values to achieve higher speed in smull/smlal |
59 | 59 | // operations. saves ~14/8 = 1.75 cycles per multiplication | |
60 | #define D(value) (value) | ||
61 | |||
62 | // in this configuration a post-shift by >>16 is needed after synthesis | ||
63 | #else | ||
64 | // saturate to +/- 2^31 (= value << (31-17)), D-values are +/- 2^17 | ||
65 | #define D(value) (value << (14)) | ||
66 | #endif | ||
60 | // do not perform pre-shift | 67 | // do not perform pre-shift |
61 | #define MPC_V_PRESHIFT(X) (X) | 68 | #define MPC_V_PRESHIFT(X) (X) |
62 | #endif | 69 | #endif |
diff --git a/apps/codecs/libmusepack/synth_filter_arm.S b/apps/codecs/libmusepack/synth_filter_arm.S index 7b2d2dfd23..8bc6bd3c5c 100755 --- a/apps/codecs/libmusepack/synth_filter_arm.S +++ b/apps/codecs/libmusepack/synth_filter_arm.S | |||
@@ -26,7 +26,8 @@ | |||
26 | * | 26 | * |
27 | * 2nd step within synthesis filter. Does the dewindowing. | 27 | * 2nd step within synthesis filter. Does the dewindowing. |
28 | * 32=32x32 multiplies (OPTIMIZE_FOR_SPEED) | 28 | * 32=32x32 multiplies (OPTIMIZE_FOR_SPEED) |
29 | * Uses pre-shifted V[] and D[] values. | 29 | * Uses pre-shifted V[] and D[] values. D[] will always be the second operand |
30 | * of mul/mla to achieve higher speed as D[] has lower amplitude than V[]. | ||
30 | ****************************************************************************/ | 31 | ****************************************************************************/ |
31 | #if defined(OPTIMIZE_FOR_SPEED) | 32 | #if defined(OPTIMIZE_FOR_SPEED) |
32 | .align 2 | 33 | .align 2 |
@@ -42,40 +43,40 @@ mpc_decoder_windowing_D: | |||
42 | 43 | ||
43 | mov lr, #32 | 44 | mov lr, #32 |
44 | .loop32: | 45 | .loop32: |
45 | ldmia r2!, { r3-r10 } /* load first 8 window coefficients */ | 46 | ldmia r2!, { r3-r10 } /* load D[00..07] */ |
46 | ldr r11, [r1] /* 0 */ | 47 | ldr r11, [r1] /* 0 */ |
47 | mul r12, r3, r11 | 48 | mul r12, r11, r3 |
48 | ldr r11, [r1, #96*4] /* 1 */ | 49 | ldr r11, [r1, #96*4] /* 1 */ |
49 | mla r12, r4, r11, r12 | 50 | mla r12, r11, r4, r12 |
50 | ldr r11, [r1, #128*4] /* 2 */ | 51 | ldr r11, [r1, #128*4] /* 2 */ |
51 | mla r12, r5, r11, r12 | 52 | mla r12, r11, r5, r12 |
52 | ldr r11, [r1, #224*4] /* 3 */ | 53 | ldr r11, [r1, #224*4] /* 3 */ |
53 | mla r12, r6, r11, r12 | 54 | mla r12, r11, r6, r12 |
54 | ldr r11, [r1, #256*4] /* 4 */ | 55 | ldr r11, [r1, #256*4] /* 4 */ |
55 | mla r12, r7, r11, r12 | 56 | mla r12, r11, r7, r12 |
56 | ldr r11, [r1, #352*4] /* 5 */ | 57 | ldr r11, [r1, #352*4] /* 5 */ |
57 | mla r12, r8, r11, r12 | 58 | mla r12, r11, r8, r12 |
58 | ldr r11, [r1, #384*4] /* 6 */ | 59 | ldr r11, [r1, #384*4] /* 6 */ |
59 | mla r12, r9, r11, r12 | 60 | mla r12, r11, r9, r12 |
60 | ldr r11, [r1, #480*4] /* 7 */ | 61 | ldr r11, [r1, #480*4] /* 7 */ |
61 | mla r12, r10, r11, r12 | 62 | mla r12, r11, r10, r12 |
62 | ldmia r2!, { r3-r10 } /* load last 8 window coefficients */ | 63 | ldmia r2!, { r3-r10 } /* load D[08..15] */ |
63 | ldr r11, [r1, #512*4] /* 8 */ | 64 | ldr r11, [r1, #512*4] /* 8 */ |
64 | mla r12, r3, r11, r12 | 65 | mla r12, r11, r3, r12 |
65 | ldr r11, [r1, #608*4] /* 9 */ | 66 | ldr r11, [r1, #608*4] /* 9 */ |
66 | mla r12, r4, r11, r12 | 67 | mla r12, r11, r4, r12 |
67 | ldr r11, [r1, #640*4] /* 10 */ | 68 | ldr r11, [r1, #640*4] /* 10 */ |
68 | mla r12, r5, r11, r12 | 69 | mla r12, r11, r5, r12 |
69 | ldr r11, [r1, #736*4] /* 11 */ | 70 | ldr r11, [r1, #736*4] /* 11 */ |
70 | mla r12, r6, r11, r12 | 71 | mla r12, r11, r6, r12 |
71 | ldr r11, [r1, #768*4] /* 12 */ | 72 | ldr r11, [r1, #768*4] /* 12 */ |
72 | mla r12, r7, r11, r12 | 73 | mla r12, r11, r7, r12 |
73 | ldr r11, [r1, #864*4] /* 13 */ | 74 | ldr r11, [r1, #864*4] /* 13 */ |
74 | mla r12, r8, r11, r12 | 75 | mla r12, r11, r8, r12 |
75 | ldr r11, [r1, #896*4] /* 14 */ | 76 | ldr r11, [r1, #896*4] /* 14 */ |
76 | mla r12, r9, r11, r12 | 77 | mla r12, r11, r9, r12 |
77 | ldr r11, [r1, #992*4] /* 15 */ | 78 | ldr r11, [r1, #992*4] /* 15 */ |
78 | mla r12, r10, r11, r12 | 79 | mla r12, r11, r10, r12 |
79 | mov r12, r12, asr #1 /* post shift to compensate for pre-shifting */ | 80 | mov r12, r12, asr #1 /* post shift to compensate for pre-shifting */ |
80 | str r12, [r0], #4 /* store Data */ | 81 | str r12, [r0], #4 /* store Data */ |
81 | add r1, r1, #4 /* V++ */ | 82 | add r1, r1, #4 /* V++ */ |
@@ -92,9 +93,8 @@ mpc_decoder_windowing_D: | |||
92 | * | 93 | * |
93 | * 2nd step within synthesis filter. Does the dewindowing. | 94 | * 2nd step within synthesis filter. Does the dewindowing. |
94 | * 64=32x32 multiplies | 95 | * 64=32x32 multiplies |
95 | * Drops lo-part of 64bit multiply results and will therefore lose 1 bit of | 96 | * Uses un-shifted D[]-values. D[] will always be the second operand of |
96 | * accuracy. The decoder output is binary identical as this imprecision is | 97 | * smull/smlal to achieve higher speed as D[] has lower amplitude than V[]. |
97 | * far below the output's 16bit resolution. | ||
98 | ****************************************************************************/ | 98 | ****************************************************************************/ |
99 | .align 2 | 99 | .align 2 |
100 | .global mpc_decoder_windowing_D | 100 | .global mpc_decoder_windowing_D |
@@ -105,52 +105,55 @@ mpc_decoder_windowing_D: | |||
105 | /* r2 = D[] */ | 105 | /* r2 = D[] */ |
106 | /* lr = counter */ | 106 | /* lr = counter */ |
107 | 107 | ||
108 | stmfd sp!, {r4-r12, lr} | 108 | stmfd sp!, {r4-r9, lr} |
109 | 109 | ||
110 | mov lr, #32 | 110 | mov lr, #32 |
111 | .loop32: | 111 | .loop32: |
112 | ldmia r2!, { r3-r10 } /* load first 8 window coefficients */ | 112 | ldmia r2!, { r3-r6 } /* load D[00..03] */ |
113 | ldr r11, [r1] /* 0 */ | 113 | ldr r7, [r1] /* 0 */ |
114 | smull r11, r12, r3, r11 | 114 | smull r8, r9, r7, r3 |
115 | ldr r11, [r1, #96*4] /* 1 */ | 115 | ldr r7, [r1, #96*4] /* 1 */ |
116 | smlal r11, r12, r4, r11 | 116 | smlal r8, r9, r7, r4 |
117 | ldr r11, [r1, #128*4] /* 2 */ | 117 | ldr r7, [r1, #128*4] /* 2 */ |
118 | smlal r11, r12, r5, r11 | 118 | smlal r8, r9, r7, r5 |
119 | ldr r11, [r1, #224*4] /* 3 */ | 119 | ldr r7, [r1, #224*4] /* 3 */ |
120 | smlal r11, r12, r6, r11 | 120 | smlal r8, r9, r7, r6 |
121 | ldr r11, [r1, #256*4] /* 4 */ | 121 | ldmia r2!, { r3-r6 } /* load D[04..07] */ |
122 | smlal r11, r12, r7, r11 | 122 | ldr r7, [r1, #256*4] /* 4 */ |
123 | ldr r11, [r1, #352*4] /* 5 */ | 123 | smlal r8, r9, r7, r3 |
124 | smlal r11, r12, r8, r11 | 124 | ldr r7, [r1, #352*4] /* 5 */ |
125 | ldr r11, [r1, #384*4] /* 6 */ | 125 | smlal r8, r9, r7, r4 |
126 | smlal r11, r12, r9, r11 | 126 | ldr r7, [r1, #384*4] /* 6 */ |
127 | ldr r11, [r1, #480*4] /* 7 */ | 127 | smlal r8, r9, r7, r5 |
128 | smlal r11, r12, r10, r11 | 128 | ldr r7, [r1, #480*4] /* 7 */ |
129 | ldmia r2!, { r3-r10 } /* load last 8 window coefficients */ | 129 | smlal r8, r9, r7, r6 |
130 | ldr r11, [r1, #512*4] /* 8 */ | 130 | ldmia r2!, { r3-r6 } /* load D[08..11] */ |
131 | smlal r11, r12, r3, r11 | 131 | ldr r7, [r1, #512*4] /* 8 */ |
132 | ldr r11, [r1, #608*4] /* 9 */ | 132 | smlal r8, r9, r7, r3 |
133 | smlal r11, r12, r4, r11 | 133 | ldr r7, [r1, #608*4] /* 9 */ |
134 | ldr r11, [r1, #640*4] /* 10 */ | 134 | smlal r8, r9, r7, r4 |
135 | smlal r11, r12, r5, r11 | 135 | ldr r7, [r1, #640*4] /* 10 */ |
136 | ldr r11, [r1, #736*4] /* 11 */ | 136 | smlal r8, r9, r7, r5 |
137 | smlal r11, r12, r6, r11 | 137 | ldr r7, [r1, #736*4] /* 11 */ |
138 | ldr r11, [r1, #768*4] /* 12 */ | 138 | smlal r8, r9, r7, r6 |
139 | smlal r11, r12, r7, r11 | 139 | ldmia r2!, { r3-r6 } /* load D[12..15] */ |
140 | ldr r11, [r1, #864*4] /* 13 */ | 140 | ldr r7, [r1, #768*4] /* 12 */ |
141 | smlal r11, r12, r8, r11 | 141 | smlal r8, r9, r7, r3 |
142 | ldr r11, [r1, #896*4] /* 14 */ | 142 | ldr r7, [r1, #864*4] /* 13 */ |
143 | smlal r11, r12, r9, r11 | 143 | smlal r8, r9, r7, r4 |
144 | ldr r11, [r1, #992*4] /* 15 */ | 144 | ldr r7, [r1, #896*4] /* 14 */ |
145 | smlal r11, r12, r10, r11 | 145 | smlal r8, r9, r7, r5 |
146 | mov r4, r12, lsl #2 /* get result from hi-part, lose 2 bits */ | 146 | ldr r7, [r1, #992*4] /* 15 */ |
147 | str r4, [r0], #4 /* store Data */ | 147 | smlal r8, r9, r7, r6 |
148 | mov r8, r8, lsr #16 | ||
149 | orr r8, r8, r9, lsl #16 /* (lo>>16) | (hi<<16) */
150 | str r8, [r0], #4 /* store Data */ | ||
148 | add r1, r1, #4 /* V++ */ | 151 | add r1, r1, #4 /* V++ */ |
149 | 152 | ||
150 | subs lr, lr, #1 | 153 | subs lr, lr, #1 |
151 | bgt .loop32 | 154 | bgt .loop32 |
152 | 155 | ||
153 | ldmfd sp!, {r4-r12, pc} | 156 | ldmfd sp!, {r4-r9, pc} |
154 | .mpc_dewindowing_end: | 157 | .mpc_dewindowing_end: |
155 | .size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D | 158 | .size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D |
156 | #endif | 159 | #endif |