diff options
-rw-r--r-- | apps/codecs/libwma/wmadec.h | 19 | ||||
-rw-r--r-- | apps/codecs/libwma/wmadeci.c | 217 | ||||
-rw-r--r-- | apps/codecs/libwma/wmafixed.c | 131 | ||||
-rw-r--r-- | apps/codecs/libwma/wmafixed.h | 1 |
4 files changed, 182 insertions, 186 deletions
diff --git a/apps/codecs/libwma/wmadec.h b/apps/codecs/libwma/wmadec.h index aaa06ee2a6..d22e435304 100644 --- a/apps/codecs/libwma/wmadec.h +++ b/apps/codecs/libwma/wmadec.h | |||
@@ -64,6 +64,25 @@ | |||
64 | #endif | 64 | #endif |
65 | #endif | 65 | #endif |
66 | 66 | ||
67 | #define VLCBITS 7 /*7 is the lowest without glitching*/ | ||
68 | #define VLCMAX ((22+VLCBITS-1)/VLCBITS) | ||
69 | |||
70 | #define EXPVLCBITS 7 | ||
71 | #define EXPMAX ((19+EXPVLCBITS-1)/EXPVLCBITS) | ||
72 | |||
73 | #define HGAINVLCBITS 9 | ||
74 | #define HGAINMAX ((13+HGAINVLCBITS-1)/HGAINVLCBITS) | ||
75 | |||
76 | |||
77 | typedef struct CoefVLCTable | ||
78 | { | ||
79 | int n; /* total number of codes */ | ||
80 | const uint32_t *huffcodes; /* VLC bit values */ | ||
81 | const uint8_t *huffbits; /* VLC bit size */ | ||
82 | const uint16_t *levels; /* table to build run/level tables */ | ||
83 | } | ||
84 | CoefVLCTable; | ||
85 | |||
67 | typedef struct WMADecodeContext | 86 | typedef struct WMADecodeContext |
68 | { | 87 | { |
69 | GetBitContext gb; | 88 | GetBitContext gb; |
diff --git a/apps/codecs/libwma/wmadeci.c b/apps/codecs/libwma/wmadeci.c index aa81b5b81f..d1baca4c99 100644 --- a/apps/codecs/libwma/wmadeci.c +++ b/apps/codecs/libwma/wmadeci.c | |||
@@ -28,39 +28,29 @@ | |||
28 | #include "wmadec.h" | 28 | #include "wmadec.h" |
29 | #include "wmafixed.h" | 29 | #include "wmafixed.h" |
30 | #include "bitstream.h" | 30 | #include "bitstream.h" |
31 | #include "wmadata.h" | ||
31 | 32 | ||
32 | 33 | ||
33 | #define VLCBITS 7 /*7 is the lowest without glitching*/ | ||
34 | #define VLCMAX ((22+VLCBITS-1)/VLCBITS) | ||
35 | |||
36 | #define EXPVLCBITS 7 | ||
37 | #define EXPMAX ((19+EXPVLCBITS-1)/EXPVLCBITS) | ||
38 | |||
39 | #define HGAINVLCBITS 9 | ||
40 | #define HGAINMAX ((13+HGAINVLCBITS-1)/HGAINVLCBITS) | ||
41 | |||
42 | |||
43 | typedef struct CoefVLCTable | ||
44 | { | ||
45 | int n; /* total number of codes */ | ||
46 | const uint32_t *huffcodes; /* VLC bit values */ | ||
47 | const uint8_t *huffbits; /* VLC bit size */ | ||
48 | const uint16_t *levels; /* table to build run/level tables */ | ||
49 | } | ||
50 | CoefVLCTable; | ||
51 | |||
52 | static void wma_lsp_to_curve_init(WMADecodeContext *s, int frame_len); | 34 | static void wma_lsp_to_curve_init(WMADecodeContext *s, int frame_len); |
35 | inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *data, | ||
36 | const fixed32 *window, int n); | ||
37 | inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, | ||
38 | const fixed32 *src1, int len); | ||
39 | |||
40 | /*declarations of statically allocated variables used to remove malloc calls*/ | ||
53 | 41 | ||
54 | fixed32 coefsarray[MAX_CHANNELS][BLOCK_MAX_SIZE] IBSS_ATTR; | 42 | fixed32 coefsarray[MAX_CHANNELS][BLOCK_MAX_SIZE] IBSS_ATTR; |
55 | /*decode and window into IRAM on targets with at least 80KB of codec IRAM*/ | 43 | /*decode and window into IRAM on targets with at least 80KB of codec IRAM*/ |
56 | fixed32 frame_out_buf[MAX_CHANNELS][BLOCK_MAX_SIZE * 2] IBSS_ATTR_WMA_LARGE_IRAM; | 44 | fixed32 frame_out_buf[MAX_CHANNELS][BLOCK_MAX_SIZE * 2] IBSS_ATTR_WMA_LARGE_IRAM; |
57 | 45 | ||
58 | //static variables that replace malloced stuff | 46 | /*MDCT reconstruction windows*/ |
59 | fixed32 stat0[2048], stat1[1024], stat2[512], stat3[256], stat4[128]; //these are the MDCT reconstruction windows | 47 | fixed32 stat0[2048], stat1[1024], stat2[512], stat3[256], stat4[128]; |
60 | 48 | ||
61 | uint16_t *runtabarray[2], *levtabarray[2]; //these are VLC lookup tables | 49 | /*VLC lookup tables*/ |
50 | uint16_t *runtabarray[2], *levtabarray[2]; | ||
62 | 51 | ||
63 | uint16_t runtab0[1336], runtab1[1336], levtab0[1336], levtab1[1336]; //these could be made smaller since only one can be 1336 | 52 | /*these could be made smaller since only one can be 1336*/ |
53 | uint16_t runtab0[1336], runtab1[1336], levtab0[1336], levtab1[1336]; | ||
64 | 54 | ||
65 | #define VLCBUF1SIZE 4598 | 55 | #define VLCBUF1SIZE 4598 |
66 | #define VLCBUF2SIZE 3574 | 56 | #define VLCBUF2SIZE 3574 |
@@ -76,141 +66,6 @@ VLC_TYPE vlcbuf4[VLCBUF4SIZE][2]; | |||
76 | 66 | ||
77 | 67 | ||
78 | 68 | ||
79 | #include "wmadata.h" // PJJ | ||
80 | |||
81 | |||
82 | |||
83 | /* | ||
84 | * Helper functions for wma_window. | ||
85 | * | ||
86 | * | ||
87 | */ | ||
88 | |||
89 | #ifdef CPU_ARM | ||
90 | static inline | ||
91 | void vector_fmul_add_add(fixed32 *dst, const fixed32 *data, | ||
92 | const fixed32 *window, int n) | ||
93 | { | ||
94 | /* Block sizes are always power of two */ | ||
95 | asm volatile ( | ||
96 | "0:" | ||
97 | "ldmia %[d]!, {r0, r1};" | ||
98 | "ldmia %[w]!, {r4, r5};" | ||
99 | /* consume the first data and window value so we can use those | ||
100 | * registers again */ | ||
101 | "smull r8, r9, r0, r4;" | ||
102 | "ldmia %[dst], {r0, r4};" | ||
103 | "add r0, r0, r9, lsl #1;" /* *dst=*dst+(r9<<1)*/ | ||
104 | "smull r8, r9, r1, r5;" | ||
105 | "add r1, r4, r9, lsl #1;" | ||
106 | "stmia %[dst]!, {r0, r1};" | ||
107 | "subs %[n], %[n], #2;" | ||
108 | "bne 0b;" | ||
109 | : [d] "+r" (data), [w] "+r" (window), [dst] "+r" (dst), [n] "+r" (n) | ||
110 | : : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc"); | ||
111 | } | ||
112 | |||
113 | static inline | ||
114 | void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, | ||
115 | int len) | ||
116 | { | ||
117 | /* Block sizes are always power of two */ | ||
118 | asm volatile ( | ||
119 | "add %[s1], %[s1], %[n], lsl #2;" | ||
120 | "0:" | ||
121 | "ldmia %[s0]!, {r0, r1};" | ||
122 | "ldmdb %[s1]!, {r4, r5};" | ||
123 | "smull r8, r9, r0, r5;" | ||
124 | "mov r0, r9, lsl #1;" | ||
125 | "smull r8, r9, r1, r4;" | ||
126 | "mov r1, r9, lsl #1;" | ||
127 | "stmia %[dst]!, {r0, r1};" | ||
128 | "subs %[n], %[n], #2;" | ||
129 | "bne 0b;" | ||
130 | : [s0] "+r" (src0), [s1] "+r" (src1), [dst] "+r" (dst), [n] "+r" (len) | ||
131 | : : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc"); | ||
132 | } | ||
133 | |||
134 | #elif defined(CPU_COLDFIRE) | ||
135 | |||
136 | static inline | ||
137 | void vector_fmul_add_add(fixed32 *dst, const fixed32 *data, | ||
138 | const fixed32 *window, int n) | ||
139 | { | ||
140 | /* Block sizes are always power of two. Smallest block is always way bigger | ||
141 | * than four too.*/ | ||
142 | asm volatile ( | ||
143 | "0:" | ||
144 | "movem.l (%[d]), %%d0-%%d3;" | ||
145 | "movem.l (%[w]), %%d4-%%d5/%%a0-%%a1;" | ||
146 | "mac.l %%d0, %%d4, %%acc0;" | ||
147 | "mac.l %%d1, %%d5, %%acc1;" | ||
148 | "mac.l %%d2, %%a0, %%acc2;" | ||
149 | "mac.l %%d3, %%a1, %%acc3;" | ||
150 | "lea.l (16, %[d]), %[d];" | ||
151 | "lea.l (16, %[w]), %[w];" | ||
152 | "movclr.l %%acc0, %%d0;" | ||
153 | "movclr.l %%acc1, %%d1;" | ||
154 | "movclr.l %%acc2, %%d2;" | ||
155 | "movclr.l %%acc3, %%d3;" | ||
156 | "movem.l (%[dst]), %%d4-%%d5/%%a0-%%a1;" | ||
157 | "add.l %%d4, %%d0;" | ||
158 | "add.l %%d5, %%d1;" | ||
159 | "add.l %%a0, %%d2;" | ||
160 | "add.l %%a1, %%d3;" | ||
161 | "movem.l %%d0-%%d3, (%[dst]);" | ||
162 | "lea.l (16, %[dst]), %[dst];" | ||
163 | "subq.l #4, %[n];" | ||
164 | "jne 0b;" | ||
165 | : [d] "+a" (data), [w] "+a" (window), [dst] "+a" (dst), [n] "+d" (n) | ||
166 | : : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc"); | ||
167 | } | ||
168 | |||
169 | static inline | ||
170 | void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, | ||
171 | int len) | ||
172 | { | ||
173 | /* Block sizes are always power of two. Smallest block is always way bigger | ||
174 | * than four too.*/ | ||
175 | asm volatile ( | ||
176 | "lea.l (-16, %[s1], %[n]*4), %[s1];" | ||
177 | "0:" | ||
178 | "movem.l (%[s0]), %%d0-%%d3;" | ||
179 | "movem.l (%[s1]), %%d4-%%d5/%%a0-%%a1;" | ||
180 | "mac.l %%d0, %%a1, %%acc0;" | ||
181 | "mac.l %%d1, %%a0, %%acc1;" | ||
182 | "mac.l %%d2, %%d5, %%acc2;" | ||
183 | "mac.l %%d3, %%d4, %%acc3;" | ||
184 | "lea.l (16, %[s0]), %[s0];" | ||
185 | "lea.l (-16, %[s1]), %[s1];" | ||
186 | "movclr.l %%acc0, %%d0;" | ||
187 | "movclr.l %%acc1, %%d1;" | ||
188 | "movclr.l %%acc2, %%d2;" | ||
189 | "movclr.l %%acc3, %%d3;" | ||
190 | "movem.l %%d0-%%d3, (%[dst]);" | ||
191 | "lea.l (16, %[dst]), %[dst];" | ||
192 | "subq.l #4, %[n];" | ||
193 | "jne 0b;" | ||
194 | : [s0] "+a" (src0), [s1] "+a" (src1), [dst] "+a" (dst), [n] "+d" (len) | ||
195 | : : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc"); | ||
196 | } | ||
197 | |||
198 | #else | ||
199 | |||
200 | static inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, int len){ | ||
201 | int i; | ||
202 | for(i=0; i<len; i++) | ||
203 | dst[i] = fixmul32b(src0[i], src1[i]) + dst[i]; | ||
204 | } | ||
205 | |||
206 | static inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, int len){ | ||
207 | int i; | ||
208 | src1 += len-1; | ||
209 | for(i=0; i<len; i++) | ||
210 | dst[i] = fixmul32b(src0[i], src1[-i]); | ||
211 | } | ||
212 | |||
213 | #endif | ||
214 | 69 | ||
215 | /** | 70 | /** |
216 | * Apply MDCT window and add into output. | 71 | * Apply MDCT window and add into output. |
@@ -227,7 +82,9 @@ static inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const | |||
227 | int block_len, bsize, n; | 82 | int block_len, bsize, n; |
228 | 83 | ||
229 | /* left part */ | 84 | /* left part */ |
230 | /*previous block was larger, so we'll use the size of the current block to set the window size*/ | 85 | |
86 | /* previous block was larger, so we'll use the size of the current | ||
87 | * block to set the window size*/ | ||
231 | if (s->block_len_bits <= s->prev_block_len_bits) { | 88 | if (s->block_len_bits <= s->prev_block_len_bits) { |
232 | block_len = s->block_len; | 89 | block_len = s->block_len; |
233 | bsize = s->frame_len_bits - s->block_len_bits; | 90 | bsize = s->frame_len_bits - s->block_len_bits; |
@@ -314,7 +171,7 @@ static void init_coef_vlc(VLC *vlc, | |||
314 | 171 | ||
315 | int wma_decode_init(WMADecodeContext* s, asf_waveformatex_t *wfx) | 172 | int wma_decode_init(WMADecodeContext* s, asf_waveformatex_t *wfx) |
316 | { | 173 | { |
317 | //WMADecodeContext *s = avctx->priv_data; | 174 | |
318 | int i, flags1, flags2; | 175 | int i, flags1, flags2; |
319 | fixed32 *window; | 176 | fixed32 *window; |
320 | uint8_t *extradata; | 177 | uint8_t *extradata; |
@@ -608,10 +465,11 @@ int wma_decode_init(WMADecodeContext* s, asf_waveformatex_t *wfx) | |||
608 | } | 465 | } |
609 | */ | 466 | */ |
610 | 467 | ||
611 | /*ffmpeg uses malloc to only allocate as many window sizes as needed. However, we're really only interested in the worst case memory usage. | 468 | /* ffmpeg uses malloc to only allocate as many window sizes as needed. |
612 | * In the worst case you can have 5 window sizes, 128 doubling up 2048 | 469 | * However, we're really only interested in the worst case memory usage. |
613 | * Smaller windows are handled differently. | 470 | * In the worst case you can have 5 window sizes, 128 doubling up to 2048 |
614 | * Since we don't have malloc, just statically allocate this | 471 | * Smaller windows are handled differently. |
472 | * Since we don't have malloc, just statically allocate this | ||
615 | */ | 473 | */ |
616 | fixed32 *temp[5]; | 474 | fixed32 *temp[5]; |
617 | temp[0] = stat0; | 475 | temp[0] = stat0; |
@@ -626,19 +484,15 @@ int wma_decode_init(WMADecodeContext* s, asf_waveformatex_t *wfx) | |||
626 | int n, j; | 484 | int n, j; |
627 | fixed32 alpha; | 485 | fixed32 alpha; |
628 | n = 1 << (s->frame_len_bits - i); | 486 | n = 1 << (s->frame_len_bits - i); |
629 | //window = av_malloc(sizeof(fixed32) * n); | ||
630 | window = temp[i]; | 487 | window = temp[i]; |
631 | 488 | ||
632 | //fixed32 n2 = itofix32(n<<1); //2x the window length | 489 | /* this calculates 0.5/(2*n) */ |
633 | //alpha = fixdiv32(M_PI_F, n2); //PI / (2x Window length) == PI<<(s->frame_len_bits - i+1) | 490 | alpha = (1<<15)>>(s->frame_len_bits - i+1); |
634 | |||
635 | //alpha = M_PI_F>>(s->frame_len_bits - i+1); | ||
636 | alpha = (1<<15)>>(s->frame_len_bits - i+1); /* this calculates 0.5/(2*n) */ | ||
637 | for(j=0;j<n;++j) | 491 | for(j=0;j<n;++j) |
638 | { | 492 | { |
639 | fixed32 j2 = itofix32(j) + 0x8000; | 493 | fixed32 j2 = itofix32(j) + 0x8000; |
640 | window[j] = fsincos(fixmul32(j2,alpha)<<16, 0); //alpha between 0 and pi/2 | 494 | /*alpha between 0 and pi/2*/ |
641 | 495 | window[j] = fsincos(fixmul32(j2,alpha)<<16, 0); | |
642 | } | 496 | } |
643 | s->windows[i] = window; | 497 | s->windows[i] = window; |
644 | 498 | ||
@@ -663,6 +517,7 @@ int wma_decode_init(WMADecodeContext* s, asf_waveformatex_t *wfx) | |||
663 | s->noise_table = noisetable_exp; | 517 | s->noise_table = noisetable_exp; |
664 | } | 518 | } |
665 | #if 0 | 519 | #if 0 |
520 | /* We use a lookup table computed in advance, so no need to do this*/ | ||
666 | { | 521 | { |
667 | unsigned int seed; | 522 | unsigned int seed; |
668 | fixed32 norm; | 523 | fixed32 norm; |
@@ -836,7 +691,9 @@ static void wma_lsp_to_curve(WMADecodeContext *s, | |||
836 | *val_max_ptr = val_max; | 691 | *val_max_ptr = val_max; |
837 | } | 692 | } |
838 | 693 | ||
839 | /* decode exponents coded with LSP coefficients (same idea as Vorbis) */ | 694 | /* decode exponents coded with LSP coefficients (same idea as Vorbis) |
695 | * only used for low bitrate (< 16kbps) files | ||
696 | */ | ||
840 | static void decode_exp_lsp(WMADecodeContext *s, int ch) | 697 | static void decode_exp_lsp(WMADecodeContext *s, int ch) |
841 | { | 698 | { |
842 | fixed32 lsp_coefs[NB_LSP_COEFS]; | 699 | fixed32 lsp_coefs[NB_LSP_COEFS]; |
@@ -858,7 +715,7 @@ static void decode_exp_lsp(WMADecodeContext *s, int ch) | |||
858 | lsp_coefs); | 715 | lsp_coefs); |
859 | } | 716 | } |
860 | 717 | ||
861 | /* decode exponents coded with VLC codes */ | 718 | /* decode exponents coded with VLC codes - used for bitrate >= 32kbps*/ |
862 | static int decode_exp_vlc(WMADecodeContext *s, int ch) | 719 | static int decode_exp_vlc(WMADecodeContext *s, int ch) |
863 | { | 720 | { |
864 | int last_exp, n, code; | 721 | int last_exp, n, code; |
@@ -879,7 +736,7 @@ static int decode_exp_vlc(WMADecodeContext *s, int ch) | |||
879 | if (s->version == 1) //wmav1 only | 736 | if (s->version == 1) //wmav1 only |
880 | { | 737 | { |
881 | last_exp = get_bits(&s->gb, 5) + 10; | 738 | last_exp = get_bits(&s->gb, 5) + 10; |
882 | /* XXX: use a table */ | 739 | |
883 | v = pow_10_to_yover16_ptr[last_exp]; | 740 | v = pow_10_to_yover16_ptr[last_exp]; |
884 | max_scale = v; | 741 | max_scale = v; |
885 | n = *ptr++; | 742 | n = *ptr++; |
@@ -901,7 +758,7 @@ static int decode_exp_vlc(WMADecodeContext *s, int ch) | |||
901 | } | 758 | } |
902 | /* NOTE: this offset is the same as MPEG4 AAC ! */ | 759 | /* NOTE: this offset is the same as MPEG4 AAC ! */ |
903 | last_exp += code - 60; | 760 | last_exp += code - 60; |
904 | /* XXX: use a table */ | 761 | |
905 | v = pow_10_to_yover16_ptr[last_exp]; | 762 | v = pow_10_to_yover16_ptr[last_exp]; |
906 | if (v > max_scale) | 763 | if (v > max_scale) |
907 | { | 764 | { |
@@ -1136,7 +993,7 @@ static int wma_decode_block(WMADecodeContext *s, int32_t *scratch_buffer) | |||
1136 | for(;;) | 993 | for(;;) |
1137 | { | 994 | { |
1138 | code = get_vlc2(&s->gb, coef_vlc->table, VLCBITS, VLCMAX); | 995 | code = get_vlc2(&s->gb, coef_vlc->table, VLCBITS, VLCMAX); |
1139 | //code = get_vlc(&s->gb, coef_vlc); | 996 | |
1140 | if (code < 0) | 997 | if (code < 0) |
1141 | { | 998 | { |
1142 | return -8; | 999 | return -8; |
@@ -1228,7 +1085,9 @@ static int wma_decode_block(WMADecodeContext *s, int32_t *scratch_buffer) | |||
1228 | 1085 | ||
1229 | 1086 | ||
1230 | if (s->use_noise_coding) | 1087 | if (s->use_noise_coding) |
1231 | { | 1088 | { |
1089 | /*This case is only used for low bitrates (typically less than 32kbps)*/ | ||
1090 | |||
1232 | /*TODO: mult should be converted to 32 bit to speed up noise coding*/ | 1091 | /*TODO: mult should be converted to 32 bit to speed up noise coding*/ |
1233 | 1092 | ||
1234 | mult = fixdiv64(pow_table[total_gain+20],Fixed32To64(s->max_exponent[ch])); | 1093 | mult = fixdiv64(pow_table[total_gain+20],Fixed32To64(s->max_exponent[ch])); |
diff --git a/apps/codecs/libwma/wmafixed.c b/apps/codecs/libwma/wmafixed.c index 3c96700076..5569309145 100644 --- a/apps/codecs/libwma/wmafixed.c +++ b/apps/codecs/libwma/wmafixed.c | |||
@@ -63,20 +63,139 @@ fixed64 Fixed32To64(fixed32 x) | |||
63 | return (fixed64)x; | 63 | return (fixed64)x; |
64 | } | 64 | } |
65 | 65 | ||
66 | |||
66 | /* | 67 | /* |
67 | Not performance senstitive code here | 68 | * Helper functions for wma_window. |
69 | * | ||
70 | * | ||
71 | */ | ||
68 | 72 | ||
69 | */ | 73 | #ifdef CPU_ARM |
74 | inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *data, | ||
75 | const fixed32 *window, int n) | ||
76 | { | ||
77 | /* Block sizes are always power of two */ | ||
78 | asm volatile ( | ||
79 | "0:" | ||
80 | "ldmia %[d]!, {r0, r1};" | ||
81 | "ldmia %[w]!, {r4, r5};" | ||
82 | /* consume the first data and window value so we can use those | ||
83 | * registers again */ | ||
84 | "smull r8, r9, r0, r4;" | ||
85 | "ldmia %[dst], {r0, r4};" | ||
86 | "add r0, r0, r9, lsl #1;" /* *dst=*dst+(r9<<1)*/ | ||
87 | "smull r8, r9, r1, r5;" | ||
88 | "add r1, r4, r9, lsl #1;" | ||
89 | "stmia %[dst]!, {r0, r1};" | ||
90 | "subs %[n], %[n], #2;" | ||
91 | "bne 0b;" | ||
92 | : [d] "+r" (data), [w] "+r" (window), [dst] "+r" (dst), [n] "+r" (n) | ||
93 | : : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc"); | ||
94 | } | ||
95 | |||
96 | inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, | ||
97 | int len) | ||
98 | { | ||
99 | /* Block sizes are always power of two */ | ||
100 | asm volatile ( | ||
101 | "add %[s1], %[s1], %[n], lsl #2;" | ||
102 | "0:" | ||
103 | "ldmia %[s0]!, {r0, r1};" | ||
104 | "ldmdb %[s1]!, {r4, r5};" | ||
105 | "smull r8, r9, r0, r5;" | ||
106 | "mov r0, r9, lsl #1;" | ||
107 | "smull r8, r9, r1, r4;" | ||
108 | "mov r1, r9, lsl #1;" | ||
109 | "stmia %[dst]!, {r0, r1};" | ||
110 | "subs %[n], %[n], #2;" | ||
111 | "bne 0b;" | ||
112 | : [s0] "+r" (src0), [s1] "+r" (src1), [dst] "+r" (dst), [n] "+r" (len) | ||
113 | : : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc"); | ||
114 | } | ||
70 | 115 | ||
116 | #elif defined(CPU_COLDFIRE) | ||
71 | 117 | ||
72 | fixed64 fixmul64byfixed(fixed64 x, fixed32 y) | 118 | inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *data, |
119 | const fixed32 *window, int n) | ||
73 | { | 120 | { |
121 | /* Block sizes are always power of two. Smallest block is always way bigger | ||
122 | * than four too.*/ | ||
123 | asm volatile ( | ||
124 | "0:" | ||
125 | "movem.l (%[d]), %%d0-%%d3;" | ||
126 | "movem.l (%[w]), %%d4-%%d5/%%a0-%%a1;" | ||
127 | "mac.l %%d0, %%d4, %%acc0;" | ||
128 | "mac.l %%d1, %%d5, %%acc1;" | ||
129 | "mac.l %%d2, %%a0, %%acc2;" | ||
130 | "mac.l %%d3, %%a1, %%acc3;" | ||
131 | "lea.l (16, %[d]), %[d];" | ||
132 | "lea.l (16, %[w]), %[w];" | ||
133 | "movclr.l %%acc0, %%d0;" | ||
134 | "movclr.l %%acc1, %%d1;" | ||
135 | "movclr.l %%acc2, %%d2;" | ||
136 | "movclr.l %%acc3, %%d3;" | ||
137 | "movem.l (%[dst]), %%d4-%%d5/%%a0-%%a1;" | ||
138 | "add.l %%d4, %%d0;" | ||
139 | "add.l %%d5, %%d1;" | ||
140 | "add.l %%a0, %%d2;" | ||
141 | "add.l %%a1, %%d3;" | ||
142 | "movem.l %%d0-%%d3, (%[dst]);" | ||
143 | "lea.l (16, %[dst]), %[dst];" | ||
144 | "subq.l #4, %[n];" | ||
145 | "jne 0b;" | ||
146 | : [d] "+a" (data), [w] "+a" (window), [dst] "+a" (dst), [n] "+d" (n) | ||
147 | : : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc"); | ||
148 | } | ||
74 | 149 | ||
75 | //return x * y; | 150 | inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, |
76 | return (x * y); | 151 | int len) |
77 | // return (fixed64) fixmul32(Fixed32From64(x),y); | 152 | { |
153 | /* Block sizes are always power of two. Smallest block is always way bigger | ||
154 | * than four too.*/ | ||
155 | asm volatile ( | ||
156 | "lea.l (-16, %[s1], %[n]*4), %[s1];" | ||
157 | "0:" | ||
158 | "movem.l (%[s0]), %%d0-%%d3;" | ||
159 | "movem.l (%[s1]), %%d4-%%d5/%%a0-%%a1;" | ||
160 | "mac.l %%d0, %%a1, %%acc0;" | ||
161 | "mac.l %%d1, %%a0, %%acc1;" | ||
162 | "mac.l %%d2, %%d5, %%acc2;" | ||
163 | "mac.l %%d3, %%d4, %%acc3;" | ||
164 | "lea.l (16, %[s0]), %[s0];" | ||
165 | "lea.l (-16, %[s1]), %[s1];" | ||
166 | "movclr.l %%acc0, %%d0;" | ||
167 | "movclr.l %%acc1, %%d1;" | ||
168 | "movclr.l %%acc2, %%d2;" | ||
169 | "movclr.l %%acc3, %%d3;" | ||
170 | "movem.l %%d0-%%d3, (%[dst]);" | ||
171 | "lea.l (16, %[dst]), %[dst];" | ||
172 | "subq.l #4, %[n];" | ||
173 | "jne 0b;" | ||
174 | : [s0] "+a" (src0), [s1] "+a" (src1), [dst] "+a" (dst), [n] "+d" (len) | ||
175 | : : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc"); | ||
78 | } | 176 | } |
79 | 177 | ||
178 | #else | ||
179 | |||
180 | inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, int len){ | ||
181 | int i; | ||
182 | for(i=0; i<len; i++) | ||
183 | dst[i] = fixmul32b(src0[i], src1[i]) + dst[i]; | ||
184 | } | ||
185 | |||
186 | inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, int len){ | ||
187 | int i; | ||
188 | src1 += len-1; | ||
189 | for(i=0; i<len; i++) | ||
190 | dst[i] = fixmul32b(src0[i], src1[-i]); | ||
191 | } | ||
192 | |||
193 | #endif | ||
194 | |||
195 | /* | ||
196 | Not performance sensitive code here | ||
197 | |||
198 | */ | ||
80 | 199 | ||
81 | fixed32 fixdiv32(fixed32 x, fixed32 y) | 200 | fixed32 fixdiv32(fixed32 x, fixed32 y) |
82 | { | 201 | { |
diff --git a/apps/codecs/libwma/wmafixed.h b/apps/codecs/libwma/wmafixed.h index fb31cf8bfa..6b5137e044 100644 --- a/apps/codecs/libwma/wmafixed.h +++ b/apps/codecs/libwma/wmafixed.h | |||
@@ -47,7 +47,6 @@ fixed64 IntTo64(int x); | |||
47 | int IntFrom64(fixed64 x); | 47 | int IntFrom64(fixed64 x); |
48 | fixed32 Fixed32From64(fixed64 x); | 48 | fixed32 Fixed32From64(fixed64 x); |
49 | fixed64 Fixed32To64(fixed32 x); | 49 | fixed64 Fixed32To64(fixed32 x); |
50 | fixed64 fixmul64byfixed(fixed64 x, fixed32 y); | ||
51 | fixed32 fixdiv32(fixed32 x, fixed32 y); | 50 | fixed32 fixdiv32(fixed32 x, fixed32 y); |
52 | fixed64 fixdiv64(fixed64 x, fixed64 y); | 51 | fixed64 fixdiv64(fixed64 x, fixed64 y); |
53 | fixed32 fixsqrt32(fixed32 x); | 52 | fixed32 fixsqrt32(fixed32 x); |