summary refs log tree commit diff
path: root/lib/rbcodec/codecs/libspc/spc_dsp.c
diff options
context:
space:
mode:
Diffstat (limited to 'lib/rbcodec/codecs/libspc/spc_dsp.c')
-rw-r--r-- lib/rbcodec/codecs/libspc/spc_dsp.c 1733
1 files changed, 537 insertions, 1196 deletions
diff --git a/lib/rbcodec/codecs/libspc/spc_dsp.c b/lib/rbcodec/codecs/libspc/spc_dsp.c
index 6350c4c331..c94fbc990e 100644
--- a/lib/rbcodec/codecs/libspc/spc_dsp.c
+++ b/lib/rbcodec/codecs/libspc/spc_dsp.c
@@ -27,15 +27,103 @@
27#include "spc_codec.h" 27#include "spc_codec.h"
28#include "spc_profiler.h" 28#include "spc_profiler.h"
29 29
30#if defined(CPU_COLDFIRE) || defined (CPU_ARM) 30#define CLAMP16( n ) clip_sample_16( n )
31int32_t fir_buf[FIR_BUF_CNT] IBSS_ATTR_SPC 31
32 __attribute__((aligned(FIR_BUF_ALIGN*1))); 32#if defined(CPU_ARM)
33#if ARM_ARCH >= 6
34#include "cpu/spc_dsp_armv6.c"
35#else
36#include "cpu/spc_dsp_armv4.c"
33#endif 37#endif
34#if SPC_BRRCACHE 38#elif defined (CPU_COLDFIRE)
35/* a little extra for samples that go past end */ 39#include "cpu/spc_dsp_coldfire.c"
36int16_t BRRcache [BRR_CACHE_SIZE] CACHEALIGN_ATTR;
37#endif 40#endif
38 41
42/* Above may still use generic implementations. Also defines final
43 function names. */
44#include "spc_dsp_generic.c"
45
46/* each rate divides exactly into 0x7800 without remainder */
47static unsigned short const env_rates [0x20] ICONST_ATTR_SPC =
48{
49 0x0000, 0x000F, 0x0014, 0x0018, 0x001E, 0x0028, 0x0030, 0x003C,
50 0x0050, 0x0060, 0x0078, 0x00A0, 0x00C0, 0x00F0, 0x0140, 0x0180,
51 0x01E0, 0x0280, 0x0300, 0x03C0, 0x0500, 0x0600, 0x0780, 0x0A00,
52 0x0C00, 0x0F00, 0x1400, 0x1800, 0x1E00, 0x2800, 0x3C00, 0x7800
53};
54
55#if !SPC_NOINTERP
56/* Interleaved gauss table (to improve cache coherency). */
57/* gauss [i * 2 + j] = normal_gauss [(1 - j) * 256 + i] */
58static int16_t const gauss_table [512] ICONST_ATTR_SPC MEM_ALIGN_ATTR =
59{
60 370,1305, 366,1305, 362,1304, 358,1304,
61 354,1304, 351,1304, 347,1304, 343,1303,
62 339,1303, 336,1303, 332,1302, 328,1302,
63 325,1301, 321,1300, 318,1300, 314,1299,
64 311,1298, 307,1297, 304,1297, 300,1296,
65 297,1295, 293,1294, 290,1293, 286,1292,
66 283,1291, 280,1290, 276,1288, 273,1287,
67 270,1286, 267,1284, 263,1283, 260,1282,
68 257,1280, 254,1279, 251,1277, 248,1275,
69 245,1274, 242,1272, 239,1270, 236,1269,
70 233,1267, 230,1265, 227,1263, 224,1261,
71 221,1259, 218,1257, 215,1255, 212,1253,
72 210,1251, 207,1248, 204,1246, 201,1244,
73 199,1241, 196,1239, 193,1237, 191,1234,
74 188,1232, 186,1229, 183,1227, 180,1224,
75 178,1221, 175,1219, 173,1216, 171,1213,
76 168,1210, 166,1207, 163,1205, 161,1202,
77 159,1199, 156,1196, 154,1193, 152,1190,
78 150,1186, 147,1183, 145,1180, 143,1177,
79 141,1174, 139,1170, 137,1167, 134,1164,
80 132,1160, 130,1157, 128,1153, 126,1150,
81 124,1146, 122,1143, 120,1139, 118,1136,
82 117,1132, 115,1128, 113,1125, 111,1121,
83 109,1117, 107,1113, 106,1109, 104,1106,
84 102,1102, 100,1098, 99,1094, 97,1090,
85 95,1086, 94,1082, 92,1078, 90,1074,
86 89,1070, 87,1066, 86,1061, 84,1057,
87 83,1053, 81,1049, 80,1045, 78,1040,
88 77,1036, 76,1032, 74,1027, 73,1023,
89 71,1019, 70,1014, 69,1010, 67,1005,
90 66,1001, 65, 997, 64, 992, 62, 988,
91 61, 983, 60, 978, 59, 974, 58, 969,
92 56, 965, 55, 960, 54, 955, 53, 951,
93 52, 946, 51, 941, 50, 937, 49, 932,
94 48, 927, 47, 923, 46, 918, 45, 913,
95 44, 908, 43, 904, 42, 899, 41, 894,
96 40, 889, 39, 884, 38, 880, 37, 875,
97 36, 870, 36, 865, 35, 860, 34, 855,
98 33, 851, 32, 846, 32, 841, 31, 836,
99 30, 831, 29, 826, 29, 821, 28, 816,
100 27, 811, 27, 806, 26, 802, 25, 797,
101 24, 792, 24, 787, 23, 782, 23, 777,
102 22, 772, 21, 767, 21, 762, 20, 757,
103 20, 752, 19, 747, 19, 742, 18, 737,
104 17, 732, 17, 728, 16, 723, 16, 718,
105 15, 713, 15, 708, 15, 703, 14, 698,
106 14, 693, 13, 688, 13, 683, 12, 678,
107 12, 674, 11, 669, 11, 664, 11, 659,
108 10, 654, 10, 649, 10, 644, 9, 640,
109 9, 635, 9, 630, 8, 625, 8, 620,
110 8, 615, 7, 611, 7, 606, 7, 601,
111 6, 596, 6, 592, 6, 587, 6, 582,
112 5, 577, 5, 573, 5, 568, 5, 563,
113 4, 559, 4, 554, 4, 550, 4, 545,
114 4, 540, 3, 536, 3, 531, 3, 527,
115 3, 522, 3, 517, 2, 513, 2, 508,
116 2, 504, 2, 499, 2, 495, 2, 491,
117 2, 486, 1, 482, 1, 477, 1, 473,
118 1, 469, 1, 464, 1, 460, 1, 456,
119 1, 451, 1, 447, 1, 443, 1, 439,
120 0, 434, 0, 430, 0, 426, 0, 422,
121 0, 418, 0, 414, 0, 410, 0, 405,
122 0, 401, 0, 397, 0, 393, 0, 389,
123 0, 385, 0, 381, 0, 378, 0, 374,
124};
125#endif /* !SPC_NOINTERP */
126
39void DSP_write( struct Spc_Dsp* this, int i, int data ) 127void DSP_write( struct Spc_Dsp* this, int i, int data )
40{ 128{
41 assert( (unsigned) i < REGISTER_COUNT ); 129 assert( (unsigned) i < REGISTER_COUNT );
@@ -51,230 +139,395 @@ void DSP_write( struct Spc_Dsp* this, int i, int data )
51 v->volume [0] = left; 139 v->volume [0] = left;
52 v->volume [1] = right; 140 v->volume [1] = right;
53 } 141 }
142 else if ( low < 4 ) /* voice rates */
143 {
144 struct voice_t* v = this->voice_state + high;
145 v->rate = GET_LE16A( this->r.voice[high].rate ) & 0x3fff;
146 }
147#if !SPC_NOECHO
54 else if ( low == 0x0F ) /* fir coefficients */ 148 else if ( low == 0x0F ) /* fir coefficients */
55 { 149 {
56 this->fir_coeff [7 - high] = (int8_t) data; /* sign-extend */ 150 this->fir.coeff [7 - high] = (int8_t) data; /* sign-extend */
57 } 151 }
152#endif /* !SPC_NOECHO */
58} 153}
59 154
60#define CLAMP16( n ) clip_sample_16( n ) 155/* Decode BRR block */
156static inline void
157decode_brr_block( struct voice_t* voice, uint8_t const* addr, int16_t* out )
158{
159 /* header */
160 unsigned block_header = *addr;
161 voice->wave.block_header = block_header;
162
163 /* point to next header */
164 addr += 9;
165 voice->wave.addr = addr;
166
167 /* previous samples */
168 int smp2 = out [0];
169 int smp1 = out [1];
170
171 int offset = -BRR_BLOCK_SIZE * 4;
172
173#if !SPC_BRRCACHE
174 out [-(BRR_BLOCK_SIZE + 1)] = out [-1];
175
176 /* if next block has end flag set,
177 this block ends early (verified) */
178 if ( (block_header & 3) != 3 && (*addr & 3) == 1 )
179 {
180 /* arrange for last 9 samples to be skipped */
181 int const skip = 9;
182 out [skip - (BRR_BLOCK_SIZE + 1)] = out [-1];
183 out += (skip & 1);
184 voice->wave.position += skip * 0x1000;
185 offset = (-BRR_BLOCK_SIZE + (skip & ~1)) * 4;
186 addr -= skip / 2;
187 /* force sample to end on next decode */
188 voice->wave.block_header = 1;
189 }
190#endif /* !SPC_BRRCACHE */
191
192 int const filter = block_header & 0x0c;
193 int const scale = block_header >> 4;
194
195 if ( filter == 0x08 ) /* filter 2 (30-90% of the time) */
196 {
197 /* y[n] = x[n] + 61/32 * y[n-1] - 15/16 * y[n-2] */
198 do /* decode and filter 16 samples */
199 {
200 /* Get nybble, sign-extend, then scale
201 get byte, select which nybble, sign-extend, then shift
202 based on scaling. */
203 int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4;
204 delta = (delta << scale) >> 1;
205
206 if (scale > 0xc)
207 delta = (delta >> 17) << 11;
208
209 out [offset >> 2] = smp2;
210
211 delta -= smp2 >> 1;
212 delta += smp2 >> 5;
213 delta += smp1;
214 delta += (-smp1 - (smp1 >> 1)) >> 5;
215
216 delta = CLAMP16( delta );
217 smp2 = smp1;
218 smp1 = (int16_t) (delta * 2); /* sign-extend */
219 }
220 while ( (offset += 4) != 0 );
221 }
222 else if ( filter == 0x04 ) /* filter 1 */
223 {
224 /* y[n] = x[n] + 15/16 * y[n-1] */
225 do /* decode and filter 16 samples */
226 {
227 /* Get nybble, sign-extend, then scale
228 get byte, select which nybble, sign-extend, then shift
229 based on scaling. */
230 int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4;
231 delta = (delta << scale) >> 1;
232
233 if (scale > 0xc)
234 delta = (delta >> 17) << 11;
235
236 out [offset >> 2] = smp2;
237
238 delta += smp1 >> 1;
239 delta += (-smp1) >> 5;
240
241 delta = CLAMP16( delta );
242 smp2 = smp1;
243 smp1 = (int16_t) (delta * 2); /* sign-extend */
244 }
245 while ( (offset += 4) != 0 );
246 }
247 else if ( filter == 0x0c ) /* filter 3 */
248 {
249 /* y[n] = x[n] + 115/64 * y[n-1] - 13/16 * y[n-2] */
250 do /* decode and filter 16 samples */
251 {
252 /* Get nybble, sign-extend, then scale
253 get byte, select which nybble, sign-extend, then shift
254 based on scaling. */
255 int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4;
256 delta = (delta << scale) >> 1;
257
258 if (scale > 0xc)
259 delta = (delta >> 17) << 11;
260
261 out [offset >> 2] = smp2;
262
263 delta -= smp2 >> 1;
264 delta += (smp2 + (smp2 >> 1)) >> 4;
265 delta += smp1;
266 delta += (-smp1 * 13) >> 7;
267
268 delta = CLAMP16( delta );
269 smp2 = smp1;
270 smp1 = (int16_t) (delta * 2); /* sign-extend */
271 }
272 while ( (offset += 4) != 0 );
273 }
274 else /* filter 0 */
275 {
276 /* y[n] = x[n] */
277 do /* decode and filter 16 samples */
278 {
279 /* Get nybble, sign-extend, then scale
280 get byte, select which nybble, sign-extend, then shift
281 based on scaling. */
282 int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4;
283 delta = (delta << scale) >> 1;
284
285 if (scale > 0xc)
286 delta = (delta >> 17) << 11;
287
288 out [offset >> 2] = smp2;
289
290 smp2 = smp1;
291 smp1 = delta * 2;
292 }
293 while ( (offset += 4) != 0 );
294 }
61 295
62#if SPC_BRRCACHE 296#if SPC_BRRCACHE
63static void decode_brr( struct Spc_Dsp* this, unsigned start_addr, 297 if ( !(block_header & 1) )
64 struct voice_t* voice, 298 {
65 struct raw_voice_t const* const raw_voice ) ICODE_ATTR_SPC; 299 /* save to end of next block (for next call) */
66static void decode_brr( struct Spc_Dsp* this, unsigned start_addr, 300 out [BRR_BLOCK_SIZE ] = smp2;
67 struct voice_t* voice, 301 out [BRR_BLOCK_SIZE + 1] = smp1;
68 struct raw_voice_t const* const raw_voice ) 302 }
303 else
304#endif /* SPC_BRRCACHE */
305 {
306 /* save to end of this block */
307 out [0] = smp2;
308 out [1] = smp1;
309 }
310}
311
312#if SPC_BRRCACHE
313static void NO_INLINE ICODE_ATTR_SPC
314brr_decode_cache( struct Spc_Dsp* this, struct src_dir const* sd,
315 unsigned start_addr, struct voice_t* voice,
316 struct raw_voice_t const* raw_voice )
69{ 317{
70 /* setup same variables as where decode_brr() is called from */ 318 /* a little extra for samples that go past end */
71 #undef RAM 319 static int16_t BRRcache [BRR_CACHE_SIZE] CACHEALIGN_ATTR;
72 #define RAM ram.ram 320
321 DEBUGF( "decode at %08x (wave #%d)\n",
322 start_addr, raw_voice->waveform );
73 323
74 struct src_dir const* const sd =
75 &ram.sd[this->r.g.wave_page * 0x100/sizeof(struct src_dir)];
76 struct cache_entry_t* const wave_entry = 324 struct cache_entry_t* const wave_entry =
77 &this->wave_entry [raw_voice->waveform]; 325 &this->wave_entry [raw_voice->waveform];
78 326
79 /* the following block can be put in place of the call to 327 wave_entry->start_addr = start_addr;
80 decode_brr() below 328
81 */ 329 uint8_t const* const loop_ptr =
330 ram.ram + letoh16( sd [raw_voice->waveform].loop );
331
332 int16_t* loop_start = NULL;
333
334 uint8_t const* addr = ram.ram + start_addr;
335
336 int16_t* out = BRRcache + start_addr * 2;
337 wave_entry->samples = out;
338
339 /* BRR filter uses previous samples */
340 out [BRR_BLOCK_SIZE + 1] = 0;
341 out [BRR_BLOCK_SIZE + 2] = 0;
342 *out++ = 0;
343
344 unsigned block_header;
345
346 do
347 {
348 if ( addr == loop_ptr )
349 {
350 loop_start = out;
351 DEBUGF( "loop at %08lx (wave #%d)\n",
352 (unsigned long)(addr - RAM), raw_voice->waveform );
353 }
354
355 /* output position - preincrement */
356 out += BRR_BLOCK_SIZE;
357
358 decode_brr_block( voice, addr, out );
359
360 block_header = voice->wave.block_header;
361 addr = voice->wave.addr;
362
363 /* if next block has end flag set, this block ends early */
364 /* (verified) */
365 if ( (block_header & 3) != 3 && (*addr & 3) == 1 )
366 {
367 /* skip last 9 samples */
368 DEBUGF( "block early end\n" );
369 out -= 9;
370 break;
371 }
372 }
373 while ( !(block_header & 1) && addr < RAM + 0x10000 );
374
375 wave_entry->end = (out - 1 - wave_entry->samples) << 12;
376 wave_entry->loop = 0;
377
378 if ( (block_header & 2) )
379 {
380 if ( loop_start )
381 {
382 wave_entry->loop = out - loop_start;
383 wave_entry->end += 0x3000;
384
385 out [2] = loop_start [2];
386 out [3] = loop_start [3];
387 out [4] = loop_start [4];
388 }
389 else
390 {
391 DEBUGF( "loop point outside initial wave\n" );
392 }
393 }
394
395 DEBUGF( "end at %08lx (wave #%d)\n",
396 (unsigned long)(addr - RAM), raw_voice->waveform );
397
398 /* add to cache */
399 this->wave_entry_old [this->oldsize++] = *wave_entry;
400}
401
402static inline void
403brr_key_on( struct Spc_Dsp* this, struct src_dir const* sd,
404 struct voice_t* voice, struct raw_voice_t const* raw_voice )
405{
406 unsigned start_addr = letoh16( sd [raw_voice->waveform].start );
407 struct cache_entry_t* const wave_entry =
408 &this->wave_entry [raw_voice->waveform];
409
410 /* predecode BRR if not already */
411 if ( wave_entry->start_addr != start_addr )
82 { 412 {
83 DEBUGF( "decode at %08x (wave #%d)\n",
84 start_addr, raw_voice->waveform );
85
86 /* see if in cache */ 413 /* see if in cache */
87 int i; 414 for ( int i = 0; i < this->oldsize; i++ )
88 for ( i = 0; i < this->oldsize; i++ )
89 { 415 {
90 struct cache_entry_t* e = &this->wave_entry_old [i]; 416 struct cache_entry_t* e = &this->wave_entry_old [i];
417
91 if ( e->start_addr == start_addr ) 418 if ( e->start_addr == start_addr )
92 { 419 {
93 DEBUGF( "found in wave_entry_old (oldsize=%d)\n", 420 DEBUGF( "found in wave_entry_old (oldsize=%d)\n",
94 this->oldsize ); 421 this->oldsize );
95 *wave_entry = *e; 422 *wave_entry = *e;
96 goto wave_in_cache; 423 goto wave_in_cache; /* Wave in cache */
97 } 424 }
98 } 425 }
99 426
100 wave_entry->start_addr = start_addr; 427 /* actually decode it */
101 428 brr_decode_cache( this, sd, start_addr, voice, raw_voice );
102 uint8_t const* const loop_ptr =
103 RAM + letoh16(sd[raw_voice->waveform].loop);
104 short* loop_start = 0;
105
106 short* out = BRRcache + start_addr * 2;
107 wave_entry->samples = out;
108 *out++ = 0;
109 int smp1 = 0;
110 int smp2 = 0;
111
112 uint8_t const* addr = RAM + start_addr;
113 int block_header;
114 do
115 {
116 if ( addr == loop_ptr )
117 {
118 loop_start = out;
119 DEBUGF( "loop at %08lx (wave #%d)\n",
120 (unsigned long)(addr - RAM), raw_voice->waveform );
121 }
122
123 /* header */
124 block_header = *addr;
125 addr += 9;
126 voice->addr = addr;
127 int const filter = (block_header & 0x0C) - 0x08;
128
129 /* scaling
130 (invalid scaling gives -4096 for neg nybble, 0 for pos) */
131 static unsigned char const right_shifts [16] = {
132 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 29, 29, 29,
133 };
134 static unsigned char const left_shifts [16] = {
135 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11
136 };
137 int const scale = block_header >> 4;
138 int const right_shift = right_shifts [scale];
139 int const left_shift = left_shifts [scale];
140
141 /* output position */
142 out += BRR_BLOCK_SIZE;
143 int offset = -BRR_BLOCK_SIZE << 2;
144
145 do /* decode and filter 16 samples */
146 {
147 /* Get nybble, sign-extend, then scale
148 get byte, select which nybble, sign-extend, then shift based
149 on scaling. also handles invalid scaling values. */
150 int delta = (int) (int8_t) (addr [offset >> 3] << (offset & 4))
151 >> right_shift << left_shift;
152
153 out [offset >> 2] = smp2;
154
155 if ( filter == 0 ) /* mode 0x08 (30-90% of the time) */
156 {
157 delta -= smp2 >> 1;
158 delta += smp2 >> 5;
159 smp2 = smp1;
160 delta += smp1;
161 delta += (-smp1 - (smp1 >> 1)) >> 5;
162 }
163 else
164 {
165 if ( filter == -4 ) /* mode 0x04 */
166 {
167 delta += smp1 >> 1;
168 delta += (-smp1) >> 5;
169 }
170 else if ( filter > -4 ) /* mode 0x0C */
171 {
172 delta -= smp2 >> 1;
173 delta += (smp2 + (smp2 >> 1)) >> 4;
174 delta += smp1;
175 delta += (-smp1 * 13) >> 7;
176 }
177 smp2 = smp1;
178 }
179
180 delta = CLAMP16( delta );
181 smp1 = (int16_t) (delta * 2); /* sign-extend */
182 }
183 while ( (offset += 4) != 0 );
184
185 /* if next block has end flag set, this block ends early */
186 /* (verified) */
187 if ( (block_header & 3) != 3 && (*addr & 3) == 1 )
188 {
189 /* skip last 9 samples */
190 out -= 9;
191 goto early_end;
192 }
193 }
194 while ( !(block_header & 1) && addr < RAM + 0x10000 );
195
196 out [0] = smp2;
197 out [1] = smp1;
198
199 early_end:
200 wave_entry->end = (out - 1 - wave_entry->samples) << 12;
201
202 wave_entry->loop = 0;
203 if ( (block_header & 2) )
204 {
205 if ( loop_start )
206 {
207 int loop = out - loop_start;
208 wave_entry->loop = loop;
209 wave_entry->end += 0x3000;
210 out [2] = loop_start [2];
211 out [3] = loop_start [3];
212 out [4] = loop_start [4];
213 }
214 else
215 {
216 DEBUGF( "loop point outside initial wave\n" );
217 }
218 }
219
220 DEBUGF( "end at %08lx (wave #%d)\n",
221 (unsigned long)(addr - RAM), raw_voice->waveform );
222
223 /* add to cache */
224 this->wave_entry_old [this->oldsize++] = *wave_entry;
225wave_in_cache:;
226 } 429 }
430
431wave_in_cache:
432 voice->wave.position = 3 * 0x1000 - 1; /* 0x2fff */
433 voice->wave.samples = wave_entry->samples;
434 voice->wave.end = wave_entry->end;
435 voice->wave.loop = wave_entry->loop;
436}
437
438static inline int brr_decode( struct src_dir const* sd, struct voice_t* voice,
439 struct raw_voice_t const* raw_voice )
440{
441 if ( voice->wave.position < voice->wave.end )
442 return 0;
443
444 long loop_len = voice->wave.loop << 12;
445
446 if ( !loop_len )
447 return 2;
448
449 voice->wave.position -= loop_len;
450 return 1;
451
452 (void)sd; (void)raw_voice;
227} 453}
228#endif
229 454
230static void key_on(struct Spc_Dsp* const this, struct voice_t* const voice, 455#else /* !SPC_BRRCACHE */
231 struct src_dir const* const sd, 456
232 struct raw_voice_t const* const raw_voice, 457static inline void
233 const int key_on_delay, const int vbit) ICODE_ATTR_SPC; 458brr_key_on( struct Spc_Dsp* this, struct src_dir const* sd,
234static void key_on(struct Spc_Dsp* const this, struct voice_t* const voice, 459 struct voice_t* voice, struct raw_voice_t const* raw_voice )
235 struct src_dir const* const sd, 460{
236 struct raw_voice_t const* const raw_voice, 461 voice->wave.addr = ram.ram + letoh16( sd [raw_voice->waveform].start );
237 const int key_on_delay, const int vbit) { 462 /* BRR filter uses previous samples */
463 voice->wave.samples [BRR_BLOCK_SIZE + 1] = 0;
464 voice->wave.samples [BRR_BLOCK_SIZE + 2] = 0;
465 /* force decode on next brr_decode call */
466 voice->wave.position = (BRR_BLOCK_SIZE + 3) * 0x1000 - 1; /* 0x12fff */
467 voice->wave.block_header = 0; /* "previous" BRR header */
468 (void)this;
469}
470
471static inline int brr_decode( struct src_dir const* sd, struct voice_t* voice,
472 struct raw_voice_t const* raw_voice )
473{
238 #undef RAM 474 #undef RAM
475#if defined(CPU_ARM) && !SPC_BRRCACHE
476 uint8_t* const ram_ = ram.ram;
477 #define RAM ram_
478#else
239 #define RAM ram.ram 479 #define RAM ram.ram
240 int const env_rate_init = 0x7800; 480#endif
481
482 if ( voice->wave.position < BRR_BLOCK_SIZE * 0x1000 )
483 return 0;
484
485 voice->wave.position -= BRR_BLOCK_SIZE * 0x1000;
486
487 uint8_t const* addr = voice->wave.addr;
488
489 if ( addr >= RAM + 0x10000 )
490 addr -= 0x10000;
491
492 unsigned block_header = voice->wave.block_header;
493
494 /* action based on previous block's header */
495 int dec = 0;
496
497 if ( block_header & 1 )
498 {
499 addr = RAM + letoh16( sd [raw_voice->waveform].loop );
500 dec = 1;
501
502 if ( !(block_header & 2) ) /* 1% of the time */
503 {
504 /* first block was end block;
505 don't play anything (verified) */
506 return 2;
507 }
508 }
509
510 decode_brr_block( voice, addr, &voice->wave.samples [1 + BRR_BLOCK_SIZE] );
511
512 return dec;
513}
514#endif /* SPC_BRRCACHE */
515
516static void NO_INLINE ICODE_ATTR_SPC
517key_on( struct Spc_Dsp* const this, struct voice_t* const voice,
518 struct src_dir const* const sd,
519 struct raw_voice_t const* const raw_voice,
520 const int key_on_delay, const int vbit )
521{
241 voice->key_on_delay = key_on_delay; 522 voice->key_on_delay = key_on_delay;
523
242 if ( key_on_delay == 0 ) 524 if ( key_on_delay == 0 )
243 { 525 {
244 this->keys_down |= vbit; 526 this->keys_down |= vbit;
245 voice->envx = 0; 527 voice->envx = 0;
246 voice->env_mode = state_attack; 528 voice->env_mode = state_attack;
247 voice->env_timer = env_rate_init; /* TODO: inaccurate? */ 529 voice->env_timer = ENV_RATE_INIT; /* TODO: inaccurate? */
248 unsigned start_addr = letoh16(sd[raw_voice->waveform].start); 530 brr_key_on( this, sd, voice, raw_voice );
249 #if !SPC_BRRCACHE
250 {
251 voice->addr = RAM + start_addr;
252 /* BRR filter uses previous samples */
253 voice->samples [BRR_BLOCK_SIZE + 1] = 0;
254 voice->samples [BRR_BLOCK_SIZE + 2] = 0;
255 /* decode three samples immediately */
256 voice->position = (BRR_BLOCK_SIZE + 3) * 0x1000 - 1;
257 voice->block_header = 0; /* "previous" BRR header */
258 }
259 #else
260 {
261 voice->position = 3 * 0x1000 - 1;
262 struct cache_entry_t* const wave_entry =
263 &this->wave_entry [raw_voice->waveform];
264
265 /* predecode BRR if not already */
266 if ( wave_entry->start_addr != start_addr )
267 {
268 /* the following line can be replaced by the indicated block
269 in decode_brr() */
270 decode_brr( this, start_addr, voice, raw_voice );
271 }
272
273 voice->samples = wave_entry->samples;
274 voice->wave_end = wave_entry->end;
275 voice->wave_loop = wave_entry->loop;
276 }
277 #endif
278 } 531 }
279} 532}
280 533
@@ -287,10 +540,8 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
287#else 540#else
288 #define RAM ram.ram 541 #define RAM ram.ram
289#endif 542#endif
290#if 0
291 EXIT_TIMER(cpu); 543 EXIT_TIMER(cpu);
292 ENTER_TIMER(dsp); 544 ENTER_TIMER(dsp);
293#endif
294 545
295 /* Here we check for keys on/off. Docs say that successive writes 546 /* Here we check for keys on/off. Docs say that successive writes
296 to KON/KOF must be separated by at least 2 Ts periods or risk 547 to KON/KOF must be separated by at least 2 Ts periods or risk
@@ -327,98 +578,60 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
327 } 578 }
328 579
329 struct src_dir const* const sd = 580 struct src_dir const* const sd =
330 &ram.sd[this->r.g.wave_page * 0x100/sizeof(struct src_dir)]; 581 &ram.sd [this->r.g.wave_page * 0x100/sizeof(struct src_dir)];
331 582
332 #ifdef ROCKBOX_BIG_ENDIAN
333 /* Convert endiannesses before entering loops - these
334 get used a lot */
335 const uint32_t rates[VOICE_COUNT] =
336 {
337 GET_LE16A( this->r.voice[0].rate ) & 0x3FFF,
338 GET_LE16A( this->r.voice[1].rate ) & 0x3FFF,
339 GET_LE16A( this->r.voice[2].rate ) & 0x3FFF,
340 GET_LE16A( this->r.voice[3].rate ) & 0x3FFF,
341 GET_LE16A( this->r.voice[4].rate ) & 0x3FFF,
342 GET_LE16A( this->r.voice[5].rate ) & 0x3FFF,
343 GET_LE16A( this->r.voice[6].rate ) & 0x3FFF,
344 GET_LE16A( this->r.voice[7].rate ) & 0x3FFF,
345 };
346 #define VOICE_RATE(x) *(x)
347 #define IF_RBE(...) __VA_ARGS__
348 #ifdef CPU_COLDFIRE
349 /* Initialize mask register with the buffer address mask */
350 asm volatile ("move.l %[m], %%mask" : : [m]"i"(FIR_BUF_MASK));
351 const int echo_wrap = (this->r.g.echo_delay & 15) * 0x800;
352 const int echo_start = this->r.g.echo_page * 0x100;
353 #endif /* CPU_COLDFIRE */
354 #else
355 #define VOICE_RATE(x) (GET_LE16(raw_voice->rate) & 0x3FFF)
356 #define IF_RBE(...)
357 #endif /* ROCKBOX_BIG_ENDIAN */
358
359#if !SPC_NOINTERP 583#if !SPC_NOINTERP
360 int const slow_gaussian = (this->r.g.pitch_mods >> 1) | 584 int const slow_gaussian = (this->r.g.pitch_mods >> 1) |
361 this->r.g.noise_enables; 585 this->r.g.noise_enables;
586#endif
587#if !SPC_NOECHO
588 int const echo_start = this->r.g.echo_page * 0x100;
589 int const echo_delay = (this->r.g.echo_delay & 15) * 0x800;
362#endif 590#endif
363 /* (g.flags & 0x40) ? 30 : 14 */ 591 /* (g.flags & 0x40) ? 30 : 14 */
364 int const global_muting = ((this->r.g.flags & 0x40) >> 2) + 14 - 8; 592 int const global_muting = ((this->r.g.flags & 0x40) >> 2) + 14 - 8;
365 int const global_vol_0 = this->r.g.volume_0; 593 int const global_vol_0 = this->r.g.volume_0;
366 int const global_vol_1 = this->r.g.volume_1; 594 int const global_vol_1 = this->r.g.volume_1;
367 595
368 /* each rate divides exactly into 0x7800 without remainder */
369 int const env_rate_init = 0x7800;
370 static unsigned short const env_rates [0x20] ICONST_ATTR_SPC =
371 {
372 0x0000, 0x000F, 0x0014, 0x0018, 0x001E, 0x0028, 0x0030, 0x003C,
373 0x0050, 0x0060, 0x0078, 0x00A0, 0x00C0, 0x00F0, 0x0140, 0x0180,
374 0x01E0, 0x0280, 0x0300, 0x03C0, 0x0500, 0x0600, 0x0780, 0x0A00,
375 0x0C00, 0x0F00, 0x1400, 0x1800, 0x1E00, 0x2800, 0x3C00, 0x7800
376 };
377
378 do /* one pair of output samples per iteration */ 596 do /* one pair of output samples per iteration */
379 { 597 {
380 /* Noise */ 598 /* Noise */
381 if ( this->r.g.noise_enables ) 599 if ( this->r.g.noise_enables )
382 { 600 {
383 if ( (this->noise_count -= 601 this->noise_count -= env_rates [this->r.g.flags & 0x1F];
384 env_rates [this->r.g.flags & 0x1F]) <= 0 ) 602
603 if ( this->noise_count <= 0 )
385 { 604 {
386 this->noise_count = env_rate_init; 605 this->noise_count = ENV_RATE_INIT;
387 int feedback = (this->noise << 13) ^ (this->noise << 14); 606 int feedback = (this->noise << 13) ^ (this->noise << 14);
388 this->noise = (feedback & 0x8000) ^ (this->noise >> 1 & ~1); 607 this->noise = (feedback & 0x8000) ^ (this->noise >> 1 & ~1);
389 } 608 }
390 } 609 }
391 610
392#if !SPC_NOECHO 611 #if !SPC_NOECHO
393 int echo_0 = 0; 612 int echo_0 = 0, echo_1 = 0;
394 int echo_1 = 0; 613 #endif /* !SPC_NOECHO */
395#endif
396 long prev_outx = 0; /* TODO: correct value for first channel? */ 614 long prev_outx = 0; /* TODO: correct value for first channel? */
397 int chans_0 = 0; 615 int chans_0 = 0, chans_1 = 0;
398 int chans_1 = 0; 616
399 /* TODO: put raw_voice pointer in voice_t? */ 617 /* TODO: put raw_voice pointer in voice_t? */
400 struct raw_voice_t * raw_voice = this->r.voice; 618 struct raw_voice_t * raw_voice = this->r.voice;
401 struct voice_t* voice = this->voice_state; 619 struct voice_t* voice = this->voice_state;
402 int vbit = 1; 620
403 IF_RBE( const uint32_t* vr = rates; ) 621 for (int vbit = 1; vbit < 0x100; vbit <<= 1, ++voice, ++raw_voice )
404 for ( ; vbit < 0x100; vbit <<= 1, ++voice, ++raw_voice IF_RBE( , ++vr ) )
405 { 622 {
406 /* pregen involves checking keyon, etc */ 623 /* pregen involves checking keyon, etc */
407#if 0
408 ENTER_TIMER(dsp_pregen); 624 ENTER_TIMER(dsp_pregen);
409#endif
410 625
411 /* Key on events are delayed */ 626 /* Key on events are delayed */
412 int key_on_delay = voice->key_on_delay; 627 int key_on_delay = voice->key_on_delay;
413 628
414 if ( UNLIKELY ( --key_on_delay >= 0 ) ) /* <1% of the time */ 629 if ( UNLIKELY ( --key_on_delay >= 0 ) ) /* <1% of the time */
415 { 630 key_on( this, voice, sd, raw_voice, key_on_delay, vbit );
416 key_on(this,voice,sd,raw_voice,key_on_delay,vbit);
417 }
418 631
419 if ( !(this->keys_down & vbit) ) /* Silent channel */ 632 if ( !(this->keys_down & vbit) ) /* Silent channel */
420 { 633 {
421 silent_chan: 634 silent_chan:
422 raw_voice->envx = 0; 635 raw_voice->envx = 0;
423 raw_voice->outx = 0; 636 raw_voice->outx = 0;
424 prev_outx = 0; 637 prev_outx = 0;
@@ -461,7 +674,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
461 voice->envx = envx; 674 voice->envx = envx;
462 /* TODO: should this be 8? */ 675 /* TODO: should this be 8? */
463 raw_voice->envx = envx >> 4; 676 raw_voice->envx = envx >> 4;
464 env_timer = env_rate_init; 677 env_timer = ENV_RATE_INIT;
465 } 678 }
466 679
467 int sustain_level = adsr1 >> 5; 680 int sustain_level = adsr1 >> 5;
@@ -561,994 +774,131 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
561 } 774 }
562 } 775 }
563 init_env_timer: 776 init_env_timer:
564 env_timer = env_rate_init; 777 env_timer = ENV_RATE_INIT;
565 write_env_timer: 778 write_env_timer:
566 voice->env_timer = env_timer; 779 voice->env_timer = env_timer;
567 env_end:; 780 env_end:;
568 } 781 }
569#if 0 782
570 EXIT_TIMER(dsp_pregen); 783 EXIT_TIMER(dsp_pregen);
571 784
572 ENTER_TIMER(dsp_gen); 785 ENTER_TIMER(dsp_gen);
573#endif
574 #if !SPC_BRRCACHE
575 /* Decode BRR block */
576 if ( voice->position >= BRR_BLOCK_SIZE * 0x1000 )
577 {
578 voice->position -= BRR_BLOCK_SIZE * 0x1000;
579
580 uint8_t const* addr = voice->addr;
581 if ( addr >= RAM + 0x10000 )
582 addr -= 0x10000;
583
584 /* action based on previous block's header */
585 if ( voice->block_header & 1 )
586 {
587 addr = RAM + letoh16(sd[raw_voice->waveform].loop);
588 this->r.g.wave_ended |= vbit;
589 if ( !(voice->block_header & 2) ) /* 1% of the time */
590 {
591 /* first block was end block;
592 don't play anything (verified) */
593 /* bit was set, so this clears it */
594 this->keys_down ^= vbit;
595
596 /* since voice->envx is 0,
597 samples and position don't matter */
598 raw_voice->envx = 0;
599 voice->envx = 0;
600 goto skip_decode;
601 }
602 }
603
604 /* header */
605 int const block_header = *addr;
606 addr += 9;
607 voice->addr = addr;
608 voice->block_header = block_header;
609
610 /* previous samples */
611 int smp2 = voice->samples [BRR_BLOCK_SIZE + 1];
612 int smp1 = voice->samples [BRR_BLOCK_SIZE + 2];
613 voice->samples [0] = voice->samples [BRR_BLOCK_SIZE];
614
615 /* output position */
616 short* out = voice->samples + (1 + BRR_BLOCK_SIZE);
617 int offset = -BRR_BLOCK_SIZE << 2;
618
619 /* if next block has end flag set,
620 this block ends early (verified) */
621 if ( (block_header & 3) != 3 && (*addr & 3) == 1 )
622 {
623 /* arrange for last 9 samples to be skipped */
624 int const skip = 9;
625 out += (skip & 1);
626 voice->samples [skip] = voice->samples [BRR_BLOCK_SIZE];
627 voice->position += skip * 0x1000;
628 offset = (-BRR_BLOCK_SIZE + (skip & ~1)) << 2;
629 addr -= skip / 2;
630 /* force sample to end on next decode */
631 voice->block_header = 1;
632 }
633
634 int const filter = block_header & 0x0c;
635 int const scale = block_header >> 4;
636
637 if ( filter == 0x08 ) /* filter 2 (30-90% of the time) */
638 {
639 /* y[n] = x[n] + 61/32 * y[n-1] - 15/16 * y[n-2] */
640 do /* decode and filter 16 samples */
641 {
642 /* Get nybble, sign-extend, then scale
643 get byte, select which nybble, sign-extend, then shift
644 based on scaling. */
645 int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4;
646 delta = (delta << scale) >> 1;
647 786
648 if (scale > 0xc) 787 switch ( brr_decode( sd, voice, raw_voice ) )
649 delta = (delta >> 17) << 11; 788 {
650 789 case 2:
651 out [offset >> 2] = smp2; 790 /* bit was set, so this clears it */
652 791 this->keys_down ^= vbit;
653 delta -= smp2 >> 1;
654 delta += smp2 >> 5;
655 delta += smp1;
656 delta += (-smp1 - (smp1 >> 1)) >> 5;
657
658 delta = CLAMP16( delta );
659 smp2 = smp1;
660 smp1 = (int16_t) (delta * 2); /* sign-extend */
661 }
662 while ( (offset += 4) != 0 );
663 }
664 else if ( filter == 0x04 ) /* filter 1 */
665 {
666 /* y[n] = x[n] + 15/16 * y[n-1] */
667 do /* decode and filter 16 samples */
668 {
669 /* Get nybble, sign-extend, then scale
670 get byte, select which nybble, sign-extend, then shift
671 based on scaling. */
672 int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4;
673 delta = (delta << scale) >> 1;
674
675 if (scale > 0xc)
676 delta = (delta >> 17) << 11;
677
678 out [offset >> 2] = smp2;
679
680 delta += smp1 >> 1;
681 delta += (-smp1) >> 5;
682
683 delta = CLAMP16( delta );
684 smp2 = smp1;
685 smp1 = (int16_t) (delta * 2); /* sign-extend */
686 }
687 while ( (offset += 4) != 0 );
688 }
689 else if ( filter == 0x0c ) /* filter 3 */
690 {
691 /* y[n] = x[n] + 115/64 * y[n-1] - 13/16 * y[n-2] */
692 do /* decode and filter 16 samples */
693 {
694 /* Get nybble, sign-extend, then scale
695 get byte, select which nybble, sign-extend, then shift
696 based on scaling. */
697 int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4;
698 delta = (delta << scale) >> 1;
699
700 if (scale > 0xc)
701 delta = (delta >> 17) << 11;
702
703 out [offset >> 2] = smp2;
704
705 delta -= smp2 >> 1;
706 delta += (smp2 + (smp2 >> 1)) >> 4;
707 delta += smp1;
708 delta += (-smp1 * 13) >> 7;
709
710 delta = CLAMP16( delta );
711 smp2 = smp1;
712 smp1 = (int16_t) (delta * 2); /* sign-extend */
713 }
714 while ( (offset += 4) != 0 );
715 }
716 else /* filter 0 */
717 {
718 /* y[n] = x[n] */
719 do /* decode and filter 16 samples */
720 {
721 /* Get nybble, sign-extend, then scale
722 get byte, select which nybble, sign-extend, then shift
723 based on scaling. */
724 int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4;
725 delta = (delta << scale) >> 1;
726
727 if (scale > 0xc)
728 delta = (delta >> 17) << 11;
729
730 out [offset >> 2] = smp2;
731
732 smp2 = smp1;
733 smp1 = delta * 2;
734 }
735 while ( (offset += 4) != 0 );
736 }
737 792
738 out [0] = smp2; 793 /* since voice->envx is 0,
739 out [1] = smp1; 794 samples and position don't matter */
740 795 raw_voice->envx = 0;
741 skip_decode:; 796 voice->envx = 0;
797 case 1:
798 this->r.g.wave_ended |= vbit;
742 } 799 }
743 #endif /* !SPC_BRRCACHE */ 800
744 /* Get rate (with possible modulation) */ 801 /* Get rate (with possible modulation) */
745 int rate = VOICE_RATE(vr); 802 int rate = voice->rate;
746 if ( this->r.g.pitch_mods & vbit ) 803 if ( this->r.g.pitch_mods & vbit )
747 rate = (rate * (prev_outx + 32768)) >> 15; 804 rate = (rate * (prev_outx + 32768)) >> 15;
748 805
806 uint32_t position = voice->wave.position;
807 voice->wave.position += rate;
808
809 int output;
810 int amp_0, amp_1;
811
749 #if !SPC_NOINTERP 812 #if !SPC_NOINTERP
750 /* Interleved gauss table (to improve cache coherency). */
751 /* gauss [i * 2 + j] = normal_gauss [(1 - j) * 256 + i] */
752 static short const gauss [512] ICONST_ATTR_SPC MEM_ALIGN_ATTR =
753 {
754370,1305, 366,1305, 362,1304, 358,1304, 354,1304, 351,1304, 347,1304, 343,1303,
755339,1303, 336,1303, 332,1302, 328,1302, 325,1301, 321,1300, 318,1300, 314,1299,
756311,1298, 307,1297, 304,1297, 300,1296, 297,1295, 293,1294, 290,1293, 286,1292,
757283,1291, 280,1290, 276,1288, 273,1287, 270,1286, 267,1284, 263,1283, 260,1282,
758257,1280, 254,1279, 251,1277, 248,1275, 245,1274, 242,1272, 239,1270, 236,1269,
759233,1267, 230,1265, 227,1263, 224,1261, 221,1259, 218,1257, 215,1255, 212,1253,
760210,1251, 207,1248, 204,1246, 201,1244, 199,1241, 196,1239, 193,1237, 191,1234,
761188,1232, 186,1229, 183,1227, 180,1224, 178,1221, 175,1219, 173,1216, 171,1213,
762168,1210, 166,1207, 163,1205, 161,1202, 159,1199, 156,1196, 154,1193, 152,1190,
763150,1186, 147,1183, 145,1180, 143,1177, 141,1174, 139,1170, 137,1167, 134,1164,
764132,1160, 130,1157, 128,1153, 126,1150, 124,1146, 122,1143, 120,1139, 118,1136,
765117,1132, 115,1128, 113,1125, 111,1121, 109,1117, 107,1113, 106,1109, 104,1106,
766102,1102, 100,1098, 99,1094, 97,1090, 95,1086, 94,1082, 92,1078, 90,1074,
767 89,1070, 87,1066, 86,1061, 84,1057, 83,1053, 81,1049, 80,1045, 78,1040,
768 77,1036, 76,1032, 74,1027, 73,1023, 71,1019, 70,1014, 69,1010, 67,1005,
769 66,1001, 65, 997, 64, 992, 62, 988, 61, 983, 60, 978, 59, 974, 58, 969,
770 56, 965, 55, 960, 54, 955, 53, 951, 52, 946, 51, 941, 50, 937, 49, 932,
771 48, 927, 47, 923, 46, 918, 45, 913, 44, 908, 43, 904, 42, 899, 41, 894,
772 40, 889, 39, 884, 38, 880, 37, 875, 36, 870, 36, 865, 35, 860, 34, 855,
773 33, 851, 32, 846, 32, 841, 31, 836, 30, 831, 29, 826, 29, 821, 28, 816,
774 27, 811, 27, 806, 26, 802, 25, 797, 24, 792, 24, 787, 23, 782, 23, 777,
775 22, 772, 21, 767, 21, 762, 20, 757, 20, 752, 19, 747, 19, 742, 18, 737,
776 17, 732, 17, 728, 16, 723, 16, 718, 15, 713, 15, 708, 15, 703, 14, 698,
777 14, 693, 13, 688, 13, 683, 12, 678, 12, 674, 11, 669, 11, 664, 11, 659,
778 10, 654, 10, 649, 10, 644, 9, 640, 9, 635, 9, 630, 8, 625, 8, 620,
779 8, 615, 7, 611, 7, 606, 7, 601, 6, 596, 6, 592, 6, 587, 6, 582,
780 5, 577, 5, 573, 5, 568, 5, 563, 4, 559, 4, 554, 4, 550, 4, 545,
781 4, 540, 3, 536, 3, 531, 3, 527, 3, 522, 3, 517, 2, 513, 2, 508,
782 2, 504, 2, 499, 2, 495, 2, 491, 2, 486, 1, 482, 1, 477, 1, 473,
783 1, 469, 1, 464, 1, 460, 1, 456, 1, 451, 1, 447, 1, 443, 1, 439,
784 0, 434, 0, 430, 0, 426, 0, 422, 0, 418, 0, 414, 0, 410, 0, 405,
785 0, 401, 0, 397, 0, 393, 0, 389, 0, 385, 0, 381, 0, 378, 0, 374,
786 };
787 /* Gaussian interpolation using most recent 4 samples */ 813 /* Gaussian interpolation using most recent 4 samples */
788 long position = voice->position; 814
789 voice->position += rate;
790 short const* interp = voice->samples + (position >> 12);
791 int offset = position >> 4 & 0xFF;
792
793 /* Only left half of gaussian kernel is in table, so we must mirror 815 /* Only left half of gaussian kernel is in table, so we must mirror
794 for right half */ 816 for right half */
795 short const* fwd = gauss + offset * 2; 817 int offset = ( position >> 4 ) & 0xFF;
796 short const* rev = gauss + 510 - offset * 2; 818 int16_t const* fwd = gauss_table + offset * 2;
819 int16_t const* rev = gauss_table + 510 - offset * 2;
797 820
798 /* Use faster gaussian interpolation when exact result isn't needed 821 /* Use faster gaussian interpolation when exact result isn't needed
799 by pitch modulator of next channel */ 822 by pitch modulator of next channel */
800 int amp_0, amp_1; /* Also serve as temps _0, and _1 */
801 if ( LIKELY ( !(slow_gaussian & vbit) ) ) /* 99% of the time */ 823 if ( LIKELY ( !(slow_gaussian & vbit) ) ) /* 99% of the time */
802 { 824 {
803 /* Main optimization is lack of clamping. Not a problem since 825 /* Main optimization is lack of clamping. Not a problem since
804 output never goes more than +/- 16 outside 16-bit range and 826 output never goes more than +/- 16 outside 16-bit range and
805 things are clamped later anyway. Other optimization is to 827 things are clamped later anyway. Other optimization is to
806 preserve fractional accuracy, eliminating several masks. */ 828 preserve fractional accuracy, eliminating several masks. */
807 #if defined (CPU_ARM) 829 output = gaussian_fast_interp( voice->wave.samples, position,
808 int output; 830 fwd, rev );
809 int _2, _3; /* All-purpose temps */ 831 output = gaussian_fast_amp( voice, output, &amp_0, &amp_1 );
810 /* Multiple ASM blocks keep regs free and reduce result
811 * latency issues. */
812 #if ARM_ARCH >= 6
813 /* Interpolate */
814 asm volatile (
815 "ldr %[_0], [%[interp]] \r\n" /* _0=i0i1 */
816 "ldr %[_2], [%[fwd]] \r\n" /* _2=f0f1 */
817 "ldr %[_1], [%[interp], #4] \r\n" /* _1=i2i3 */
818 "ldr %[_3], [%[rev]] \r\n" /* _3=r0r1 */
819 "smuad %[out], %[_0], %[_2] \r\n" /* out=f0*i0 + f1*i1 */
820 "smladx %[out], %[_1], %[_3], %[out] \r\n" /* out+=r1*i2 + r0*i3 */
821 : [out]"=r"(output),
822 [_0]"=&r"(amp_0), [_1]"=&r"(amp_1),
823 [_2]"=&r"(_2), [_3]"=r"(_3)
824 : [fwd]"r"(fwd), [rev]"r"(rev),
825 [interp]"r"(interp));
826 /* Apply voice envelope */
827 asm volatile (
828 "mov %[_2], %[out], asr #(11-5) \r\n" /* To do >> 16 later */
829 "mul %[out], %[_2], %[envx] \r\n" /* and avoid exp. shift */
830 : [out]"+r"(output), [_2]"=&r"(_2)
831 : [envx]"r"((int)voice->envx));
832 /* Apply left and right volume */
833 asm volatile (
834 "smulwb %[amp_0], %[out], %[vvol_0] \r\n" /* (32x16->48)[47:16]->[31:0] */
835 "smulwb %[amp_1], %[out], %[vvol_1] \r\n"
836 : [out]"+r"(output),
837 [amp_0]"=&r"(amp_0), [amp_1]"=r"(amp_1)
838 : [vvol_0]"r"(voice->volume[0]),
839 [vvol_1]"r"(voice->volume[1]));
840
841 raw_voice->outx = output >> (8+5); /* 'output' still 5 bits too big */
842 #else /* ARM_ARCH < 6 */
843 /* Perform gaussian interpolation on four samples */
844 asm volatile (
845 "ldrsh %[_0], [%[interp]] \r\n"
846 "ldrsh %[_2], [%[fwd]] \r\n"
847 "ldrsh %[_1], [%[interp], #2] \r\n"
848 "ldrsh %[_3], [%[fwd], #2] \r\n"
849 "mul %[out], %[_0], %[_2] \r\n" /* out= fwd[0]*interp[0] */
850 "ldrsh %[_0], [%[interp], #4] \r\n"
851 "ldrsh %[_2], [%[rev], #2] \r\n"
852 "mla %[out], %[_1], %[_3], %[out] \r\n" /* out+=fwd[1]*interp[1] */
853 "ldrsh %[_1], [%[interp], #6] \r\n"
854 "ldrsh %[_3], [%[rev]] \r\n"
855 "mla %[out], %[_0], %[_2], %[out] \r\n" /* out+=rev[1]*interp[2] */
856 "mla %[out], %[_1], %[_3], %[out] \r\n" /* out+=rev[0]*interp[3] */
857 : [out]"=&r"(output),
858 [_0]"=&r"(amp_0), [_1]"=&r"(amp_1),
859 [_2]"=&r"(_2), [_3]"=&r"(_3)
860 : [fwd]"r"(fwd), [rev]"r"(rev),
861 [interp]"r"(interp));
862 /* Apply voice envelope */
863 asm volatile (
864 "mov %[_2], %[out], asr #11 \r\n"
865 "mul %[out], %[_2], %[envx] \r\n"
866 : [out]"+r"(output), [_2]"=&r"(_2)
867 : [envx]"r"((int)voice->envx));
868 /* Reduce and apply left and right volume */
869 asm volatile (
870 "mov %[out], %[out], asr #11 \r\n"
871 "mul %[amp_0], %[out], %[vvol_0] \r\n"
872 "mul %[amp_1], %[out], %[vvol_1] \r\n"
873 : [out]"+r"(output),
874 [amp_0]"=&r"(amp_0), [amp_1]"=r"(amp_1)
875 : [vvol_0]"r"((int)voice->volume[0]),
876 [vvol_1]"r"((int)voice->volume[1]));
877
878 raw_voice->outx = output >> 8;
879 #endif /* ARM_ARCH */
880 #else /* Unoptimized CPU */
881 int output = (((fwd [0] * interp [0] +
882 fwd [1] * interp [1] +
883 rev [1] * interp [2] +
884 rev [0] * interp [3] ) >> 11) * voice->envx) >> 11;
885
886 /* duplicated here to give compiler more to run in parallel */
887 amp_0 = voice->volume [0] * output;
888 amp_1 = voice->volume [1] * output;
889
890 raw_voice->outx = output >> 8;
891 #endif /* CPU_* */
892 } 832 }
893 else /* slow gaussian */ 833 else /* slow gaussian */
834 #endif /* !SPC_NOINTERP (else two-point linear interpolation) */
894 { 835 {
895 #if defined(CPU_ARM) 836 output = *(int16_t *)&this->noise;
896 #if ARM_ARCH >= 6
897 int output = *(int16_t*) &this->noise;
898
899 if ( !(this->r.g.noise_enables & vbit) )
900 {
901 /* Interpolate */
902 int _2, _3;
903 asm volatile (
904 /* NOTE: often-unaligned accesses */
905 "ldr %[_0], [%[interp]] \r\n" /* _0=i0i1 */
906 "ldr %[_2], [%[fwd]] \r\n" /* _2=f0f1 */
907 "ldr %[_1], [%[interp], #4] \r\n" /* _1=i2i3 */
908 "ldr %[_3], [%[rev]] \r\n" /* _3=f2f3 */
909 "smulbb %[out], %[_0], %[_2] \r\n" /* out=f0*i0 */
910 "smultt %[_0], %[_0], %[_2] \r\n" /* _0=f1*i1 */
911 "smulbt %[_2], %[_1], %[_3] \r\n" /* _2=r1*i2 */
912 "smultb %[_3], %[_1], %[_3] \r\n" /* _3=r0*i3 */
913 : [out]"=r"(output),
914 [_0]"=&r"(amp_0), [_1]"=&r"(amp_1),
915 [_2]"=&r"(_2), [_3]"=r"(_3)
916 : [fwd]"r"(fwd), [rev]"r"(rev),
917 [interp]"r"(interp));
918 asm volatile (
919 "mov %[out], %[out], asr#12 \r\n"
920 "add %[_0], %[out], %[_0], asr #12 \r\n"
921 "add %[_2], %[_0], %[_2], asr #12 \r\n"
922 "pkhbt %[_0], %[_2], %[_3], asl #4 \r\n" /* _3[31:16], _2[15:0] */
923 "sadd16 %[_0], %[_0], %[_0] \r\n" /* _3[31:16]*2, _2[15:0]*2 */
924 "qsubaddx %[out], %[_0], %[_0] \r\n" /* out[15:0]=
925 * sat16(_3[31:16]+_2[15:0]) */
926 : [out]"+r"(output),
927 [_0]"+r"(amp_0), [_2]"+r"(_2), [_3]"+r"(_3));
928 }
929 /* Apply voice envelope */
930 asm volatile (
931 "smulbb %[out], %[out], %[envx] \r\n"
932 : [out]"+r"(output)
933 : [envx]"r"(voice->envx));
934 /* Reduce and apply left and right volume */
935 asm volatile (
936 "mov %[out], %[out], asr #11 \r\n"
937 "bic %[out], %[out], #0x1 \r\n"
938 "mul %[amp_0], %[out], %[vvol_0] \r\n"
939 "mul %[amp_1], %[out], %[vvol_1] \r\n"
940 : [out]"+r"(output),
941 [amp_0]"=&r"(amp_0), [amp_1]"=r"(amp_1)
942 : [vvol_0]"r"((int)voice->volume[0]),
943 [vvol_1]"r"((int)voice->volume[1]));
944
945 prev_outx = output;
946 raw_voice->outx = output >> 8;
947 #else /* ARM_ARCH < 6 */
948 int output = *(int16_t*) &this->noise;
949
950 if ( !(this->r.g.noise_enables & vbit) )
951 {
952 /* Interpolate */
953 int _2, _3;
954 asm volatile (
955 "ldrsh %[_0], [%[interp]] \r\n"
956 "ldrsh %[_2], [%[fwd]] \r\n"
957 "ldrsh %[_1], [%[interp], #2] \r\n"
958 "ldrsh %[_3], [%[fwd], #2] \r\n"
959 "mul %[out], %[_2], %[_0] \r\n" /* fwd[0]*interp[0] */
960 "ldrsh %[_2], [%[rev], #2] \r\n"
961 "mul %[_0], %[_3], %[_1] \r\n" /* fwd[1]*interp[1] */
962 "ldrsh %[_1], [%[interp], #4] \r\n"
963 "mov %[out], %[out], asr #12 \r\n"
964 "ldrsh %[_3], [%[rev]] \r\n"
965 "mul %[_2], %[_1], %[_2] \r\n" /* rev[1]*interp[2] */
966 "ldrsh %[_1], [%[interp], #6] \r\n"
967 "add %[_0], %[out], %[_0], asr #12 \r\n"
968 "mul %[_3], %[_1], %[_3] \r\n" /* rev[0]*interp[3] */
969 "add %[_2], %[_0], %[_2], asr #12 \r\n"
970 "mov %[_2], %[_2], lsl #17 \r\n"
971 "mov %[_3], %[_3], asr #12 \r\n"
972 "mov %[_3], %[_3], asl #1 \r\n"
973 "add %[out], %[_3], %[_2], asr #16 \r\n"
974 : [out]"=&r"(output),
975 [_0]"=&r"(amp_0), [_1]"=&r"(amp_1),
976 [_2]"=&r"(_2), [_3]"=&r"(_3)
977 : [fwd]"r"(fwd), [rev]"r"(rev),
978 [interp]"r"(interp));
979
980 output = CLAMP16(output);
981 }
982 /* Apply voice envelope */
983 asm volatile (
984 "mul %[_0], %[out], %[envx] \r\n"
985 : [_0]"=r"(amp_0)
986 : [out]"r"(output), [envx]"r"((int)voice->envx));
987 /* Reduce and apply left and right volume */
988 asm volatile (
989 "mov %[out], %[amp_0], asr #11 \r\n" /* amp_0 = _0 */
990 "bic %[out], %[out], #0x1 \r\n"
991 "mul %[amp_0], %[out], %[vvol_0] \r\n"
992 "mul %[amp_1], %[out], %[vvol_1] \r\n"
993 : [out]"+r"(output),
994 [amp_0]"+r"(amp_0), [amp_1]"=r"(amp_1)
995 : [vvol_0]"r"((int)voice->volume[0]),
996 [vvol_1]"r"((int)voice->volume[1]));
997
998 prev_outx = output;
999 raw_voice->outx = output >> 8;
1000 #endif /* ARM_ARCH >= 6 */
1001 #else /* Unoptimized CPU */
1002 int output = *(int16_t*) &this->noise;
1003 837
1004 if ( !(this->r.g.noise_enables & vbit) ) 838 if ( !(this->r.g.noise_enables & vbit) )
1005 { 839 output = interp( voice->wave.samples, position, fwd, rev );
1006 output = (fwd [0] * interp [0]) & ~0xFFF;
1007 output = (output + fwd [1] * interp [1]) & ~0xFFF;
1008 output = (output + rev [1] * interp [2]) >> 12;
1009 output = (int16_t) (output * 2);
1010 output += ((rev [0] * interp [3]) >> 12) * 2;
1011 output = CLAMP16( output );
1012 }
1013 output = (output * voice->envx) >> 11 & ~1;
1014
1015 /* duplicated here to give compiler more to run in parallel */
1016 amp_0 = voice->volume [0] * output;
1017 amp_1 = voice->volume [1] * output;
1018
1019 prev_outx = output;
1020 raw_voice->outx = output >> 8;
1021 #endif /* CPU_* */
1022 }
1023 #else /* SPCNOINTERP */
1024 /* two-point linear interpolation */
1025 #ifdef CPU_COLDFIRE
1026 int amp_0 = (int16_t)this->noise;
1027 int amp_1;
1028
1029 if ( (this->r.g.noise_enables & vbit) == 0 )
1030 {
1031 uint32_t f = voice->position;
1032 int32_t y0;
1033
1034 /**
1035 * Formula (fastest found so far of MANY):
1036 * output = y0 + f*y1 - f*y0
1037 */
1038 asm volatile (
1039 /* separate fractional and whole parts */
1040 "move.l %[f], %[y1] \r\n"
1041 "and.l #0xfff, %[f] \r\n"
1042 "lsr.l %[sh], %[y1] \r\n"
1043 /* load samples y0 (upper) & y1 (lower) */
1044 "move.l 2(%[s], %[y1].l*2), %[y1] \r\n"
1045 /* %acc0 = f*y1 */
1046 "mac.w %[f]l, %[y1]l, %%acc0 \r\n"
1047 /* %acc0 -= f*y0 */
1048 "msac.w %[f]l, %[y1]u, %%acc0 \r\n"
1049 /* separate out y0 and sign extend */
1050 "swap %[y1] \r\n"
1051 "movea.w %[y1], %[y0] \r\n"
1052 /* fetch result, scale down and add y0 */
1053 "movclr.l %%acc0, %[y1] \r\n"
1054 /* output = y0 + (result >> 12) */
1055 "asr.l %[sh], %[y1] \r\n"
1056 "add.l %[y0], %[y1] \r\n"
1057 : [f]"+d"(f), [y0]"=&a"(y0), [y1]"=&d"(amp_0)
1058 : [s]"a"(voice->samples), [sh]"d"(12));
1059 }
1060 840
1061 /* apply voice envelope to output */ 841 /* Apply envelope and volume */
1062 asm volatile ( 842 output = apply_amp( voice, output, &amp_0, &amp_1 );
1063 "mac.w %[out]l, %[envx]l, %%acc0 \r\n"
1064 :
1065 : [out]"r"(amp_0), [envx]"r"(voice->envx));
1066
1067 /* advance voice position */
1068 voice->position += rate;
1069
1070 /* fetch output, scale and apply left and right
1071 voice volume */
1072 asm volatile (
1073 "movclr.l %%acc0, %[out] \r\n"
1074 "asr.l %[sh], %[out] \r\n"
1075 "mac.l %[vvol_0], %[out], %%acc0 \r\n"
1076 "mac.l %[vvol_1], %[out], %%acc1 \r\n"
1077 : [out]"=&d"(amp_0)
1078 : [vvol_0]"r"((int)voice->volume[0]),
1079 [vvol_1]"r"((int)voice->volume[1]),
1080 [sh]"d"(11));
1081
1082 /* save this output into previous, scale and save in
1083 output register */
1084 prev_outx = amp_0;
1085 raw_voice->outx = amp_0 >> 8;
1086
1087 /* fetch final voice output */
1088 asm volatile (
1089 "movclr.l %%acc0, %[amp_0] \r\n"
1090 "movclr.l %%acc1, %[amp_1] \r\n"
1091 : [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1));
1092 #elif defined (CPU_ARM)
1093 int amp_0, amp_1;
1094
1095 if ( (this->r.g.noise_enables & vbit) != 0 )
1096 {
1097 amp_0 = *(int16_t *)&this->noise;
1098 }
1099 else
1100 {
1101 uint32_t f = voice->position;
1102 amp_0 = (uint32_t)voice->samples;
1103
1104 asm volatile(
1105 "mov %[y1], %[f], lsr #12 \r\n"
1106 "eor %[f], %[f], %[y1], lsl #12 \r\n"
1107 "add %[y1], %[y0], %[y1], lsl #1 \r\n"
1108 "ldrsh %[y0], [%[y1], #2] \r\n"
1109 "ldrsh %[y1], [%[y1], #4] \r\n"
1110 "sub %[y1], %[y1], %[y0] \r\n"
1111 "mul %[f], %[y1], %[f] \r\n"
1112 "add %[y0], %[y0], %[f], asr #12 \r\n"
1113 : [f]"+r"(f), [y0]"+r"(amp_0), [y1]"=&r"(amp_1));
1114 }
1115
1116 voice->position += rate;
1117
1118 asm volatile(
1119 "mul %[amp_1], %[amp_0], %[envx] \r\n"
1120 "mov %[amp_0], %[amp_1], asr #11 \r\n"
1121 "mov %[amp_1], %[amp_0], asr #8 \r\n"
1122 : [amp_0]"+r"(amp_0), [amp_1]"=r"(amp_1)
1123 : [envx]"r"(voice->envx));
1124
1125 prev_outx = amp_0;
1126 raw_voice->outx = (int8_t)amp_1;
1127
1128 asm volatile(
1129 "mul %[amp_1], %[amp_0], %[vol_1] \r\n"
1130 "mul %[amp_0], %[vol_0], %[amp_0] \r\n"
1131 : [amp_0]"+r"(amp_0), [amp_1]"=&r"(amp_1)
1132 : [vol_0]"r"((int)voice->volume[0]),
1133 [vol_1]"r"((int)voice->volume[1]));
1134 #else /* Unoptimized CPU */
1135 int output;
1136
1137 if ( (this->r.g.noise_enables & vbit) == 0 )
1138 {
1139 int const fraction = voice->position & 0xfff;
1140 short const* const pos = (voice->samples + (voice->position >> 12)) + 1;
1141 output = pos[0] + ((fraction * (pos[1] - pos[0])) >> 12);
1142 } else {
1143 output = *(int16_t *)&this->noise;
1144 } 843 }
1145 844
1146 voice->position += rate;
1147
1148 output = (output * voice->envx) >> 11;
1149
1150 /* duplicated here to give compiler more to run in parallel */
1151 int amp_0 = voice->volume [0] * output;
1152 int amp_1 = voice->volume [1] * output;
1153
1154 prev_outx = output; 845 prev_outx = output;
1155 raw_voice->outx = (int8_t) (output >> 8); 846 raw_voice->outx = output >> 8;
1156 #endif /* CPU_* */
1157 #endif /* SPCNOINTERP */
1158 847
1159 #if SPC_BRRCACHE
1160 if ( voice->position >= voice->wave_end )
1161 {
1162 long loop_len = voice->wave_loop << 12;
1163 voice->position -= loop_len;
1164 this->r.g.wave_ended |= vbit;
1165 if ( !loop_len )
1166 {
1167 this->keys_down ^= vbit;
1168 raw_voice->envx = 0;
1169 voice->envx = 0;
1170 }
1171 }
1172 #endif
1173#if 0
1174 EXIT_TIMER(dsp_gen); 848 EXIT_TIMER(dsp_gen);
1175 849
1176 ENTER_TIMER(dsp_mix); 850 ENTER_TIMER(dsp_mix);
1177#endif 851
1178 chans_0 += amp_0; 852 chans_0 += amp_0;
1179 chans_1 += amp_1; 853 chans_1 += amp_1;
1180 #if !SPC_NOECHO 854 #if !SPC_NOECHO
1181 if ( this->r.g.echo_ons & vbit ) 855 if ( this->r.g.echo_ons & vbit )
1182 { 856 {
1183 echo_0 += amp_0; 857 echo_0 += amp_0;
1184 echo_1 += amp_1; 858 echo_1 += amp_1;
1185 } 859 }
1186 #endif 860 #endif /* !SPC_NOECHO */
1187#if 0 861
1188 EXIT_TIMER(dsp_mix); 862 EXIT_TIMER(dsp_mix);
1189#endif
1190 } 863 }
1191 /* end of voice loop */ 864 /* end of voice loop */
1192 865
866 /* Generate output */
867 int amp_0, amp_1;
1193 #if !SPC_NOECHO 868 #if !SPC_NOECHO
1194 #ifdef CPU_COLDFIRE
1195 /* Read feedback from echo buffer */ 869 /* Read feedback from echo buffer */
1196 int echo_pos = this->echo_pos; 870 int echo_pos = this->echo_pos;
1197 uint8_t* const echo_ptr = RAM + ((echo_start + echo_pos) & 0xFFFF); 871 uint8_t* const echo_ptr = RAM + ((echo_start + echo_pos) & 0xFFFF);
1198 echo_pos += 4;
1199 if ( echo_pos >= echo_wrap )
1200 echo_pos = 0;
1201 this->echo_pos = echo_pos;
1202 int fb = swap_odd_even32(*(int32_t *)echo_ptr);
1203 int out_0, out_1;
1204
1205 /* Keep last 8 samples */
1206 *this->last_fir_ptr = fb;
1207 this->last_fir_ptr = this->fir_ptr;
1208
1209 /* Apply echo FIR filter to output samples read from echo buffer -
1210 circular buffer is hardware incremented and masked; FIR
1211 coefficients and buffer history are loaded in parallel with
1212 multiply accumulate operations. Shift left by one here and once
1213 again when calculating feedback to have sample values justified
1214 to bit 31 in the output to ease endian swap, interleaving and
1215 clamping before placing result in the program's echo buffer. */
1216 int _0, _1, _2;
1217 asm volatile (
1218 "move.l (%[fir_c]) , %[_2] \r\n"
1219 "mac.w %[fb]u, %[_2]u, <<, (%[fir_p])+&, %[_0], %%acc0 \r\n"
1220 "mac.w %[fb]l, %[_2]u, <<, (%[fir_p])& , %[_1], %%acc1 \r\n"
1221 "mac.w %[_0]u, %[_2]l, << , %%acc0 \r\n"
1222 "mac.w %[_0]l, %[_2]l, <<, 4(%[fir_c]) , %[_2], %%acc1 \r\n"
1223 "mac.w %[_1]u, %[_2]u, <<, 4(%[fir_p])& , %[_0], %%acc0 \r\n"
1224 "mac.w %[_1]l, %[_2]u, <<, 8(%[fir_p])& , %[_1], %%acc1 \r\n"
1225 "mac.w %[_0]u, %[_2]l, << , %%acc0 \r\n"
1226 "mac.w %[_0]l, %[_2]l, <<, 8(%[fir_c]) , %[_2], %%acc1 \r\n"
1227 "mac.w %[_1]u, %[_2]u, <<, 12(%[fir_p])& , %[_0], %%acc0 \r\n"
1228 "mac.w %[_1]l, %[_2]u, <<, 16(%[fir_p])& , %[_1], %%acc1 \r\n"
1229 "mac.w %[_0]u, %[_2]l, << , %%acc0 \r\n"
1230 "mac.w %[_0]l, %[_2]l, <<, 12(%[fir_c]) , %[_2], %%acc1 \r\n"
1231 "mac.w %[_1]u, %[_2]u, <<, 20(%[fir_p])& , %[_0], %%acc0 \r\n"
1232 "mac.w %[_1]l, %[_2]u, << , %%acc1 \r\n"
1233 "mac.w %[_0]u, %[_2]l, << , %%acc0 \r\n"
1234 "mac.w %[_0]l, %[_2]l, << , %%acc1 \r\n"
1235 : [_0]"=&r"(_0), [_1]"=&r"(_1), [_2]"=&r"(_2),
1236 [fir_p]"+a"(this->fir_ptr)
1237 : [fir_c]"a"(this->fir_coeff), [fb]"r"(fb)
1238 );
1239
1240 /* Generate output */
1241 asm volatile (
1242 /* fetch filter results _after_ gcc loads asm
1243 block parameters to eliminate emac stalls */
1244 "movclr.l %%acc0, %[out_0] \r\n"
1245 "movclr.l %%acc1, %[out_1] \r\n"
1246 /* apply global volume */
1247 "mac.l %[chans_0], %[gv_0] , %%acc2 \r\n"
1248 "mac.l %[chans_1], %[gv_1] , %%acc3 \r\n"
1249 /* apply echo volume and add to final output */
1250 "mac.l %[ev_0], %[out_0], >>, %%acc2 \r\n"
1251 "mac.l %[ev_1], %[out_1], >>, %%acc3 \r\n"
1252 : [out_0]"=&r"(out_0), [out_1]"=&r"(out_1)
1253 : [chans_0]"r"(chans_0), [gv_0]"r"(global_vol_0),
1254 [ev_0]"r"((int)this->r.g.echo_volume_0),
1255 [chans_1]"r"(chans_1), [gv_1]"r"(global_vol_1),
1256 [ev_1]"r"((int)this->r.g.echo_volume_1)
1257 );
1258
1259 /* Feedback into echo buffer */
1260 if ( !(this->r.g.flags & 0x20) )
1261 {
1262 int sh = 1 << 9;
1263
1264 asm volatile (
1265 /* scale echo voices; saturate if overflow */
1266 "mac.l %[sh], %[e1] , %%acc1 \r\n"
1267 "mac.l %[sh], %[e0] , %%acc0 \r\n"
1268 /* add scaled output from FIR filter */
1269 "mac.l %[out_1], %[ef], <<, %%acc1 \r\n"
1270 "mac.l %[out_0], %[ef], <<, %%acc0 \r\n"
1271 /* swap and fetch feedback results - simply
1272 swap_odd_even32 mixed in between macs and
1273 movclrs to mitigate stall issues */
1274 "move.l #0x00ff00ff, %[sh] \r\n"
1275 "movclr.l %%acc1, %[e1] \r\n"
1276 "swap %[e1] \r\n"
1277 "movclr.l %%acc0, %[e0] \r\n"
1278 "move.w %[e1], %[e0] \r\n"
1279 "and.l %[e0], %[sh] \r\n"
1280 "eor.l %[sh], %[e0] \r\n"
1281 "lsl.l #8, %[sh] \r\n"
1282 "lsr.l #8, %[e0] \r\n"
1283 "or.l %[sh], %[e0] \r\n"
1284 /* save final feedback into echo buffer */
1285 "move.l %[e0], (%[echo_ptr]) \r\n"
1286 : [e0]"+d"(echo_0), [e1]"+d"(echo_1), [sh]"+d"(sh)
1287 : [out_0]"r"(out_0), [out_1]"r"(out_1),
1288 [ef]"r"((int)this->r.g.echo_feedback),
1289 [echo_ptr]"a"((int32_t *)echo_ptr)
1290 );
1291 }
1292 872
1293 /* Output final samples */
1294 asm volatile (
1295 /* fetch output saved in %acc2 and %acc3 */
1296 "movclr.l %%acc2, %[out_0] \r\n"
1297 "movclr.l %%acc3, %[out_1] \r\n"
1298 /* scale right by global_muting shift */
1299 "asr.l %[gm], %[out_0] \r\n"
1300 "asr.l %[gm], %[out_1] \r\n"
1301 : [out_0]"=&d"(out_0), [out_1]"=&d"(out_1)
1302 : [gm]"d"(global_muting)
1303 );
1304
1305 out_buf [ 0] = out_0;
1306 out_buf [WAV_CHUNK_SIZE] = out_1;
1307 out_buf ++;
1308 #elif defined (CPU_ARM)
1309 /* Read feedback from echo buffer */
1310 int echo_pos = this->echo_pos;
1311 uint8_t* const echo_ptr = RAM +
1312 ((this->r.g.echo_page * 0x100 + echo_pos) & 0xFFFF);
1313 echo_pos += 4; 873 echo_pos += 4;
1314 if ( echo_pos >= (this->r.g.echo_delay & 15) * 0x800 ) 874
875 if ( echo_pos >= echo_delay )
1315 echo_pos = 0; 876 echo_pos = 0;
1316 this->echo_pos = echo_pos;
1317 877
1318 #if ARM_ARCH >= 6 878 this->echo_pos = echo_pos;
1319 int32_t *fir_ptr, *fir_coeff;
1320 int fb_0, fb_1;
1321 879
1322 /* Apply FIR */ 880 /* Apply FIR */
1323 881 int fb_0, fb_1;
1324 /* Keep last 8 samples */ 882 echo_apply( this, echo_ptr, &fb_0, &fb_1 );
1325 asm volatile (
1326 "ldr %[fb_0], [%[echo_p]] \r\n"
1327 "add %[fir_p], %[t_fir_p], #4 \r\n"
1328 "bic %[t_fir_p], %[fir_p], %[mask] \r\n"
1329 "str %[fb_0], [%[fir_p], #-4] \r\n"
1330 /* duplicate at +8 eliminates wrap checking below */
1331 "str %[fb_0], [%[fir_p], #28] \r\n"
1332 : [fir_p]"=&r"(fir_ptr), [t_fir_p]"+r"(this->fir_ptr),
1333 [fb_0]"=&r"(fb_0)
1334 : [echo_p]"r"(echo_ptr), [mask]"i"(~FIR_BUF_MASK));
1335
1336 fir_coeff = (int32_t *)this->fir_coeff;
1337
1338 /* Fugly, but the best version found. */
1339 int _0;
1340 asm volatile ( /* L0R0 = acc0 */
1341 "ldmia %[fir_p]!, { r2-r5 } \r\n" /* L1R1-L4R4 = r2-r5 */
1342 "ldmia %[fir_c]!, { r0-r1 } \r\n" /* C0C1-C2C3 = r0-r1 */
1343 "pkhbt %[_0], %[acc0], r2, asl #16 \r\n" /* L0R0,L1R1->L0L1,R0R1 */
1344 "pkhtb r2, r2, %[acc0], asr #16 \r\n"
1345 "smuad %[acc0], %[_0], r0 \r\n" /* acc0=L0*C0+L1*C1 */
1346 "smuad %[acc1], r2, r0 \r\n" /* acc1=R0*C0+R1*C1 */
1347 "pkhbt %[_0], r3, r4, asl #16 \r\n" /* L2R2,L3R3->L2L3,R2R3 */
1348 "pkhtb r4, r4, r3, asr #16 \r\n"
1349 "smlad %[acc0], %[_0], r1, %[acc0] \r\n" /* acc0+=L2*C2+L3*C3 */
1350 "smlad %[acc1], r4, r1, %[acc1] \r\n" /* acc1+=R2*C2+R3*C3 */
1351 "ldmia %[fir_p], { r2-r4 } \r\n" /* L5R5-L7R7 = r2-r4 */
1352 "ldmia %[fir_c], { r0-r1 } \r\n" /* C4C5-C6C7 = r0-r1 */
1353 "pkhbt %[_0], r5, r2, asl #16 \r\n" /* L4R4,L5R5->L4L5,R4R5 */
1354 "pkhtb r2, r2, r5, asr #16 \r\n"
1355 "smlad %[acc0], %[_0], r0, %[acc0] \r\n" /* acc0+=L4*C4+L5*C5 */
1356 "smlad %[acc1], r2, r0, %[acc1] \r\n" /* acc1+=R4*C4+R5*C5 */
1357 "pkhbt %[_0], r3, r4, asl #16 \r\n" /* L6R6,L7R7->L6L7,R6R7 */
1358 "pkhtb r4, r4, r3, asr #16 \r\n"
1359 "smlad %[acc0], %[_0], r1, %[acc0] \r\n" /* acc0+=L6*C6+L7*C7 */
1360 "smlad %[acc1], r4, r1, %[acc1] \r\n" /* acc1+=R6*C6+R7*C7 */
1361 : [acc0]"+r"(fb_0), [acc1]"=&r"(fb_1), [_0]"=&r"(_0),
1362 [fir_p]"+r"(fir_ptr), [fir_c]"+r"(fir_coeff)
1363 :
1364 : "r0", "r1", "r2", "r3", "r4", "r5");
1365
1366 /* Generate output */
1367 int amp_0, amp_1;
1368
1369 asm volatile (
1370 "mul %[amp_0], %[gvol_0], %[chans_0] \r\n"
1371 "mul %[amp_1], %[gvol_1], %[chans_1] \r\n"
1372 : [amp_0]"=&r"(amp_0), [amp_1]"=r"(amp_1)
1373 : [gvol_0]"r"(global_vol_0), [gvol_1]"r"(global_vol_1),
1374 [chans_0]"r"(chans_0), [chans_1]"r"(chans_1));
1375 asm volatile (
1376 "mla %[amp_0], %[fb_0], %[ev_0], %[amp_0] \r\n"
1377 "mla %[amp_1], %[fb_1], %[ev_1], %[amp_1] \r\n"
1378 : [amp_0]"+r"(amp_0), [amp_1]"+r"(amp_1)
1379 : [fb_0]"r"(fb_0), [fb_1]"r"(fb_1),
1380 [ev_0]"r"((int)this->r.g.echo_volume_0),
1381 [ev_1]"r"((int)this->r.g.echo_volume_1));
1382
1383 out_buf [ 0] = amp_0 >> global_muting;
1384 out_buf [WAV_CHUNK_SIZE] = amp_1 >> global_muting;
1385 out_buf ++;
1386 883
1387 if ( !(this->r.g.flags & 0x20) ) 884 if ( !(this->r.g.flags & 0x20) )
1388 { 885 {
1389 /* Feedback into echo buffer */ 886 /* Feedback into echo buffer */
1390 int e0, e1; 887 echo_feedback( this, echo_ptr, echo_0, echo_1, fb_0, fb_1 );
1391
1392 asm volatile (
1393 "mov %[e0], %[echo_0], asl #7 \r\n"
1394 "mov %[e1], %[echo_1], asl #7 \r\n"
1395 "mla %[e0], %[fb_0], %[efb], %[e0] \r\n"
1396 "mla %[e1], %[fb_1], %[efb], %[e1] \r\n"
1397 : [e0]"=&r"(e0), [e1]"=&r"(e1)
1398 : [echo_0]"r"(echo_0), [echo_1]"r"(echo_1),
1399 [fb_0]"r"(fb_0), [fb_1]"r"(fb_1),
1400 [efb]"r"((int)this->r.g.echo_feedback));
1401 asm volatile (
1402 "ssat %[e0], #16, %[e0], asr #14 \r\n"
1403 "ssat %[e1], #16, %[e1], asr #14 \r\n"
1404 "pkhbt %[e0], %[e0], %[e1], lsl #16 \r\n"
1405 "str %[e0], [%[echo_p]] \r\n"
1406 : [e0]"+r"(e0), [e1]"+r"(e1)
1407 : [echo_p]"r"(echo_ptr));
1408 } 888 }
1409 #else /* ARM_ARCH < 6 */ 889 #endif /* !SPC_NOECHO */
1410 int fb_0 = GET_LE16SA( echo_ptr );
1411 int fb_1 = GET_LE16SA( echo_ptr + 2 );
1412 int32_t *fir_ptr, *fir_coeff;
1413
1414 /* Keep last 8 samples */
1415
1416 /* Apply FIR */
1417 asm volatile (
1418 "add %[fir_p], %[t_fir_p], #8 \r\n"
1419 "bic %[t_fir_p], %[fir_p], %[mask] \r\n"
1420 "str %[fb_0], [%[fir_p], #-8] \r\n"
1421 "str %[fb_1], [%[fir_p], #-4] \r\n"
1422 /* duplicate at +8 eliminates wrap checking below */
1423 "str %[fb_0], [%[fir_p], #56] \r\n"
1424 "str %[fb_1], [%[fir_p], #60] \r\n"
1425 : [fir_p]"=&r"(fir_ptr), [t_fir_p]"+r"(this->fir_ptr)
1426 : [fb_0]"r"(fb_0), [fb_1]"r"(fb_1), [mask]"i"(~FIR_BUF_MASK));
1427
1428 fir_coeff = this->fir_coeff;
1429
1430 asm volatile (
1431 "ldmia %[fir_c]!, { r0-r1 } \r\n"
1432 "ldmia %[fir_p]!, { r4-r5 } \r\n"
1433 "mul %[fb_0], r0, %[fb_0] \r\n"
1434 "mul %[fb_1], r0, %[fb_1] \r\n"
1435 "mla %[fb_0], r4, r1, %[fb_0] \r\n"
1436 "mla %[fb_1], r5, r1, %[fb_1] \r\n"
1437 "ldmia %[fir_c]!, { r0-r1 } \r\n"
1438 "ldmia %[fir_p]!, { r2-r5 } \r\n"
1439 "mla %[fb_0], r2, r0, %[fb_0] \r\n"
1440 "mla %[fb_1], r3, r0, %[fb_1] \r\n"
1441 "mla %[fb_0], r4, r1, %[fb_0] \r\n"
1442 "mla %[fb_1], r5, r1, %[fb_1] \r\n"
1443 "ldmia %[fir_c]!, { r0-r1 } \r\n"
1444 "ldmia %[fir_p]!, { r2-r5 } \r\n"
1445 "mla %[fb_0], r2, r0, %[fb_0] \r\n"
1446 "mla %[fb_1], r3, r0, %[fb_1] \r\n"
1447 "mla %[fb_0], r4, r1, %[fb_0] \r\n"
1448 "mla %[fb_1], r5, r1, %[fb_1] \r\n"
1449 "ldmia %[fir_c]!, { r0-r1 } \r\n"
1450 "ldmia %[fir_p]!, { r2-r5 } \r\n"
1451 "mla %[fb_0], r2, r0, %[fb_0] \r\n"
1452 "mla %[fb_1], r3, r0, %[fb_1] \r\n"
1453 "mla %[fb_0], r4, r1, %[fb_0] \r\n"
1454 "mla %[fb_1], r5, r1, %[fb_1] \r\n"
1455 : [fb_0]"+r"(fb_0), [fb_1]"+r"(fb_1),
1456 [fir_p]"+r"(fir_ptr), [fir_c]"+r"(fir_coeff)
1457 :
1458 : "r0", "r1", "r2", "r3", "r4", "r5");
1459
1460 /* Generate output */
1461 int amp_0 = (chans_0 * global_vol_0 + fb_0 * this->r.g.echo_volume_0)
1462 >> global_muting;
1463 int amp_1 = (chans_1 * global_vol_1 + fb_1 * this->r.g.echo_volume_1)
1464 >> global_muting;
1465
1466 out_buf [ 0] = amp_0;
1467 out_buf [WAV_CHUNK_SIZE] = amp_1;
1468 out_buf ++;
1469 890
1470 if ( !(this->r.g.flags & 0x20) ) 891 mix_output( this, global_muting, global_vol_0, global_vol_1,
1471 { 892 chans_0, chans_1, fb_0, fb_1, &amp_0, &amp_1 );
1472 /* Feedback into echo buffer */
1473 int e0 = (echo_0 >> 7) + ((fb_0 * this->r.g.echo_feedback) >> 14);
1474 int e1 = (echo_1 >> 7) + ((fb_1 * this->r.g.echo_feedback) >> 14);
1475 e0 = CLAMP16( e0 );
1476 SET_LE16A( echo_ptr , e0 );
1477 e1 = CLAMP16( e1 );
1478 SET_LE16A( echo_ptr + 2, e1 );
1479 }
1480 #endif /* ARM_ARCH */
1481 #else /* Unoptimized CPU */
1482 /* Read feedback from echo buffer */
1483 int echo_pos = this->echo_pos;
1484 uint8_t* const echo_ptr = RAM +
1485 ((this->r.g.echo_page * 0x100 + echo_pos) & 0xFFFF);
1486 echo_pos += 4;
1487 if ( echo_pos >= (this->r.g.echo_delay & 15) * 0x800 )
1488 echo_pos = 0;
1489 this->echo_pos = echo_pos;
1490 int fb_0 = GET_LE16SA( echo_ptr );
1491 int fb_1 = GET_LE16SA( echo_ptr + 2 );
1492
1493 /* Keep last 8 samples */
1494 int (* const fir_ptr) [2] = this->fir_buf + this->fir_pos;
1495 this->fir_pos = (this->fir_pos + 1) & (FIR_BUF_HALF - 1);
1496 fir_ptr [ 0] [0] = fb_0;
1497 fir_ptr [ 0] [1] = fb_1;
1498 /* duplicate at +8 eliminates wrap checking below */
1499 fir_ptr [FIR_BUF_HALF] [0] = fb_0;
1500 fir_ptr [FIR_BUF_HALF] [1] = fb_1;
1501
1502 /* Apply FIR */
1503 fb_0 *= this->fir_coeff [0];
1504 fb_1 *= this->fir_coeff [0];
1505 893
1506 #define DO_PT( i )\
1507 fb_0 += fir_ptr [i] [0] * this->fir_coeff [i];\
1508 fb_1 += fir_ptr [i] [1] * this->fir_coeff [i];
1509
1510 DO_PT( 1 )
1511 DO_PT( 2 )
1512 DO_PT( 3 )
1513 DO_PT( 4 )
1514 DO_PT( 5 )
1515 DO_PT( 6 )
1516 DO_PT( 7 )
1517
1518 /* Generate output */
1519 int amp_0 = (chans_0 * global_vol_0 + fb_0 * this->r.g.echo_volume_0)
1520 >> global_muting;
1521 int amp_1 = (chans_1 * global_vol_1 + fb_1 * this->r.g.echo_volume_1)
1522 >> global_muting;
1523 out_buf [ 0] = amp_0;
1524 out_buf [WAV_CHUNK_SIZE] = amp_1;
1525 out_buf ++;
1526
1527 if ( !(this->r.g.flags & 0x20) )
1528 {
1529 /* Feedback into echo buffer */
1530 int e0 = (echo_0 >> 7) + ((fb_0 * this->r.g.echo_feedback) >> 14);
1531 int e1 = (echo_1 >> 7) + ((fb_1 * this->r.g.echo_feedback) >> 14);
1532 e0 = CLAMP16( e0 );
1533 SET_LE16A( echo_ptr , e0 );
1534 e1 = CLAMP16( e1 );
1535 SET_LE16A( echo_ptr + 2, e1 );
1536 }
1537 #endif /* CPU_* */
1538 #else /* SPCNOECHO == 1*/
1539 /* Generate output */
1540 int amp_0 = (chans_0 * global_vol_0) >> global_muting;
1541 int amp_1 = (chans_1 * global_vol_1) >> global_muting;
1542 out_buf [ 0] = amp_0; 894 out_buf [ 0] = amp_0;
1543 out_buf [WAV_CHUNK_SIZE] = amp_1; 895 out_buf [WAV_CHUNK_SIZE] = amp_1;
1544 out_buf ++; 896 out_buf ++;
1545 #endif /* SPCNOECHO */
1546 } 897 }
1547 while ( --count ); 898 while ( --count );
1548#if 0 899
1549 EXIT_TIMER(dsp); 900 EXIT_TIMER(dsp);
1550 ENTER_TIMER(cpu); 901 ENTER_TIMER(cpu);
1551#endif
1552} 902}
1553 903
1554void DSP_reset( struct Spc_Dsp* this ) 904void DSP_reset( struct Spc_Dsp* this )
@@ -1563,31 +913,22 @@ void DSP_reset( struct Spc_Dsp* this )
1563 913
1564 ci->memset( this->voice_state, 0, sizeof this->voice_state ); 914 ci->memset( this->voice_state, 0, sizeof this->voice_state );
1565 915
1566 int i; 916 for ( int i = VOICE_COUNT; --i >= 0; )
1567 for ( i = VOICE_COUNT; --i >= 0; )
1568 { 917 {
1569 struct voice_t* v = this->voice_state + i; 918 struct voice_t* v = this->voice_state + i;
1570 v->env_mode = state_release; 919 v->env_mode = state_release;
1571 v->addr = ram.ram; 920 v->wave.addr = ram.ram;
1572 } 921 }
1573 922
1574 #if SPC_BRRCACHE 923#if SPC_BRRCACHE
1575 this->oldsize = 0; 924 this->oldsize = 0;
1576 for ( i = 0; i < 256; i++ ) 925 for ( int i = 0; i < 256; i++ )
1577 this->wave_entry [i].start_addr = -1; 926 this->wave_entry [i].start_addr = -1;
1578 #endif 927#endif /* SPC_BRRCACHE */
1579 928
1580#if defined(CPU_COLDFIRE) 929#if !SPC_NOECHO
1581 this->fir_ptr = fir_buf; 930 echo_init(this);
1582 this->last_fir_ptr = &fir_buf [7]; 931#endif /* SPC_NOECHO */
1583 ci->memset( fir_buf, 0, sizeof fir_buf );
1584#elif defined (CPU_ARM)
1585 this->fir_ptr = fir_buf;
1586 ci->memset( fir_buf, 0, sizeof fir_buf );
1587#else
1588 this->fir_pos = 0;
1589 ci->memset( this->fir_buf, 0, sizeof this->fir_buf );
1590#endif
1591 932
1592 assert( offsetof (struct globals_t,unused9 [2]) == REGISTER_COUNT ); 933 assert( offsetof (struct globals_t,unused9 [2]) == REGISTER_COUNT );
1593 assert( sizeof (this->r.voice) == REGISTER_COUNT ); 934 assert( sizeof (this->r.voice) == REGISTER_COUNT );