diff options
Diffstat (limited to 'apps/codecs')
-rw-r--r-- | apps/codecs/spc/Spc_Dsp.h | 93 |
1 files changed, 62 insertions, 31 deletions
diff --git a/apps/codecs/spc/Spc_Dsp.h b/apps/codecs/spc/Spc_Dsp.h
index 6b530a7a62..fdcd37f226 100644
--- a/apps/codecs/spc/Spc_Dsp.h
+++ b/apps/codecs/spc/Spc_Dsp.h
@@ -974,23 +974,35 @@ static void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) | |||
974 | if ( (this->r.g.noise_enables & vbit) == 0 ) | 974 | if ( (this->r.g.noise_enables & vbit) == 0 ) |
975 | { | 975 | { |
976 | uint32_t f = voice->position; | 976 | uint32_t f = voice->position; |
977 | int32_t y1; | 977 | int32_t y0; |
978 | 978 | ||
979 | /** | ||
980 | * Formula (fastest found so far of MANY): | ||
981 | * output = y0 + f*y1 - f*y0 | ||
982 | */ | ||
979 | asm volatile ( | 983 | asm volatile ( |
980 | "move.l %[f], %[y0] \r\n" /* separate fraction */ | 984 | /* separate fractional and whole parts */ |
981 | "and.l #0xfff, %[f] \r\n" /* and whole parts */ | 985 | "move.l %[f], %[y1] \r\n" |
982 | "lsr.l %[sh], %[y0] \r\n" | 986 | "and.l #0xfff, %[f] \r\n" |
983 | "move.l 2(%[s], %[y0].l*2), %[y1] \r\n" /* load two samples */ | 987 | "lsr.l %[sh], %[y1] \r\n" |
984 | "move.l %[y1], %[y0] \r\n" /* separate samples */ | 988 | /* load samples y0 (upper) & y1 (lower) */ |
985 | "ext.l %[y1] \r\n" /* y0=s[1], y1=s[2] */ | 989 | "move.l 2(%[s], %[y1].l*2), %[y1] \r\n" |
986 | "swap %[y0] \r\n" | 990 | /* %acc0 = f*y1 */ |
987 | "ext.l %[y0] \r\n" | 991 | "mac.w %[f]l, %[y1]l, %%acc0 \r\n" |
988 | "sub.l %[y0], %[y1] \r\n" /* diff = y1 - y0 */ | 992 | /* msac.w is 2% boostier so add negative */ |
989 | "muls.l %[f], %[y1] \r\n" /* y0 += f*diff */ | 993 | "neg.l %[f] \r\n" |
990 | "asr.l %[sh], %[y1] \r\n" | 994 | /* %acc0 -= f*y0 */ |
991 | "add.l %[y1], %[y0] \r\n" | 995 | "mac.w %[f]l, %[y1]u, %%acc0 \r\n" |
992 | : [f]"+&d"(f), [y0]"=&d"(output), [y1]"=&d"(y1) | 996 | /* separate out y0 and sign extend */ |
993 | : [s]"a"(voice->samples), [sh]"r"(12) | 997 | "swap %[y1] \r\n" |
998 | "movea.w %[y1], %[y0] \r\n" | ||
999 | /* fetch result, scale down and add y0 */ | ||
1000 | "movclr.l %%acc0, %[y1] \r\n" | ||
1001 | /* output = y0 + (result >> 12) */ | ||
1002 | "asr.l %[sh], %[y1] \r\n" | ||
1003 | "add.l %[y0], %[y1] \r\n" | ||
1004 | : [f]"+&d"(f), [y0]"=&a"(y0), [y1]"=&d"(output) | ||
1005 | : [s]"a"(voice->samples), [sh]"d"(12) | ||
994 | ); | 1006 | ); |
995 | } | 1007 | } |
996 | 1008 | ||
@@ -1093,9 +1105,13 @@ static void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) | |||
1093 | *this->last_fir_ptr = fb; | 1105 | *this->last_fir_ptr = fb; |
1094 | this->last_fir_ptr = this->fir_ptr; | 1106 | this->last_fir_ptr = this->fir_ptr; |
1095 | 1107 | ||
1096 | /* Apply echo FIR filter to output - circular buffer is hardware | 1108 | /* Apply echo FIR filter to output samples read from echo buffer - |
1097 | incremented and masked; FIR coefficients and buffer history are | 1109 | circular buffer is hardware incremented and masked; FIR |
1098 | loaded in parallel with multiply accumulate operations. */ | 1110 | coefficients and buffer history are loaded in parallel with |
1111 | multiply accumulate operations. Shift left by one here and once | ||
1112 | again when calculating feedback to have sample values justified | ||
1113 | to bit 31 in the output to ease endian swap, interleaving and | ||
1114 | clamping before placing result in the program's echo buffer. */ | ||
1099 | int _0, _1, _2; | 1115 | int _0, _1, _2; |
1100 | asm volatile ( | 1116 | asm volatile ( |
1101 | "move.l (%[fir_c]) , %[_2] \r\n" | 1117 | "move.l (%[fir_c]) , %[_2] \r\n" |
@@ -1115,53 +1131,68 @@ static void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf ) | |||
1115 | "mac.w %[_1]l, %[_2]u, << , %%acc1 \r\n" | 1131 | "mac.w %[_1]l, %[_2]u, << , %%acc1 \r\n" |
1116 | "mac.w %[_0]u, %[_2]l, << , %%acc0 \r\n" | 1132 | "mac.w %[_0]u, %[_2]l, << , %%acc0 \r\n" |
1117 | "mac.w %[_0]l, %[_2]l, << , %%acc1 \r\n" | 1133 | "mac.w %[_0]l, %[_2]l, << , %%acc1 \r\n" |
1118 | "movclr.l %%acc0, %[out_0] \r\n" | ||
1119 | "movclr.l %%acc1, %[out_1] \r\n" | ||
1120 | : [_0]"=&r"(_0), [_1]"=&r"(_1), [_2]"=&r"(_2), | 1134 | : [_0]"=&r"(_0), [_1]"=&r"(_1), [_2]"=&r"(_2), |
1121 | [fir_p]"+a"(this->fir_ptr), | 1135 | [fir_p]"+a"(this->fir_ptr) |
1122 | [out_0]"=r"(out_0), [out_1]"=r"(out_1) | ||
1123 | : [fir_c]"a"(this->fir_coeff), [fb]"r"(fb) | 1136 | : [fir_c]"a"(this->fir_coeff), [fb]"r"(fb) |
1124 | ); | 1137 | ); |
1125 | 1138 | ||
1126 | /* Generate output */ | 1139 | /* Generate output */ |
1127 | asm volatile ( | 1140 | asm volatile ( |
1141 | /* fetch filter results to eliminate stalls */ | ||
1142 | "movclr.l %%acc0, %[out_0] \r\n" | ||
1143 | "movclr.l %%acc1, %[out_1] \r\n" | ||
1144 | /* apply global volume */ | ||
1128 | "mac.l %[chans_0], %[gv_0] , %%acc2 \r\n" | 1145 | "mac.l %[chans_0], %[gv_0] , %%acc2 \r\n" |
1129 | "mac.l %[chans_1], %[gv_1] , %%acc3 \r\n" | 1146 | "mac.l %[chans_1], %[gv_1] , %%acc3 \r\n" |
1147 | /* apply echo volume and add to final output */ | ||
1130 | "mac.l %[ev_0], %[out_0], >>, %%acc2 \r\n" | 1148 | "mac.l %[ev_0], %[out_0], >>, %%acc2 \r\n" |
1131 | "mac.l %[ev_1], %[out_1], >>, %%acc3 \r\n" | 1149 | "mac.l %[ev_1], %[out_1], >>, %%acc3 \r\n" |
1132 | : | 1150 | : [out_0]"=&r"(out_0), [out_1]"=&r"(out_1) |
1133 | : [chans_0]"r"(chans_0), [gv_0]"r"(global_vol_0), | 1151 | : [chans_0]"r"(chans_0), [gv_0]"r"(global_vol_0), |
1134 | [ev_0]"r"((int)this->r.g.echo_volume_0), | 1152 | [ev_0]"r"((int)this->r.g.echo_volume_0), |
1135 | [chans_1]"r"(chans_1), [gv_1]"r"(global_vol_1), | 1153 | [chans_1]"r"(chans_1), [gv_1]"r"(global_vol_1), |
1136 | [ev_1]"r"((int)this->r.g.echo_volume_1), | 1154 | [ev_1]"r"((int)this->r.g.echo_volume_1) |
1137 | [out_0]"r"(out_0), [out_1]"r"(out_1) | ||
1138 | ); | 1155 | ); |
1139 | 1156 | ||
1140 | /* Feedback into echo buffer */ | 1157 | /* Feedback into echo buffer */ |
1141 | if ( !(this->r.g.flags & 0x20) ) | 1158 | if ( !(this->r.g.flags & 0x20) ) |
1142 | { | 1159 | { |
1143 | asm volatile ( | 1160 | asm volatile ( |
1144 | "mac.l %[sh], %[e0] , %%acc0 \r\n" | 1161 | /* scale echo voices; saturate if overflow */ |
1145 | "mac.l %[out_0], %[ef], <<, %%acc0 \r\n" | ||
1146 | "mac.l %[sh], %[e1] , %%acc1 \r\n" | 1162 | "mac.l %[sh], %[e1] , %%acc1 \r\n" |
1163 | "mac.l %[sh], %[e0] , %%acc0 \r\n" | ||
1164 | /* add scaled output from FIR filter */ | ||
1147 | "mac.l %[out_1], %[ef], <<, %%acc1 \r\n" | 1165 | "mac.l %[out_1], %[ef], <<, %%acc1 \r\n" |
1148 | "movclr.l %%acc0, %[e0] \r\n" | 1166 | "mac.l %[out_0], %[ef], <<, %%acc0 \r\n" |
1167 | /* swap and fetch feedback results - simply | ||
1168 | swap_odd_even32 mixed in between macs and | ||
1169 | movclrs to mitigate stall issues */ | ||
1170 | "move.l #0x00ff00ff, %[sh] \r\n" | ||
1149 | "movclr.l %%acc1, %[e1] \r\n" | 1171 | "movclr.l %%acc1, %[e1] \r\n" |
1150 | "swap %[e1] \r\n" | 1172 | "swap %[e1] \r\n" |
1173 | "movclr.l %%acc0, %[e0] \r\n" | ||
1151 | "move.w %[e1], %[e0] \r\n" | 1174 | "move.w %[e1], %[e0] \r\n" |
1175 | "and.l %[e0], %[sh] \r\n" | ||
1176 | "eor.l %[sh], %[e0] \r\n" | ||
1177 | "lsl.l #8, %[sh] \r\n" | ||
1178 | "lsr.l #8, %[e0] \r\n" | ||
1179 | "or.l %[sh], %[e0] \r\n" | ||
1180 | /* save final feedback into echo buffer */ | ||
1181 | "move.l %[e0], (%[echo_ptr]) \r\n" | ||
1152 | : [e0]"+&d"(echo_0), [e1]"+&d"(echo_1) | 1182 | : [e0]"+&d"(echo_0), [e1]"+&d"(echo_1) |
1153 | : [out_0]"r"(out_0), [out_1]"r"(out_1), | 1183 | : [out_0]"r"(out_0), [out_1]"r"(out_1), |
1154 | [ef]"r"((int)this->r.g.echo_feedback), | 1184 | [ef]"r"((int)this->r.g.echo_feedback), |
1155 | [sh]"r"(1 << 9) | 1185 | [echo_ptr]"a"((int32_t *)echo_ptr), |
1186 | [sh]"d"(1 << 9) | ||
1156 | ); | 1187 | ); |
1157 | |||
1158 | *(int32_t *)echo_ptr = swap_odd_even32(echo_0); | ||
1159 | } | 1188 | } |
1160 | 1189 | ||
1161 | /* Output final samples */ | 1190 | /* Output final samples */ |
1162 | asm volatile ( | 1191 | asm volatile ( |
1192 | /* fetch output saved in %acc2 and %acc3 */ | ||
1163 | "movclr.l %%acc2, %[out_0] \r\n" | 1193 | "movclr.l %%acc2, %[out_0] \r\n" |
1164 | "movclr.l %%acc3, %[out_1] \r\n" | 1194 | "movclr.l %%acc3, %[out_1] \r\n" |
1195 | /* scale right by global_muting shift */ | ||
1165 | "asr.l %[gm], %[out_0] \r\n" | 1196 | "asr.l %[gm], %[out_0] \r\n" |
1166 | "asr.l %[gm], %[out_1] \r\n" | 1197 | "asr.l %[gm], %[out_1] \r\n" |
1167 | : [out_0]"=&d"(out_0), [out_1]"=&d"(out_1) | 1198 | : [out_0]"=&d"(out_0), [out_1]"=&d"(out_1) |