Added asm optimized short block IMDCT and windowing.

Removed a warning in synth.c. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@6159 a1c6a512-1295-4272-9138-f99709370657
author: Thom Johansen <thomj@rockbox.org> 2005-03-06 22:13:44 +0000
committer: Thom Johansen <thomj@rockbox.org> 2005-03-06 22:13:44 +0000
commit: e78186f4cfe0adae87cf499b73a75807d019d9bc (patch)
tree: 3d051bd1bbc80a5d43bce3bc7b238e76755425cc /apps/codecs
parent: 6a33dd761eab41585fc1eaed933e561724454d99 (diff)
download: rockbox-e78186f4cfe0adae87cf499b73a75807d019d9bc.tar.gz
rockbox-e78186f4cfe0adae87cf499b73a75807d019d9bc.zip
2 files changed, 116 insertions, 3 deletions
diff --git a/apps/codecs/libmad/layer3.c b/apps/codecs/libmad/layer3.c
index b1a9919af0..27c8d18430 100644
--- a/apps/codecs/libmad/layer3.c
+++ b/apps/codecs/libmad/layer3.c
@@ -2144,6 +2144,116 @@ void III_imdct_l(mad_fixed_t const X[18], mad_fixed_t z[36],
 * NAME:        III_imdct_s()
 * DESCRIPTION: perform IMDCT and windowing for short blocks
 */
+# if CONFIG_CPU==MCF5249 && !defined(SIMULATOR)
+/* this should probably be stuffed in a .S file somewhere, it's almost
+   100% asm as it is.
+ */
+/*
+ * ColdFire EMAC version of III_imdct_s(): IMDCT and windowing for short
+ * blocks.
+ *
+ * X: 18 input samples, consumed as three groups of six.
+ * z: 36 output samples. z[0..5] and z[30..35] are cleared, z[6..29]
+ *    receive the windowed and overlapped IMDCT output.
+ *
+ * The asm hard-codes 4-byte elements (all index scaling is *4).
+ * The first asm statement programs MACSR and the second one relies on
+ * that setting; MACSR is not restored before returning.
+ *
+ * Loop labels are numeric local labels so that the assembler does not see
+ * duplicate symbols if the compiler inlines or duplicates this function.
+ * Both statements declare every data register they modify plus "memory",
+ * because they read and write X[], y[] and z[] behind the compiler's back.
+ */
+static
+void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36])
+{
+  mad_fixed_t y[36], *yptr;
+  mad_fixed_t const *wptr;
+  /* IMDCT */
+  yptr = &y[0];
+  /* if additional precision is needed in this block, it is possible to
+   * get more low bits out of the accext01 register _before_ doing the
+   * movclrs.
+   */
+  /* NOTE(review): each outer pass performs 37 loads from imdct_s (one
+   * preload plus 36 post-increment loads inside the mac instructions).
+   * If the table holds exactly 36 words, the last load reads one word
+   * past its end; the loaded value is never used. Confirm table size.
+   */
+  asm volatile (
+    "move.l #0x000000b0, %%macsr\n\t" /* frac. mode, saturation, rounding */
+    "suba.l %%a0, %%a0\n\t"         /* clear loop variable */
+    ".align 2\n\t1:\n\t"            /* outer loop label */
+    "lea.l imdct_s, %%a1\n\t"       /* load pointer to imdct coefs in a1 */
+    "movem.l (%[X]), %%d0-%%d5\n\t" /* load input data in d0-d5 */
+
+    "clr.l %%d7\n\t"                /* init loop variable */
+    "move.l (%%a1)+, %%a5\n\t"      /* load imdct coef in a5 */
+    ".align 2\n\t2:\n\t"            /* inner loop label */
+    "mac.l %%d0, %%a5, (%%a1)+, %%a5, %%acc0\n\t" /* mac sequence */
+    "mac.l %%d1, %%a5, (%%a1)+, %%a5, %%acc0\n\t"
+    "mac.l %%d2, %%a5, (%%a1)+, %%a5, %%acc0\n\t"
+    "mac.l %%d3, %%a5, (%%a1)+, %%a5, %%acc0\n\t"
+    "mac.l %%d4, %%a5, (%%a1)+, %%a5, %%acc0\n\t"
+    "mac.l %%d5, %%a5, (%%a1)+, %%a5, %%acc0\n\t"
+    "movclr.l %%acc0, %%d6\n\t"     /* get result, left shifted once */
+    "asl.l #3, %%d6\n\t"            /* got one shift free, shift three more */
+    "mov.l %%d6, (%[yptr], %%d7.l*4)\n\t"         /* yptr[i] = result */
+    "neg.l %%d6\n\t"
+    "neg.l %%d7\n\t"                /* d7 = -i for the mirrored stores */
+    "mov.l %%d6, (5*4, %[yptr], %%d7.l*4)\n\t"    /* yptr[5 - i] = -result */
+    "mac.l %%d0, %%a5, (%%a1)+, %%a5, %%acc0\n\t" /* mac sequence */
+    "mac.l %%d1, %%a5, (%%a1)+, %%a5, %%acc0\n\t"
+    "mac.l %%d2, %%a5, (%%a1)+, %%a5, %%acc0\n\t"
+    "mac.l %%d3, %%a5, (%%a1)+, %%a5, %%acc0\n\t"
+    "mac.l %%d4, %%a5, (%%a1)+, %%a5, %%acc0\n\t"
+    "mac.l %%d5, %%a5, (%%a1)+, %%a5, %%acc0\n\t"
+    "movclr.l %%acc0, %%d6\n\t"    /* get result */
+    "asl.l #3, %%d6\n\t"
+    "mov.l %%d6, (11*4, %[yptr], %%d7.l*4)\n\t"   /* yptr[11 - i] = result */
+    "neg.l %%d7\n\t"                /* d7 = i again */
+    "mov.l %%d6, (6*4, %[yptr], %%d7.l*4)\n\t"    /* yptr[i + 6] = result */
+    "addq.l #1, %%d7\n\t"           /* increment inner loop variable */
+    "cmp.l #3, %%d7\n\t"            /* we do three inner loop iterations */
+    "jne 2b\n\t"
+    "adda.l #48, %[yptr]\n\t"       /* add pointer increment (12 words) */
+    "adda.l #24, %[X]\n\t"          /* next group of six inputs */
+    "addq.l #1, %%a0\n\t"           /* increment outer loop variable */
+    "cmpa.l #3, %%a0\n\t"           /* we do three outer loop iterations */
+    "jne 1b\n\t"
+    : [X] "+a" (X), [yptr] "+a" (yptr)
+    : : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a0", "a1", "a5",
+        "cc", "memory");
+  /* windowing, overlapping and concatenation */
+  yptr = &y[0];
+  wptr = &window_s[0];
+  /* d0 = wptr[0], d1 = wptr[6]; d2 walks yptr[0], [6], [12], ... [30] */
+  asm volatile (
+    "clr.l %%d7\n\t"
+    ".align 2\n\t3:\n\t"
+    "clr.l (%[z], %%d7.l*4)\n\t" /* z[i + 0] = 0 */
+    "move.l (%[wptr]), %%d0\n\t"
+    "move.l (%[yptr]), %%d2\n\t"
+    "mac.l %%d0, %%d2, 24(%[wptr]), %%d1, %%acc0\n\t"
+    "movclr.l %%acc0, %%d6\n\t"
+    "asl.l #3, %%d6\n\t"
+    "move.l %%d6, (6*4, %[z], %%d7.l*4)\n\t" /* z[i + 6] = result */
+
+    "move.l 24(%[yptr]), %%d2\n\t"
+    "mac.l %%d1, %%d2, 48(%[yptr]), %%d2, %%acc0\n\t"
+    "mac.l %%d0, %%d2, 72(%[yptr]), %%d2, %%acc0\n\t"
+    "movclr.l %%acc0, %%d6\n\t"
+    "asl.l #3, %%d6\n\t"
+    "move.l %%d6, (12*4, %[z], %%d7.l*4)\n\t" /* z[i + 12] = result */
+
+    "mac.l %%d1, %%d2, (24*4, %[yptr]), %%d2, %%acc0\n\t"
+    "mac.l %%d0, %%d2, (30*4, %[yptr]), %%d2, %%acc0\n\t"
+    "movclr.l %%acc0, %%d6\n\t"
+    "asl.l #3, %%d6\n\t"
+    "move.l %%d6, (18*4, %[z], %%d7.l*4)\n\t" /* z[i + 18] = result */
+
+    "mac.l %%d1, %%d2, %%acc0\n\t"
+    "movclr.l %%acc0, %%d6\n\t"
+    "asl.l #3, %%d6\n\t"
+    "move.l %%d6, (24*4, %[z], %%d7.l*4)\n\t"   /* z[i + 24] = result */
+
+    "clr.l (30*4, %[z], %%d7.l*4)\n\t"       /* z[i + 30] = 0 */
+    "addq.l #1, %%d7\n\t"
+    "addq.l #4, %[yptr]\n\t"
+    "addq.l #4, %[wptr]\n\t"
+    "cmp.l #6, %%d7\n\t"                    /* six iterations */
+    "jne 3b\n\t"
+    : [yptr] "+a" (yptr), [wptr] "+a" (wptr)
+    : [z] "a" (z)
+    : "d0", "d1", "d2", "d6", "d7", "cc", "memory");
+}
+#else
 static
 void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36])
 {
@@ -2219,6 +2329,8 @@ void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36])
  }
 }
+#endif
 /*
 * NAME:        III_overlap()
 * DESCRIPTION: perform overlap-add of windowed IMDCT outputs
diff --git a/apps/codecs/libmad/synth.c b/apps/codecs/libmad/synth.c
index 530f33cdf6..2d212c091f 100644
--- a/apps/codecs/libmad/synth.c
+++ b/apps/codecs/libmad/synth.c
@@ -24,6 +24,7 @@
 # endif
 # include "global.h"
 # include "fixed.h"
 # include "frame.h"
 # include "synth.h"
@@ -100,6 +101,7 @@ void mad_synth_mute(struct mad_synth *synth)
 # endif
 /* possible DCT speed optimization */
 # if defined(OPT_SPEED) && defined(MAD_F_MLX)
 #  define OPT_DCTO
 #  define MUL(x, y)  \
@@ -112,6 +114,7 @@ void mad_synth_mute(struct mad_synth *synth)
 #  undef OPT_DCTO
 #  define MUL(x, y)  mad_f_mul((x), (y))
 # endif
 /*
 * NAME:        dct32()
 * DESCRIPTION: perform fast in[32]->out[32] DCT
@@ -547,7 +550,6 @@ mad_fixed_t const D[17][32] __attribute__ ((section(".idata"))) = {
 void synth_full(struct mad_synth *, struct mad_frame const *,
                unsigned int, unsigned int);
 # else
 /*
 * NAME:        synth->full()
 * DESCRIPTION: perform full frequency PCM synthesis
@@ -563,7 +565,7 @@ void synth_full(struct mad_synth *synth, struct mad_frame const *frame,
  mad_fixed_t *pcm1, *pcm2, (*filter)[2][2][16][8];
  mad_fixed_t const (*sbsample)[36][32];
  mad_fixed_t (*fe)[8], (*fx)[8], (*fo)[8];
-  mad_fixed_t const (*Dptr)[32], *ptr;
+  mad_fixed_t const (*Dptr)[32];
  mad_fixed64hi_t hi = 0;
  mad_fixed64lo_t lo;
@@ -1010,4 +1012,3 @@ void mad_synth_frame(struct mad_synth *synth, struct mad_frame const *frame)
  synth->phase = (synth->phase + ns) % 16;
 }
author	Thom Johansen <thomj@rockbox.org>	2005-03-06 22:13:44 +0000
committer	Thom Johansen <thomj@rockbox.org>	2005-03-06 22:13:44 +0000
commit	e78186f4cfe0adae87cf499b73a75807d019d9bc (patch)
tree	3d051bd1bbc80a5d43bce3bc7b238e76755425cc /apps/codecs
parent	6a33dd761eab41585fc1eaed933e561724454d99 (diff)
download	rockbox-e78186f4cfe0adae87cf499b73a75807d019d9bc.tar.gz rockbox-e78186f4cfe0adae87cf499b73a75807d019d9bc.zip

diff --git a/apps/codecs/libmad/layer3.c b/apps/codecs/libmad/layer3.c index b1a9919af0..27c8d18430 100644 --- a/apps/codecs/libmad/layer3.c +++ b/apps/codecs/libmad/layer3.c
@@ -2144,6 +2144,116 @@ void III_imdct_l(mad_fixed_t const X[18], mad_fixed_t z[36],
2144	* NAME: III_imdct_s()	2144	* NAME: III_imdct_s()
2145	* DESCRIPTION: perform IMDCT and windowing for short blocks	2145	* DESCRIPTION: perform IMDCT and windowing for short blocks
2146	*/	2146	*/
		2147
		2148	# if CONFIG_CPU==MCF5249 && !defined(SIMULATOR)
		2149	/* this should probably be stuffed in a .S file somewhere, it's almost
		2150	100% asm as it is.
		2151	*/
		2152	static
		2153	void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36])
		2154	{
		2155	mad_fixed_t y[36], *yptr;
		2156	mad_fixed_t const *wptr;
		2157
		2158	/* IMDCT */
		2159	yptr = &y[0];
		2160
		2161	/* if additional precision is needed in this block, it is possible to
		2162	* get more low bits out of the accext01 register _before_ doing the
		2163	* movclrs.
		2164	*/
		2165	asm volatile (
		2166	"move.l #0x000000b0, %%macsr\n\t" /* frac. mode, saturation, rounding */
		2167	"suba.l %%a0, %%a0\n\t" /* clear loop variable */
		2168	".align 2\n\t.imdctloop:\n\t" /* outer loop label */
		2169	"lea.l imdct_s, %%a1\n\t" /* load pointer to imdct coefs in a1 */
		2170	"movem.l (%[X]), %%d0-%%d5\n\t" /* load input data in d0-d5 */
		2171
		2172	"clr.l %%d7\n\t" /* init loop variable */
		2173	"move.l (%%a1)+, %%a5\n\t" /* load imdct coef in a5 */
		2174	".align 2\n\t.macloop:\n\t" /* inner loop label */
		2175	"mac.l %%d0, %%a5, (%%a1)+, %%a5, %%acc0\n\t" /* mac sequence */
		2176	"mac.l %%d1, %%a5, (%%a1)+, %%a5, %%acc0\n\t"
		2177	"mac.l %%d2, %%a5, (%%a1)+, %%a5, %%acc0\n\t"
		2178	"mac.l %%d3, %%a5, (%%a1)+, %%a5, %%acc0\n\t"
		2179	"mac.l %%d4, %%a5, (%%a1)+, %%a5, %%acc0\n\t"
		2180	"mac.l %%d5, %%a5, (%%a1)+, %%a5, %%acc0\n\t"
		2181	"movclr.l %%acc0, %%d6\n\t" /* get result, left shifted once */
		2182	"asl.l #3, %%d6\n\t" /* got one shift free, shift three more */
		2183	"mov.l %%d6, (%[yptr], %%d7.l*4)\n\t" /* yptr[i] = result */
		2184	"neg.l %%d6\n\t"
		2185	"neg.l %%d7\n\t"
		2186	"mov.l %%d6, (5*4, %[yptr], %%d7.l*4)\n\t" /* yptr[5 - i] = -result */
		2187	"mac.l %%d0, %%a5, (%%a1)+, %%a5, %%acc0\n\t" /* mac sequence */
		2188	"mac.l %%d1, %%a5, (%%a1)+, %%a5, %%acc0\n\t"
		2189	"mac.l %%d2, %%a5, (%%a1)+, %%a5, %%acc0\n\t"
		2190	"mac.l %%d3, %%a5, (%%a1)+, %%a5, %%acc0\n\t"
		2191	"mac.l %%d4, %%a5, (%%a1)+, %%a5, %%acc0\n\t"
		2192	"mac.l %%d5, %%a5, (%%a1)+, %%a5, %%acc0\n\t"
		2193	"movclr.l %%acc0, %%d6\n\t" /* get result */
		2194	"asl.l #3, %%d6\n\t"
		2195	"mov.l %%d6, (11*4, %[yptr], %%d7.l*4)\n\t" /* yptr[11 - i] = result*/
		2196	"neg.l %%d7\n\t"
		2197	"mov.l %%d6, (6*4, %[yptr], %%d7.l*4)\n\t" /* yptr[i + 6] = result */
		2198	"addq.l #1, %%d7\n\t" /* increment inner loop variable */
		2199	"cmp.l #3, %%d7\n\t" /* we do three inner loop iterations */
		2200	"jne .macloop\n\t"
		2201
		2202	"adda.l #48, %[yptr]\n\t" /* add pointer increment */
		2203	"adda.l #24, %[X]\n\t"
		2204	"addq.l #1, %%a0\n\t" /* increment outer loop variable */
		2205	"cmpa.l #3, %%a0\n\t" /* we do three outer loop iterations */
		2206	"jne .imdctloop\n\t"
		2207	: [X] "+a" (X), [yptr] "+a" (yptr)
		2208	: : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a0", "a1", "a5");
		2209
		2210	/* windowing, overlapping and concatenation */
		2211
		2212	yptr = &y[0];
		2213	wptr = &window_s[0];
		2214
		2215	asm volatile (
		2216	"clr.l %%d7\n\t"
		2217	".align 2\n\t.overlaploop:\n\t"
		2218	"clr.l (%[z], %%d7.l*4)\n\t" /* z[i + 0] = 0 */
		2219	"move.l (%[wptr]), %%d0\n\t"
		2220	"move.l (%[yptr]), %%d2\n\t"
		2221	"mac.l %%d0, %%d2, 24(%[wptr]), %%d1, %%acc0\n\t"
		2222	"movclr.l %%acc0, %%d6\n\t"
		2223	"asl.l #3, %%d6\n\t"
		2224	"move.l %%d6, (6*4, %[z], %%d7.l*4)\n\t" /* z[i + 6] = result */
		2225
		2226	"move.l 24(%[yptr]), %%d2\n\t"
		2227	"mac.l %%d1, %%d2, 48(%[yptr]), %%d2, %%acc0\n\t"
		2228	"mac.l %%d0, %%d2, 72(%[yptr]), %%d2, %%acc0\n\t"
		2229	"movclr.l %%acc0, %%d6\n\t"
		2230	"asl.l #3, %%d6\n\t"
		2231	"move.l %%d6, (12*4, %[z], %%d7.l*4)\n\t" /* z[i + 12] = result */
		2232
		2233	"mac.l %%d1, %%d2, (24*4, %[yptr]), %%d2, %%acc0\n\t"
		2234	"mac.l %%d0, %%d2, (30*4, %[yptr]), %%d2, %%acc0\n\t"
		2235	"movclr.l %%acc0, %%d6\n\t"
		2236	"asl.l #3, %%d6\n\t"
		2237	"move.l %%d6, (18*4, %[z], %%d7.l*4)\n\t" /* z[i + 18] = result */
		2238
		2239	"mac.l %%d1, %%d2, %%acc0\n\t"
		2240	"movclr.l %%acc0, %%d6\n\t"
		2241	"asl.l #3, %%d6\n\t"
		2242	"move.l %%d6, (24*4, %[z], %%d7.l*4)\n\t" /* z[i + 24] = result */
		2243
		2244	"clr.l (30*4, %[z], %%d7.l*4)\n\t" /* z[i + 30] = 0 */
		2245	"addq.l #1, %%d7\n\t"
		2246	"addq.l #4, %[yptr]\n\t"
		2247	"addq.l #4, %[wptr]\n\t"
		2248	"cmp.l #6, %%d7\n\t" /* six iterations */
		2249	"jne .overlaploop\n\t"
		2250	: [yptr] "+a" (yptr), [wptr] "+a" (wptr)
		2251	: [z] "a" (z)
		2252	: "d7");
		2253	}
		2254
		2255	#else
		2256
2147	static	2257	static
2148	void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36])	2258	void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36])
2149	{	2259	{
@@ -2219,6 +2329,8 @@ void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36])
2219	}	2329	}
2220	}	2330	}
2221		2331
		2332	#endif
		2333
2222	/*	2334	/*
2223	* NAME: III_overlap()	2335	* NAME: III_overlap()
2224	* DESCRIPTION: perform overlap-add of windowed IMDCT outputs	2336	* DESCRIPTION: perform overlap-add of windowed IMDCT outputs


diff --git a/apps/codecs/libmad/synth.c b/apps/codecs/libmad/synth.c index 530f33cdf6..2d212c091f 100644 --- a/apps/codecs/libmad/synth.c +++ b/apps/codecs/libmad/synth.c
@@ -24,6 +24,7 @@
24	# endif	24	# endif
25		25
26	# include "global.h"	26	# include "global.h"
		27
27	# include "fixed.h"	28	# include "fixed.h"
28	# include "frame.h"	29	# include "frame.h"
29	# include "synth.h"	30	# include "synth.h"
@@ -100,6 +101,7 @@ void mad_synth_mute(struct mad_synth *synth)
100	# endif	101	# endif
101		102
102	/* possible DCT speed optimization */	103	/* possible DCT speed optimization */
		104
103	# if defined(OPT_SPEED) && defined(MAD_F_MLX)	105	# if defined(OPT_SPEED) && defined(MAD_F_MLX)
104	# define OPT_DCTO	106	# define OPT_DCTO
105	# define MUL(x, y) \	107	# define MUL(x, y) \
@@ -112,6 +114,7 @@ void mad_synth_mute(struct mad_synth *synth)
112	# undef OPT_DCTO	114	# undef OPT_DCTO
113	# define MUL(x, y) mad_f_mul((x), (y))	115	# define MUL(x, y) mad_f_mul((x), (y))
114	# endif	116	# endif
		117
115	/*	118	/*
116	* NAME: dct32()	119	* NAME: dct32()
117	* DESCRIPTION: perform fast in[32]->out[32] DCT	120	* DESCRIPTION: perform fast in[32]->out[32] DCT
@@ -547,7 +550,6 @@ mad_fixed_t const D[17][32] __attribute__ ((section(".idata"))) = {
547	void synth_full(struct mad_synth *, struct mad_frame const *,	550	void synth_full(struct mad_synth *, struct mad_frame const *,
548	unsigned int, unsigned int);	551	unsigned int, unsigned int);
549	# else	552	# else
550
551	/*	553	/*
552	* NAME: synth->full()	554	* NAME: synth->full()
553	* DESCRIPTION: perform full frequency PCM synthesis	555	* DESCRIPTION: perform full frequency PCM synthesis
@@ -563,7 +565,7 @@ void synth_full(struct mad_synth *synth, struct mad_frame const *frame,
563	mad_fixed_t *pcm1, *pcm2, (*filter)[2][2][16][8];	565	mad_fixed_t *pcm1, *pcm2, (*filter)[2][2][16][8];
564	mad_fixed_t const (*sbsample)[36][32];	566	mad_fixed_t const (*sbsample)[36][32];
565	mad_fixed_t (*fe)[8], (*fx)[8], (*fo)[8];	567	mad_fixed_t (*fe)[8], (*fx)[8], (*fo)[8];
566	mad_fixed_t const (*Dptr)[32], *ptr;	568	mad_fixed_t const (*Dptr)[32];
567	mad_fixed64hi_t hi = 0;	569	mad_fixed64hi_t hi = 0;
568	mad_fixed64lo_t lo;	570	mad_fixed64lo_t lo;
569		571
@@ -1010,4 +1012,3 @@ void mad_synth_frame(struct mad_synth *synth, struct mad_frame const *frame)
1010		1012
1011	synth->phase = (synth->phase + ns) % 16;	1013	synth->phase = (synth->phase + ns) % 16;
1012	}	1014	}
1013