opus: asm MULT16_32_Q15 for arm and cf

Speeds up decoding of a 64kbps opus test file by 34MHz on h300 (cf), 24MHz on c200 (pp) and 13MHz on fuzev1 (amsv1).

Change-Id: I0dce6b3bfe6c81d0a722dfebb13891b9a428c6ba
author: Nils Wallménius <nils@rockbox.org> 2012-09-24 23:14:58 +0200
committer: Nils Wallménius <nils@rockbox.org> 2012-09-25 11:40:59 +0200
commit: afc6b3f0215037821382c957d975dfc8f727b2a9 (patch)
tree: c6f5a46eecf88ca3ee2965ba425fafac156fd955
parent: 06fc6fdd0a6e0758043fd7aebb98f5098a8344e4 (diff)
download: rockbox-afc6b3f0215037821382c957d975dfc8f727b2a9.tar.gz
rockbox-afc6b3f0215037821382c957d975dfc8f727b2a9.zip
2 files changed, 33 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libopus/celt/fixed_generic.h b/lib/rbcodec/codecs/libopus/celt/fixed_generic.h
index 71e28d62a8..d2271e7972 100644
--- a/lib/rbcodec/codecs/libopus/celt/fixed_generic.h
+++ b/lib/rbcodec/codecs/libopus/celt/fixed_generic.h
@@ -42,8 +42,35 @@
 /** 16x32 multiplication, followed by a 16-bit shift right (round-to-nearest). Results fits in 32 bits */
 #define MULT16_32_P16(a,b) ADD32(MULT16_16((a),SHR((b),16)), PSHR(MULT16_16((a),((b)&0x0000ffff)),16))
+#if defined(CPU_COLDFIRE)
+static inline int32_t MULT16_32_Q15(int32_t a, int32_t b)
+{
+  asm volatile ("lsl.l #8, %[a];"
+                "lsl.l #8, %[a];"
+                "mac.l %[a], %[b], %%acc0;"
+                "movclr.l %%acc0, %[a];"
+                : [a] "+d" (a)
+                : [b] "d" (b)
+                : "cc");
+  return a;
+}
+#elif defined(CPU_ARM)
+static inline int32_t MULT16_32_Q15(int32_t a, int32_t b)
+{
+  int32_t lo, hi;
+  asm volatile("smull %[lo], %[hi], %[a], %[b] \n\t"
+               "mov %[lo], %[lo], lsr #15 \n\t"
+               "orr %[hi], %[lo], %[hi], lsl #17 \n\t"
+               : [lo] "=&r" (lo), [hi] "=&r" (hi)
+               : [a] "r" (a), [b] "r" (b) );
+  return(hi);
+}
+#else
 /** 16x32 multiplication, followed by a 15-bit shift right. Results fits in 32 bits */
 #define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),((b)&0x0000ffff)),15))
+#endif
 /** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */
 #define MULT32_32_Q31(a,b) ADD32(ADD32(SHL(MULT16_16(SHR((a),16),SHR((b),16)),1), SHR(MULT16_16SU(SHR((a),16),((b)&0x0000ffff)),15)), SHR(MULT16_16SU(SHR((b),16),((a)&0x0000ffff)),15))
diff --git a/lib/rbcodec/codecs/opus.c b/lib/rbcodec/codecs/opus.c
index 19bdb8daae..cf2d69738f 100644
--- a/lib/rbcodec/codecs/opus.c
+++ b/lib/rbcodec/codecs/opus.c
@@ -332,6 +332,12 @@ enum codec_status codec_run(void)
    }
    global_stack = 0;
+#if defined(CPU_COLDFIRE)
+    /* EMAC rounding is disabled because of MULT16_32_Q15, which will be
+       inaccurate with rounding in its current incarnation */
+    coldfire_set_macsr(EMAC_FRACTIONAL | EMAC_SATURATE);
+#endif
    /* pre-init the ogg_sync_state buffer, so it won't need many reallocs */
    ogg_sync_init(&oy);
    oy.storage = 64*1024;
author	Nils Wallménius <nils@rockbox.org>	2012-09-24 23:14:58 +0200
committer	Nils Wallménius <nils@rockbox.org>	2012-09-25 11:40:59 +0200
commit	afc6b3f0215037821382c957d975dfc8f727b2a9 (patch)
tree	c6f5a46eecf88ca3ee2965ba425fafac156fd955
parent	06fc6fdd0a6e0758043fd7aebb98f5098a8344e4 (diff)
download	rockbox-afc6b3f0215037821382c957d975dfc8f727b2a9.tar.gz rockbox-afc6b3f0215037821382c957d975dfc8f727b2a9.zip

diff --git a/lib/rbcodec/codecs/libopus/celt/fixed_generic.h b/lib/rbcodec/codecs/libopus/celt/fixed_generic.h
index 71e28d62a8..d2271e7972 100644
--- a/lib/rbcodec/codecs/libopus/celt/fixed_generic.h
+++ b/lib/rbcodec/codecs/libopus/celt/fixed_generic.h
@@ -42,8 +42,35 @@
42	/** 16x32 multiplication, followed by a 16-bit shift right (round-to-nearest). Results fits in 32 bits */	42	/** 16x32 multiplication, followed by a 16-bit shift right (round-to-nearest). Results fits in 32 bits */
43	#define MULT16_32_P16(a,b) ADD32(MULT16_16((a),SHR((b),16)), PSHR(MULT16_16((a),((b)&0x0000ffff)),16))	43	#define MULT16_32_P16(a,b) ADD32(MULT16_16((a),SHR((b),16)), PSHR(MULT16_16((a),((b)&0x0000ffff)),16))
44		44
		45	#if defined(CPU_COLDFIRE)
		46	static inline int32_t MULT16_32_Q15(int32_t a, int32_t b)
		47	{
		48	asm volatile ("lsl.l #8, %[a];"
		49	"lsl.l #8, %[a];"
		50	"mac.l %[a], %[b], %%acc0;"
		51	"movclr.l %%acc0, %[a];"
		52	: [a] "+d" (a)
		53	: [b] "d" (b)
		54	: "cc");
		55	return a;
		56	}
		57
		58	#elif defined(CPU_ARM)
		59	static inline int32_t MULT16_32_Q15(int32_t a, int32_t b)
		60	{
		61	int32_t lo, hi;
		62	asm volatile("smull %[lo], %[hi], %[a], %[b] \n\t"
		63	"mov %[lo], %[lo], lsr #15 \n\t"
		64	"orr %[hi], %[lo], %[hi], lsl #17 \n\t"
		65	: [lo] "=&r" (lo), [hi] "=&r" (hi)
		66	: [a] "r" (a), [b] "r" (b) );
		67	return(hi);
		68	}
		69
		70	#else
45	/** 16x32 multiplication, followed by a 15-bit shift right. Results fits in 32 bits */	71	/** 16x32 multiplication, followed by a 15-bit shift right. Results fits in 32 bits */
46	#define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),((b)&0x0000ffff)),15))	72	#define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),((b)&0x0000ffff)),15))
		73	#endif
47		74
48	/** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */	75	/** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */
49	#define MULT32_32_Q31(a,b) ADD32(ADD32(SHL(MULT16_16(SHR((a),16),SHR((b),16)),1), SHR(MULT16_16SU(SHR((a),16),((b)&0x0000ffff)),15)), SHR(MULT16_16SU(SHR((b),16),((a)&0x0000ffff)),15))	76	#define MULT32_32_Q31(a,b) ADD32(ADD32(SHL(MULT16_16(SHR((a),16),SHR((b),16)),1), SHR(MULT16_16SU(SHR((a),16),((b)&0x0000ffff)),15)), SHR(MULT16_16SU(SHR((b),16),((a)&0x0000ffff)),15))


diff --git a/lib/rbcodec/codecs/opus.c b/lib/rbcodec/codecs/opus.c
index 19bdb8daae..cf2d69738f 100644
--- a/lib/rbcodec/codecs/opus.c
+++ b/lib/rbcodec/codecs/opus.c
@@ -332,6 +332,12 @@ enum codec_status codec_run(void)
332	}	332	}
333	global_stack = 0;	333	global_stack = 0;
334		334
		335	#if defined(CPU_COLDFIRE)
		336	/* EMAC rounding is disabled because of MULT16_32_Q15, which will be
		337	inaccurate with rounding in its current incarnation */
		338	coldfire_set_macsr(EMAC_FRACTIONAL | EMAC_SATURATE);
		339	#endif
		340
335	/* pre-init the ogg_sync_state buffer, so it won't need many reallocs */	341	/* pre-init the ogg_sync_state buffer, so it won't need many reallocs */
336	ogg_sync_init(&oy);	342	ogg_sync_init(&oy);
337	oy.storage = 64*1024;	343	oy.storage = 64*1024;