Assembler optimised LPC routines for Coldfire. Will enable them when codec has seen further testing.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@7657 a1c6a512-1295-4272-9138-f99709370657
author: Thom Johansen <thomj@rockbox.org> 2005-10-27 00:33:38 +0000
committer: Thom Johansen <thomj@rockbox.org> 2005-10-27 00:33:38 +0000
commit: 0b38c7dcbe283ba7d13531831a5367afae668e69 (patch)
tree: dd5428f415fb6db9c860d6867c88b5059ba4f25a
parent: 273d2e81f72c7721447ab9c539877f6712faaecc (diff)
download: rockbox-0b38c7dcbe283ba7d13531831a5367afae668e69.tar.gz
rockbox-0b38c7dcbe283ba7d13531831a5367afae668e69.zip
2 files changed, 245 insertions, 0 deletions
diff --git a/apps/codecs/libffmpegFLAC/coldfire.S b/apps/codecs/libffmpegFLAC/coldfire.S
new file mode 100644
index 0000000000..7e19e4b695
--- /dev/null
+++ b/apps/codecs/libffmpegFLAC/coldfire.S
@@ -0,0 +1,237 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2005 by Thom Johansen 
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+ 
+/* The following is an assembler optimised version of the LPC filtering
+   routines needed for FLAC decoding. It is optimised for use with the
+   MCF5249 processor, or any other similar ColdFire core with the EMAC unit.
+   All LPC filtering up to order 8 is done in specially optimised unrolled
+   loops, while every order above this is handled by a slower default routine.
+ */
+    .text
+    .global lpc_decode_emac
+    .align 2
+/* void lpc_decode_emac(int blocksize, int qlevel, int pred_order,
+                        int32_t* data, int* coeffs)
+
+   In-place LPC synthesis. The five arguments are read from the stack in the
+   order above. For each of the 'blocksize' longwords starting at 'data' the
+   routine computes
+       sum = coeffs[0]*data[i-1] + ... + coeffs[pred_order-1]*data[i-pred_order]
+   and adds (sum >> qlevel), arithmetic shift, to data[i], which holds the
+   residual on entry. 'data' must therefore be preceded by 'pred_order'
+   already decoded samples.
+
+   Orders 1..8 dispatch through .jumptable to dedicated unrolled loops,
+   orders above 8 go to .default. Order 0, a negative order or a non-positive
+   blocksize return without touching the data.
+
+   d2-d7/a2-a5 are saved here and restored at .exit; d0-d1/a0-a1 are used as
+   scratch. MACSR is written with zero and is not restored.
+   NOTE(review): acc0 is never cleared before the first mac.l, so the first
+   output is only correct if acc0 is zero on entry - confirm with callers.
+ */
+lpc_decode_emac:
+    /* open a 40 byte frame for the ten saved registers, then fetch the
+       arguments, which now sit 40 bytes (frame) + 4 bytes (return address)
+       above sp
+     */
+    lea.l (-40, %sp), %sp
+    movem.l %d2-%d7/%a2-%a5, (%sp)
+    movem.l (40+4, %sp), %d0-%d2/%a0-%a1
+    /* d0 = blocksize, d1 = qlevel, d2 = pred_order
+       a0 = data, a1 = coeffs
+     */
+
+    /* the data pointer always lags behind history pointer by 'pred_order'
+       samples. since we have one loop for each order, we can hard code this
+       and free a register by not saving data pointer.
+     */
+    move.l %d2, %d3
+    neg.l %d3
+    lea.l (%a0, %d3.l*4), %a0 | history = data - pred_order samples
+    clr.l %d3
+    move.l %d3, %macsr        | all MACSR bits clear: integer mode
+    tst.l %d0
+    jle .exit                 | zero or negative sample count, nothing to do
+    tst.l %d2
+    jlt .exit                 | negative order would index before .jumptable
+    moveq.l #8, %d3
+    cmp.l %d3, %d2
+    jgt .default              | order is over 8, jump to default case
+    lea.l .jumptable, %a4
+    move.l (%a4, %d2.l*4), %a4 | a4 = handler address for this order (0..8)
+    jmp (%a4)
+    .align 4                  | avoid unaligned fetch
+.jumptable:
+    .long .exit               | order 0: no prediction, data left as is
+    .long .order1
+    .long .order2
+    .long .order3
+    .long .order4
+    .long .order5
+    .long .order6
+    .long .order7
+    .long .order8
+/* Order 8 predictor.
+   Notation: h[k] is the k-th longword of the current 8 sample history window
+   (h[0] is the oldest), h[8] is the slot being decoded, cN is coeffs[N].
+   movem loads c0..c7 into d3-d7/a2-a4 in that order, so d3 = c0 and a4 = c7.
+   Each mac.l multiplies the sample currently in a5 by its coefficient and,
+   in the same instruction, fetches the next sample into a5.
+   In: a0 = history, a1 = coeffs, d0 = samples left (non-zero), d1 = qlevel.
+ */
+.order8:
+    movem.l (%a1), %d3-%d7/%a2-%a4 | load lpc coefs
+    move.l (%a0)+, %a5             | load first history sample
+.loop8:
+    mac.l %a5, %a4, (%a0)+, %a5, %acc0 | acc0 += h[0]*c7, a5 = h[1]
+    mac.l %a5, %a3, (%a0)+, %a5, %acc0 | acc0 += h[1]*c6, a5 = h[2]
+    mac.l %a5, %a2, (%a0)+, %a5, %acc0 | acc0 += h[2]*c5, a5 = h[3]
+    mac.l %a5, %d7, (%a0)+, %a5, %acc0 | acc0 += h[3]*c4, a5 = h[4]
+    mac.l %a5, %d6, (%a0)+, %a5, %acc0 | acc0 += h[4]*c3, a5 = h[5]
+    mac.l %a5, %d5, (%a0)+, %a5, %acc0 | acc0 += h[5]*c2, a5 = h[6]
+    mac.l %a5, %d4, (%a0)+, %a5, %acc0 | acc0 += h[6]*c1, a5 = h[7], a0 -> h[8]
+    mac.l %a5, %d3, (-7*4, %a0), %a5, %acc0 | load for the next iteration
+    movclr.l %acc0, %d2    | get sum
+    asr.l %d1, %d2         | shift sum by lp_quantization bits
+    add.l %d2, (%a0)       | add residual and save
+    lea.l (-6*4, %a0), %a0 | history pointer points at second element
+    subq.l #1, %d0         | decrement counter
+    jne .loop8             | are we done?
+    jra .exit
+/* Order 7 predictor, same scheme as the order 8 loop with one tap less.
+   movem puts coeffs[0]..coeffs[6] into d3-d7/a2-a3 (d3 = coeffs[0]); the
+   oldest history sample is multiplied by the highest coefficient first.
+   In: a0 = history, a1 = coeffs, d0 = samples left (non-zero), d1 = qlevel.
+ */
+.order7:
+    movem.l (%a1), %d3-%d7/%a2-%a3 | load the seven lpc coefs
+    move.l (%a0)+, %a5             | a5 = oldest history sample
+.loop7:
+    mac.l %a5, %a3, (%a0)+, %a5, %acc0
+    mac.l %a5, %a2, (%a0)+, %a5, %acc0
+    mac.l %a5, %d7, (%a0)+, %a5, %acc0
+    mac.l %a5, %d6, (%a0)+, %a5, %acc0
+    mac.l %a5, %d5, (%a0)+, %a5, %acc0
+    mac.l %a5, %d4, (%a0)+, %a5, %acc0
+    mac.l %a5, %d3, (-6*4, %a0), %a5, %acc0 | a5 = first sample of next window
+    movclr.l %acc0, %d2    | d2 = sum, acc0 cleared for the next sample
+    asr.l %d1, %d2         | scale the prediction by qlevel
+    add.l %d2, (%a0)       | a0 is at the slot being decoded: residual += pred
+    lea.l (-5*4, %a0), %a0 | back to the second sample of the next window
+    subq.l #1, %d0
+    jne .loop7
+    jra .exit
+/* Order 6 predictor, same scheme as the higher order loops.
+   movem puts coeffs[0]..coeffs[5] into d3-d7/a2 (d3 = coeffs[0]); the oldest
+   history sample is multiplied by the highest coefficient first.
+   In: a0 = history, a1 = coeffs, d0 = samples left (non-zero), d1 = qlevel.
+ */
+.order6:
+    movem.l (%a1), %d3-%d7/%a2 | load the six lpc coefs
+    move.l (%a0)+, %a5         | a5 = oldest history sample
+.loop6:
+    mac.l %a5, %a2, (%a0)+, %a5, %acc0
+    mac.l %a5, %d7, (%a0)+, %a5, %acc0
+    mac.l %a5, %d6, (%a0)+, %a5, %acc0
+    mac.l %a5, %d5, (%a0)+, %a5, %acc0
+    mac.l %a5, %d4, (%a0)+, %a5, %acc0
+    mac.l %a5, %d3, (-5*4, %a0), %a5, %acc0 | a5 = first sample of next window
+    movclr.l %acc0, %d2    | d2 = sum, acc0 cleared for the next sample
+    asr.l %d1, %d2         | scale the prediction by qlevel
+    add.l %d2, (%a0)       | a0 is at the slot being decoded: residual += pred
+    lea.l (-4*4, %a0), %a0 | back to the second sample of the next window
+    subq.l #1, %d0
+    jne .loop6
+    jra .exit
+/* Order 5 predictor, same scheme as the higher order loops.
+   movem puts coeffs[0]..coeffs[4] into d3-d7 (d3 = coeffs[0]); the oldest
+   history sample is multiplied by the highest coefficient first.
+   In: a0 = history, a1 = coeffs, d0 = samples left (non-zero), d1 = qlevel.
+ */
+.order5:
+    movem.l (%a1), %d3-%d7 | load the five lpc coefs
+    move.l (%a0)+, %a5     | a5 = oldest history sample
+.loop5:
+    mac.l %a5, %d7, (%a0)+, %a5, %acc0
+    mac.l %a5, %d6, (%a0)+, %a5, %acc0
+    mac.l %a5, %d5, (%a0)+, %a5, %acc0
+    mac.l %a5, %d4, (%a0)+, %a5, %acc0
+    mac.l %a5, %d3, (-4*4, %a0), %a5, %acc0 | a5 = first sample of next window
+    movclr.l %acc0, %d2    | d2 = sum, acc0 cleared for the next sample
+    asr.l %d1, %d2         | scale the prediction by qlevel
+    add.l %d2, (%a0)       | a0 is at the slot being decoded: residual += pred
+    lea.l (-3*4, %a0), %a0 | back to the second sample of the next window
+    subq.l #1, %d0
+    jne .loop5
+    jra .exit
+/* Order 4 predictor, same scheme as the higher order loops.
+   movem puts coeffs[0]..coeffs[3] into d3-d6 (d3 = coeffs[0]); the oldest
+   history sample is multiplied by the highest coefficient first.
+   In: a0 = history, a1 = coeffs, d0 = samples left (non-zero), d1 = qlevel.
+ */
+.order4:
+    movem.l (%a1), %d3-%d6 | load the four lpc coefs
+    move.l (%a0)+, %a5     | a5 = oldest history sample
+.loop4:
+    mac.l %a5, %d6, (%a0)+, %a5, %acc0
+    mac.l %a5, %d5, (%a0)+, %a5, %acc0
+    mac.l %a5, %d4, (%a0)+, %a5, %acc0
+    mac.l %a5, %d3, (-3*4, %a0), %a5, %acc0 | a5 = first sample of next window
+    movclr.l %acc0, %d2    | d2 = sum, acc0 cleared for the next sample
+    asr.l %d1, %d2         | scale the prediction by qlevel
+    add.l %d2, (%a0)       | a0 is at the slot being decoded: residual += pred
+    subq.l #8, %a0         | two longwords back: second sample of next window
+    subq.l #1, %d0
+    jne .loop4
+    jra .exit
+/* Order 3 predictor, same scheme as the higher order loops.
+   movem puts coeffs[0]..coeffs[2] into d3-d5 (d3 = coeffs[0]); the oldest
+   history sample is multiplied by the highest coefficient first.
+   In: a0 = history, a1 = coeffs, d0 = samples left (non-zero), d1 = qlevel.
+ */
+.order3:
+    movem.l (%a1), %d3-%d5 | load the three lpc coefs
+    move.l (%a0)+, %a5     | a5 = oldest history sample
+.loop3:
+    mac.l %a5, %d5, (%a0)+, %a5, %acc0
+    mac.l %a5, %d4, (%a0)+, %a5, %acc0
+    mac.l %a5, %d3, (-2*4, %a0), %a5, %acc0 | a5 = first sample of next window
+    movclr.l %acc0, %d2    | d2 = sum, acc0 cleared for the next sample
+    asr.l %d1, %d2         | scale the prediction by qlevel
+    add.l %d2, (%a0)       | a0 is at the slot being decoded: residual += pred
+    subq.l #4, %a0         | one longword back: second sample of next window
+    subq.l #1, %d0
+    jne .loop3
+    jra .exit
+/* Order 2 predictor. d3 = coeffs[0], d4 = coeffs[1].
+   No pointer rewind is needed here: after the two macs a5 already holds the
+   newer history sample, which is the older one of the next window, and a0
+   already points at the slot being decoded, which is the newer one of the
+   next window.
+   In: a0 = history, a1 = coeffs, d0 = samples left (non-zero), d1 = qlevel.
+ */
+.order2:
+    movem.l (%a1), %d3-%d4 | load the two lpc coefs
+    move.l (%a0)+, %a5     | a5 = oldest history sample
+.loop2:
+    mac.l %a5, %d4, (%a0)+, %a5, %acc0 | older sample * coeffs[1], fetch newer
+    mac.l %a5, %d3, %acc0 | data for next iteration is already loaded
+    movclr.l %acc0, %d2   | d2 = sum, acc0 cleared for the next sample
+    asr.l %d1, %d2        | scale the prediction by qlevel
+    add.l %d2, (%a0)      | a0 is at the slot being decoded: residual += pred
+    subq.l #1, %d0
+    jne .loop2
+    jra .exit
+/* Order 1 predictor: each sample is predicted from its predecessor alone,
+   using a plain 32 bit multiply instead of the accumulator.
+   The post-increment on the multiply operand leaves a0 at the slot being
+   decoded, which is also the history sample of the next iteration.
+   In: a0 = history, a1 = coeffs, d0 = samples left (non-zero), d1 = qlevel.
+ */
+.order1:
+    | no point in using mac here
+    move.l (%a1), %d3      | d3 = coeffs[0], kept for the whole loop
+.loop1:
+    move.l %d3, %d2        | muls overwrites its destination, so work on a copy
+    muls.l (%a0)+, %d2     | d2 = coeffs[0] * previous sample
+    asr.l %d1, %d2         | scale the prediction by qlevel
+    add.l %d2, (%a0)       | residual += prediction
+    subq.l #1, %d0
+    jne .loop1
+    jra .exit
+    
+/* Generic predictor for pred_order > 8.
+   .default is both the entry point and the head of the per-sample loop, so
+   the coefficient pointer (a2), the working history pointer (a3) and the
+   counters in d3 are set up again for every output sample.
+   Coefficients are consumed from the end of the array backwards while the
+   history is walked forwards, so the oldest sample meets the highest
+   coefficient, as in the unrolled loops.
+   In: a0 = history, a1 = coeffs, d0 = samples left (non-zero), d1 = qlevel,
+       d2 = pred_order (not modified here).
+ */
+.default:
+    /* we do the filtering in an unrolled by 4 loop as far as we can, and then
+       do the rest in an ordinary one by one sample loop.
+     */
+    lea.l (%a1, %d2.l*4), %a2 | need to start in the other end of coefs
+    move.l %a0, %a3           | working copy of history pointer
+    move.l %d2, %d3
+    lsr.l #2, %d3             | coefs/4, num of iterations needed in next loop
+    move.l (%a3)+, %a5        | preload data for loop
+    /* the loop below tests its counter at the bottom only; d3 is at least 2
+       here because this path is only taken with pred_order > 8
+     */
+.dloop1:
+    lea.l (-4*4, %a2), %a2    | move lpc coef pointer four samples backwards
+    movem.l (%a2), %d4-%d7    | load four coefs
+    mac.l %a5, %d7, (%a3)+, %a5, %acc0 | highest coef of the group goes first
+    mac.l %a5, %d6, (%a3)+, %a5, %acc0
+    mac.l %a5, %d5, (%a3)+, %a5, %acc0
+    mac.l %a5, %d4, (%a3)+, %a5, %acc0
+    subq.l #1, %d3            | any more unrolled loop operations left?
+    jne .dloop1
+    
+    move.l %d2, %d3
+    moveq.l #3, %d4           | mask 0x00000003
+    and.l %d4, %d3            | get the remaining samples to be filtered
+    jeq .dsave                | no remaining samples
+.dloop2:
+    move.l -(%a2), %d4        | get lpc coef
+    mac.l %a5, %d4, (%a3)+, %a5, %acc0
+    subq.l #1, %d3            | any more iterations left?
+    jne .dloop2
+.dsave:
+    /* a3 has been post-incremented pred_order + 1 times (the preload plus one
+       fetch per mac), so it sits one longword past the slot being decoded;
+       the value last fetched into a5 is not used
+     */
+    movclr.l %acc0, %d3       | get result
+    asr.l %d1, %d3            | shift lp_quantization bits right
+    subq.l #4, %a3            | we're one past the save location
+    add.l %d3, (%a3)          | add residual and save
+    addq.l #4, %a0            | increment history pointer
+    subq.l #1, %d0            | decrement data_len
+    jne .default              | are we done?
+                              | if so, fall through to exit
+/* Common return path for every order: reload the ten registers stored by
+   the prologue, release the 40 byte frame and return. Nothing is returned
+   in a register; the result is the modified data buffer.
+ */
+.exit:
+    movem.l (%sp), %d2-%d7/%a2-%a5 | restore the registers saved on entry
+    lea.l (40, %sp), %sp           | drop the save area
+    rts
diff --git a/apps/codecs/libffmpegFLAC/coldfire.h b/apps/codecs/libffmpegFLAC/coldfire.h
new file mode 100644
index 0000000000..5493f549f7
--- /dev/null
+++ b/apps/codecs/libffmpegFLAC/coldfire.h
@@ -0,0 +1,8 @@
+/* Declaration of the ColdFire EMAC assembler LPC routine in coldfire.S. */
+#ifndef _FLAC_COLDFIRE_H
+#define _FLAC_COLDFIRE_H
+/* presumably pulled in for the int32_t type used below - TODO confirm */
+#include "bitstream.h"
+/* In-place LPC synthesis filter, implemented in coldfire.S.
+ *
+ * blocksize   number of samples to decode, starting at data[0]
+ * qlevel      right shift applied to each prediction sum
+ * pred_order  number of predictor taps
+ * data        first sample to decode; each slot holds its residual on entry
+ *             and the decoded sample on return. The routine reads the
+ *             pred_order samples stored immediately before this pointer.
+ * coeffs      pred_order predictor coefficients; coeffs[0] is applied to the
+ *             most recent sample
+ */
+void lpc_decode_emac(int blocksize, int qlevel, int pred_order, int32_t* data, int* coeffs);
+#endif
author	Thom Johansen <thomj@rockbox.org>	2005-10-27 00:33:38 +0000
committer	Thom Johansen <thomj@rockbox.org>	2005-10-27 00:33:38 +0000
commit	0b38c7dcbe283ba7d13531831a5367afae668e69 (patch)
tree	dd5428f415fb6db9c860d6867c88b5059ba4f25a
parent	273d2e81f72c7721447ab9c539877f6712faaecc (diff)
download	rockbox-0b38c7dcbe283ba7d13531831a5367afae668e69.tar.gz rockbox-0b38c7dcbe283ba7d13531831a5367afae668e69.zip

diff --git a/apps/codecs/libffmpegFLAC/coldfire.S b/apps/codecs/libffmpegFLAC/coldfire.S new file mode 100644 index 0000000000..7e19e4b695 --- /dev/null +++ b/apps/codecs/libffmpegFLAC/coldfire.S
@@ -0,0 +1,237 @@
	1	/***************************************************************************
	2	* __________ __ ___.
	3	* Open \______ \ ____ ____ \| \| _\_ \|__ _______ ___
	4	* Source \| _// _ \_/ ___\\| \|/ /\| __ \ / _ \ \/ /
	5	* Jukebox \| \| ( <_> ) \___\| < \| \_\ ( <_> > < <
	6	* Firmware \|____\|_ /\____/ \___ >__\|_ \\|___ /\____/__/\_ \
	7	* \/ \/ \/ \/ \/
	8	* $Id$
	9	*
	10	* Copyright (C) 2005 by Thom Johansen
	11	*
	12	* All files in this archive are subject to the GNU General Public License.
	13	* See the file COPYING in the source tree root for full license agreement.
	14	*
	15	* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
	16	* KIND, either express or implied.
	17	*
	18	****************************************************************************/
	19
	20	/* The following is an assembler optimised version of the LPC filtering
	21	routines needed for FLAC decoding. It is optimised for use with the
	22	MCF5249 processor, or any other similar ColdFire core with the EMAC unit.
	23	All LPC filtering up to order 8 is done in specially optimised unrolled
	24	loops, while every order above this is handled by a slower default routine.
	25	*/
	26	.text
	27	.global lpc_decode_emac
	28	.align 2
	29	lpc_decode_emac:
	30	lea.l (-40, %sp), %sp
	31	movem.l %d2-%d7/%a2-%a5, (%sp)
	32	movem.l (40+4, %sp), %d0-%d2/%a0-%a1
	33	/* d0 = blocksize, d1 = qlevel, d2 = pred_order
	34	a0 = data, a1 = coeffs
	35	*/
	36
	37	/* the data pointer always lags behind history pointer by 'pred_order'
	38	samples. since we have one loop for each order, we can hard code this
	39	and free a register by not saving data pointer.
	40	*/
	41	move.l %d2, %d3
	42	neg.l %d3
	43	lea.l (%a0, %d3.l*4), %a0 \| history
	44	clr.l %d3
	45	move.l %d3, %macsr \| we'll need integer mode for this
	46	tst.l %d0
	47	jeq .exit \| zero samples to process, exit
	48	moveq.l #8, %d3
	49	cmp.l %d3, %d2
	50	jgt .default \| order is over 8, jump to default case
	51	lea.l .jumptable, %a4
	52	move.l (%a4, %d2.l*4), %a4
	53	jmp (%a4)
	54	.align 4 \| avoid unaligned fetch
	55	.jumptable:
	56	.long .exit
	57	.long .order1
	58	.long .order2
	59	.long .order3
	60	.long .order4
	61	.long .order5
	62	.long .order6
	63	.long .order7
	64	.long .order8
	65
	66	.order8:
	67	movem.l (%a1), %d3-%d7/%a2-%a4 \| load lpc coefs
	68	move.l (%a0)+, %a5 \| load first history sample
	69	.loop8:
	70	mac.l %a5, %a4, (%a0)+, %a5, %acc0
	71	mac.l %a5, %a3, (%a0)+, %a5, %acc0
	72	mac.l %a5, %a2, (%a0)+, %a5, %acc0
	73	mac.l %a5, %d7, (%a0)+, %a5, %acc0
	74	mac.l %a5, %d6, (%a0)+, %a5, %acc0
	75	mac.l %a5, %d5, (%a0)+, %a5, %acc0
	76	mac.l %a5, %d4, (%a0)+, %a5, %acc0
	77	mac.l %a5, %d3, (-7*4, %a0), %a5, %acc0 \| load for the next iteration
	78	movclr.l %acc0, %d2 \| get sum
	79	asr.l %d1, %d2 \| shift sum by lp_quantization bits
	80	add.l %d2, (%a0) \| add residual and save
	81	lea.l (-6*4, %a0), %a0 \| history pointer points at second element
	82	subq.l #1, %d0 \| decrement counter
	83	jne .loop8 \| are we done?
	84	jra .exit
	85
	86	.order7:
	87	movem.l (%a1), %d3-%d7/%a2-%a3
	88	move.l (%a0)+, %a5
	89	.loop7:
	90	mac.l %a5, %a3, (%a0)+, %a5, %acc0
	91	mac.l %a5, %a2, (%a0)+, %a5, %acc0
	92	mac.l %a5, %d7, (%a0)+, %a5, %acc0
	93	mac.l %a5, %d6, (%a0)+, %a5, %acc0
	94	mac.l %a5, %d5, (%a0)+, %a5, %acc0
	95	mac.l %a5, %d4, (%a0)+, %a5, %acc0
	96	mac.l %a5, %d3, (-6*4, %a0), %a5, %acc0
	97	movclr.l %acc0, %d2
	98	asr.l %d1, %d2
	99	add.l %d2, (%a0)
	100	lea.l (-5*4, %a0), %a0
	101	subq.l #1, %d0
	102	jne .loop7
	103	jra .exit
	104
	105	.order6:
	106	movem.l (%a1), %d3-%d7/%a2
	107	move.l (%a0)+, %a5
	108	.loop6:
	109	mac.l %a5, %a2, (%a0)+, %a5, %acc0
	110	mac.l %a5, %d7, (%a0)+, %a5, %acc0
	111	mac.l %a5, %d6, (%a0)+, %a5, %acc0
	112	mac.l %a5, %d5, (%a0)+, %a5, %acc0
	113	mac.l %a5, %d4, (%a0)+, %a5, %acc0
	114	mac.l %a5, %d3, (-5*4, %a0), %a5, %acc0
	115	movclr.l %acc0, %d2
	116	asr.l %d1, %d2
	117	add.l %d2, (%a0)
	118	lea.l (-4*4, %a0), %a0
	119	subq.l #1, %d0
	120	jne .loop6
	121	jra .exit
	122
	123	.order5:
	124	movem.l (%a1), %d3-%d7
	125	move.l (%a0)+, %a5
	126	.loop5:
	127	mac.l %a5, %d7, (%a0)+, %a5, %acc0
	128	mac.l %a5, %d6, (%a0)+, %a5, %acc0
	129	mac.l %a5, %d5, (%a0)+, %a5, %acc0
	130	mac.l %a5, %d4, (%a0)+, %a5, %acc0
	131	mac.l %a5, %d3, (-4*4, %a0), %a5, %acc0
	132	movclr.l %acc0, %d2
	133	asr.l %d1, %d2
	134	add.l %d2, (%a0)
	135	lea.l (-3*4, %a0), %a0
	136	subq.l #1, %d0
	137	jne .loop5
	138	jra .exit
	139
	140	.order4:
	141	movem.l (%a1), %d3-%d6
	142	move.l (%a0)+, %a5
	143	.loop4:
	144	mac.l %a5, %d6, (%a0)+, %a5, %acc0
	145	mac.l %a5, %d5, (%a0)+, %a5, %acc0
	146	mac.l %a5, %d4, (%a0)+, %a5, %acc0
	147	mac.l %a5, %d3, (-3*4, %a0), %a5, %acc0
	148	movclr.l %acc0, %d2
	149	asr.l %d1, %d2
	150	add.l %d2, (%a0)
	151	subq.l #8, %a0
	152	subq.l #1, %d0
	153	jne .loop4
	154	jra .exit
	155
	156	.order3:
	157	movem.l (%a1), %d3-%d5
	158	move.l (%a0)+, %a5
	159	.loop3:
	160	mac.l %a5, %d5, (%a0)+, %a5, %acc0
	161	mac.l %a5, %d4, (%a0)+, %a5, %acc0
	162	mac.l %a5, %d3, (-2*4, %a0), %a5, %acc0
	163	movclr.l %acc0, %d2
	164	asr.l %d1, %d2
	165	add.l %d2, (%a0)
	166	subq.l #4, %a0
	167	subq.l #1, %d0
	168	jne .loop3
	169	jra .exit
	170
	171	.order2:
	172	movem.l (%a1), %d3-%d4
	173	move.l (%a0)+, %a5
	174	.loop2:
	175	mac.l %a5, %d4, (%a0)+, %a5, %acc0
	176	mac.l %a5, %d3, %acc0 \| data for next iteration is already loaded
	177	movclr.l %acc0, %d2
	178	asr.l %d1, %d2
	179	add.l %d2, (%a0)
	180	subq.l #1, %d0
	181	jne .loop2
	182	jra .exit
	183
	184	.order1:
	185	\| no point in using mac here
	186	move.l (%a1), %d3
	187	.loop1:
	188	move.l %d3, %d2
	189	muls.l (%a0)+, %d2
	190	asr.l %d1, %d2
	191	add.l %d2, (%a0)
	192	subq.l #1, %d0
	193	jne .loop1
	194	jra .exit
	195
	196	.default:
	197	/* we do the filtering in an unrolled by 4 loop as far as we can, and then
	198	do the rest in an ordinary one by one sample loop.
	199	*/
	200	lea.l (%a1, %d2.l*4), %a2 \| need to start in the other end of coefs
	201	move.l %a0, %a3 \| working copy of history pointer
	202	move.l %d2, %d3
	203	lsr.l #2, %d3 \| coefs/4, num of iterations needed in next loop
	204	move.l (%a3)+, %a5 \| preload data for loop
	205	.dloop1:
	206	lea.l (-4*4, %a2), %a2 \| move lpc coef pointer four samples backwards
	207	movem.l (%a2), %d4-%d7 \| load four coefs
	208	mac.l %a5, %d7, (%a3)+, %a5, %acc0
	209	mac.l %a5, %d6, (%a3)+, %a5, %acc0
	210	mac.l %a5, %d5, (%a3)+, %a5, %acc0
	211	mac.l %a5, %d4, (%a3)+, %a5, %acc0
	212	subq.l #1, %d3 \| any more unrolled loop operations left?
	213	jne .dloop1
	214
	215	move.l %d2, %d3
	216	moveq.l #3, %d4 \| mask 0x00000003
	217	and.l %d4, %d3 \| get the remaining samples to be filtered
	218	jeq .dsave \| no remaining samples
	219	.dloop2:
	220	move.l -(%a2), %d4 \| get lpc coef
	221	mac.l %a5, %d4, (%a3)+, %a5, %acc0
	222	subq.l #1, %d3 \| any more iterations left?
	223	jne .dloop2
	224	.dsave:
	225	movclr.l %acc0, %d3 \| get result
	226	asr.l %d1, %d3 \| shift lp_quantization bits right
	227	subq.l #4, %a3 \| we're one past the save location
	228	add.l %d3, (%a3) \| add residual and save
	229	addq.l #4, %a0 \| increment history pointer
	230	subq.l #1, %d0 \| decrement data_len
	231	jne .default \| are we done?
	232	\| if so, fall through to exit
	233
	234	.exit:
	235	movem.l (%sp), %d2-%d7/%a2-%a5
	236	lea.l (40, %sp), %sp
	237	rts


diff --git a/apps/codecs/libffmpegFLAC/coldfire.h b/apps/codecs/libffmpegFLAC/coldfire.h new file mode 100644 index 0000000000..5493f549f7 --- /dev/null +++ b/apps/codecs/libffmpegFLAC/coldfire.h
@@ -0,0 +1,8 @@
	1	#ifndef _FLAC_COLDFIRE_H
	2	#define _FLAC_COLDFIRE_H
	3
	4	#include "bitstream.h"
	5
	6	void lpc_decode_emac(int blocksize, int qlevel, int pred_order, int32_t* data, int* coeffs);
	7
	8	#endif