1 files changed, 271 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libffmpegFLAC/arm.S b/lib/rbcodec/codecs/libffmpegFLAC/arm.S
new file mode 100644
index 0000000000..8adca77ce5
--- /dev/null
+++ b/lib/rbcodec/codecs/libffmpegFLAC/arm.S
@@ -0,0 +1,271 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006 by Thom Johansen 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+#include "config.h"
+/* The following is an assembler optimised version of the LPC filtering
+   routines needed for FLAC decoding. It is optimised for use with ARM
+   processors.
+   LPC filtering for orders 1 to 9 is done in specially optimised, fully
+   unrolled loops, while every order above 9 is handled by a slower default
+   routine (order 0 needs no filtering and returns at once). */
+#ifdef USE_IRAM
+    .section .icode,"ax",%progbits
+#else
+    .text
+#endif
+    .global lpc_decode_arm
+/* lpc_decode_arm - in-place LPC synthesis filter for FLAC decoding.
+ *
+ * For i = 0 .. blocksize-1, with data[i] holding the residual on entry:
+ *   sum = coeffs[0]*data[i-1] + ... + coeffs[n-1]*data[i-n],  n = pred_order
+ *   data[i] = data[i] + (sum >> qlevel)
+ * The pred_order words in front of data[0] are read as history. The sum is
+ * accumulated in 32 bits and the shift is arithmetic.
+ *
+ * In:  r0 = blocksize (number of samples to produce), r1 = qlevel,
+ *      r2 = pred_order, r3 = data, [sp] = coeffs (fifth argument).
+ *      Presumably called from C as
+ *      lpc_decode_arm(blocksize, qlevel, pred_order, data, coeffs);
+ *      TODO confirm against the prototype, which is not part of this file.
+ * Out: results are stored to data[]; no return value is prepared.
+ * Clobbers: r0, r2, r3, r12 and the flags. r4-r11 and lr are pushed on
+ *      entry and handed to the ldmpc macro at .exit.
+ *
+ * Returns at once, leaving data[] untouched, when blocksize <= 0 or
+ * pred_order == 0.
+ */
+lpc_decode_arm:
+    stmdb sp!, { r4-r11, lr } @ 9 words pushed: stack arguments now start at sp+36
+    ldr r4, [sp, #36]         @ r4 = coeffs (fifth argument)
+    /* r0 = blocksize, r1 = qlevel, r2 = pred_order
+       r3 = data, r4 = coeffs
+     */
+
+    /* the history pointer always sits 'pred_order' samples below the data
+       (residual) pointer. since we have one loop for each order, we can hard
+       code this distance and free a register by keeping only the history
+       pointer.
+     */
+    sub r3, r3, r2, lsl #2    @ r3 = history = data - pred_order words
+    cmp r0, #0                @ no samples to process; negative counts exit too,
+    ble .exit                 @ as the loops only stop when r0 reaches exactly 0
+    cmp r2, #9                @ check if order is too high for unrolled loops
+    addls pc, pc, r2, lsl #2  @ jump to our unrolled decode loop if it exists
+    /* pc reads as the address of the addls plus 8, i.e. the 'b .exit' entry,
+       so order N selects the entry N words after it. when the order is above
+       9 (unsigned compare) the addls is skipped and 'b .default' runs. */
+@ jumptable:
+    b .default                @ order too high, go to default routine
+    b .exit                   @ zero order filter isn't possible, exit function
+    b .order1
+    b .order2
+    b .order3
+    b .order4
+    b .order5
+    b .order6
+    b .order7
+    b .order8
+@ last jump table entry coincides with target, so leave it out
+.order9:
+    ldmia r4, { r5-r12, r14 } @ fetch coefs: coeffs[0..7] in r5-r12, coeffs[8] in r14
+.loop9:
+    ldr r4, [r3], #4          @ load first (oldest) history sample
+    mul r2, r4, r14           @ multiply with last coef
+    ldr r4, [r3], #4          @ rinse and repeat while accumulating sum in r2
+    mla r2, r4, r12, r2
+    ldr r4, [r3], #4
+    mla r2, r4, r11, r2
+    ldr r4, [r3], #4
+    mla r2, r4, r10, r2
+    ldr r4, [r3], #4
+    mla r2, r4, r9, r2
+    ldr r4, [r3], #4
+    mla r2, r4, r8, r2
+    ldr r4, [r3], #4
+    mla r2, r4, r7, r2
+    ldr r4, [r3], #4
+    mla r2, r4, r6, r2
+    ldr r4, [r3], #4          @ newest history sample, r3 now points to residual
+    mla r2, r4, r5, r2        @ ... which pairs with coeffs[0]
+    ldr r4, [r3]              @ r4 = residual
+    add r2, r4, r2, asr r1    @ shift sum by qlevel bits and add residual
+    str r2, [r3], #-8*4       @ save result, then step back 8 words so r3 is
+                              @ one sample past the previous history start
+    subs r0, r0, #1           @ check if we're done
+    bne .loop9                @ nope, jump back
+    b .exit
+
+.order8:
+    ldmia r4, { r5-r12 }      @ coeffs[0..7] in r5-r12
+.loop8:
+    @ we have more registers to spare here, so start block reading
+    ldmia r3!, { r4, r14 }    @ two history samples per load, oldest first
+    mul r2, r4, r12           @ oldest sample pairs with the last coef
+    mla r2, r14, r11, r2
+    ldmia r3!, { r4, r14 }
+    mla r2, r4, r10, r2
+    mla r2, r14, r9, r2
+    ldmia r3!, { r4, r14 }
+    mla r2, r4, r8, r2
+    mla r2, r14, r7, r2
+    ldmia r3!, { r4, r14 }
+    mla r2, r4, r6, r2
+    mla r2, r14, r5, r2
+    ldr r4, [r3]              @ r4 = residual
+    add r2, r4, r2, asr r1    @ shift sum by qlevel bits and add residual
+    str r2, [r3], #-7*4       @ save result and wrap history pointer back
+    subs r0, r0, #1
+    bne .loop8
+    b .exit
+.order7:
+    ldmia r4, { r5-r11 }      @ coeffs[0..6] in r5-r11
+.loop7:
+    ldmia r3!, { r4, r12, r14 }
+    mul r2, r4, r11
+    mla r2, r12, r10, r2
+    mla r2, r14, r9, r2
+    ldmia r3!, { r4, r12, r14 }
+    mla r2, r4, r8, r2
+    mla r2, r12, r7, r2
+    mla r2, r14, r6, r2
+    ldr r4, [r3], #4          @ seventh (newest) history sample
+    mla r2, r4, r5, r2
+    ldr r4, [r3]              @ r4 = residual
+    add r2, r4, r2, asr r1
+    str r2, [r3], #-6*4       @ save result and wrap history pointer back
+    subs r0, r0, #1
+    bne .loop7
+    b .exit
+.order6:
+    ldmia r4, { r5-r10 }      @ coeffs[0..5] in r5-r10
+.loop6:
+    ldmia r3!, { r4, r11-r12, r14 }
+    mul r2, r4, r10
+    mla r2, r11, r9, r2
+    mla r2, r12, r8, r2
+    mla r2, r14, r7, r2
+    ldmia r3!, { r4, r11 }
+    mla r2, r4, r6, r2
+    mla r2, r11, r5, r2
+    ldr r4, [r3]              @ r4 = residual
+    add r2, r4, r2, asr r1
+    str r2, [r3], #-5*4       @ save result and wrap history pointer back
+    subs r0, r0, #1
+    bne .loop6
+    b .exit
+.order5:
+    ldmia r4, { r5-r9 }       @ coeffs[0..4] in r5-r9
+.loop5:
+    ldmia r3!, { r4, r10-r12, r14 } @ all five history samples in one load
+    mul r2, r4, r9
+    mla r2, r10, r8, r2
+    mla r2, r11, r7, r2
+    mla r2, r12, r6, r2
+    mla r2, r14, r5, r2
+    ldr r4, [r3]              @ r4 = residual
+    add r2, r4, r2, asr r1
+    str r2, [r3], #-4*4       @ save result and wrap history pointer back
+    subs r0, r0, #1
+    bne .loop5
+    b .exit
+.order4:
+    ldmia r4, { r5-r8 }       @ coeffs[0..3] in r5-r8
+.loop4:
+    ldmia r3!, { r4, r11-r12, r14 }
+    mul r2, r4, r8
+    mla r2, r11, r7, r2
+    mla r2, r12, r6, r2
+    mla r2, r14, r5, r2
+    ldr r4, [r3]              @ r4 = residual
+    add r2, r4, r2, asr r1
+    str r2, [r3], #-3*4       @ save result and wrap history pointer back
+    subs r0, r0, #1
+    bne .loop4
+    b .exit
+.order3:
+    ldmia r4, { r5-r7 }       @ coeffs[0..2] in r5-r7
+.loop3:
+    ldmia r3!, { r4, r12, r14 }
+    mul r2, r4, r7
+    mla r2, r12, r6, r2
+    mla r2, r14, r5, r2
+    ldr r4, [r3]              @ r4 = residual
+    add r2, r4, r2, asr r1
+    str r2, [r3], #-2*4       @ save result and wrap history pointer back
+    subs r0, r0, #1
+    bne .loop3
+    b .exit
+.order2:
+    ldmia r4, { r5-r6 }       @ coeffs[0..1] in r5-r6
+.loop2:
+    ldmia r3!, { r4, r14 }
+    mul r2, r4, r6
+    mla r2, r14, r5, r2
+    ldr r4, [r3]              @ r4 = residual
+    add r2, r4, r2, asr r1
+    str r2, [r3], #-1*4       @ save result and wrap history pointer back
+    subs r0, r0, #1
+    bne .loop2
+    b .exit
+.order1:
+    ldr r5, [r4]            @ load the one coef we need
+    ldr r4, [r3], #4        @ load one history sample, r3 now points to residual
+.loop1:
+    mul r2, r4, r5          @ multiply coef by history sample
+    ldr r4, [r3]            @ load residual
+    add r4, r4, r2, asr r1  @ add result to residual
+    str r4, [r3], #4        @ place r3 at next residual, we already have
+    subs r0, r0, #1         @ the current sample in r4 for the next iteration
+    bne .loop1
+    b .exit
+.default:
+    /* we do the filtering in an unrolled by 4 loop as far as we can, and then
+       do the rest by jump table. this path is only entered with an order
+       above 9, so the by-4 loop below always has at least two passes to run.
+       everything here is recomputed for each output sample. */
+    add r5, r4, r2, lsl #2   @ need to start in the other end of coefs
+    mov r7, r2, lsr #2       @ r7 = coefs/4
+    mov r14, #0              @ init accumulator
+.dloop1:
+    ldmdb r5!, { r8-r11 }    @ next four coefs, walking down towards coeffs[0]
+    ldmia r3!, { r6, r12 }   @ next two history samples, walking up
+    mla r14, r6, r11, r14    @ older sample pairs with the higher coef
+    mla r14, r12, r10, r14
+    ldmia r3!, { r6, r12 }
+    mla r14, r6, r9, r14
+    mla r14, r12, r8, r14
+    subs r7, r7, #1
+    bne .dloop1
+    and r7, r2, #3            @ get remaining samples to be filtered
+    add pc, pc, r7, lsl #2    @ jump into accumulator chain
+    /* pc reads as the address of the add plus 8, so a remainder of 0 lands
+       on the second table entry; the first entry is never executed. */
+@ jumptable:
+    b .dsave @ padding
+    b .dsave
+    b .oneleft
+    b .twoleft
+@ implicit .threeleft
+    ldr r12, [r5, #-4]!       @ r12 = next coef down
+    ldr r8, [r3], #4          @ r8 = next history sample up
+    mla r14, r12, r8, r14
+.twoleft:
+    ldr r12, [r5, #-4]!
+    ldr r8, [r3], #4
+    mla r14, r12, r8, r14
+.oneleft:
+    ldr r12, [r5, #-4]!
+    ldr r8, [r3], #4
+    mla r14, r12, r8, r14
+.dsave:
+    ldr r12, [r3]             @ load residual
+    add r14, r12, r14, asr r1 @ shift sum by qlevel bits and add residual
+    str r14, [r3], #4         @ store result
+    sub r3, r3, r2, lsl #2    @ and wrap history pointer back to next first pos
+    subs r0, r0, #1           @ are we done?
+    bne .default              @ no, prepare for next sample
+    /* ldmpc is a macro that is not defined in this file (presumably it comes
+       in through config.h); it is expected to restore r4-r11 and return via
+       the saved lr - confirm against its definition. */
+.exit:
+    ldmpc regs=r4-r11

diff --git a/lib/rbcodec/codecs/libffmpegFLAC/arm.S b/lib/rbcodec/codecs/libffmpegFLAC/arm.S new file mode 100644 index 0000000000..8adca77ce5 --- /dev/null +++ b/lib/rbcodec/codecs/libffmpegFLAC/arm.S
@@ -0,0 +1,271 @@
	1	/***************************************************************************
	2	* __________ __ ___.
	3	* Open \______ \ ____ ____ \| \| _\_ \|__ _______ ___
	4	* Source \| _// _ \_/ ___\\| \|/ /\| __ \ / _ \ \/ /
	5	* Jukebox \| \| ( <_> ) \___\| < \| \_\ ( <_> > < <
	6	* Firmware \|____\|_ /\____/ \___ >__\|_ \\|___ /\____/__/\_ \
	7	* \/ \/ \/ \/ \/
	8	* $Id$
	9	*
	10	* Copyright (C) 2006 by Thom Johansen
	11	*
	12	* This program is free software; you can redistribute it and/or
	13	* modify it under the terms of the GNU General Public License
	14	* as published by the Free Software Foundation; either version 2
	15	* of the License, or (at your option) any later version.
	16	*
	17	* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
	18	* KIND, either express or implied.
	19	*
	20	****************************************************************************/
	21
	22	#include "config.h"
	23
	24	/* The following is an assembler optimised version of the LPC filtering
	25	routines needed for FLAC decoding. It is optimised for use with ARM
	26	processors.
	27	LPC filtering for orders 1 to 9 is done in specially optimised, fully unrolled
	28	loops, while every order above 9 is handled by a slower default routine
	29	(order 0 needs no filtering and returns at once). */
	30	#ifdef USE_IRAM
	31	.section .icode,"ax",%progbits
	32	#else
	33	.text
	34	#endif
	35	.global lpc_decode_arm
	36	lpc_decode_arm: @ in-place LPC synthesis filter for FLAC decoding
	37	stmdb sp!, { r4-r11, lr } @ 9 words pushed: stack arguments now start at sp+36
	38	ldr r4, [sp, #36] @ r4 = coeffs (fifth argument)
	39	/* r0 = blocksize, r1 = qlevel, r2 = pred_order, r3 = data, r4 = coeffs.
	40	each data[i] holds a residual on entry and is replaced by
	41	data[i] + ((coeffs[0]*data[i-1] + ... + coeffs[n-1]*data[i-n]) >> qlevel), n = pred_order */
	42
	43	/* the history pointer always sits 'pred_order' samples below the data
	44	(residual) pointer. since we have one loop for each order, we can hard code
	45	this distance and free a register by keeping only the history pointer.
	46	*/
	47	sub r3, r3, r2, lsl #2 @ r3 = history = data - pred_order words
	48	cmp r0, #0 @ no samples to process; negative counts exit too,
	49	ble .exit @ as the loops only stop when r0 reaches exactly 0
	50	cmp r2, #9 @ check if order is too high for unrolled loops
	51	addls pc, pc, r2, lsl #2 @ pc reads as this insn + 8 (the 'b .exit' entry), so order N picks the entry N words after it
	52	@ jumptable:
	53	b .default @ order too high (addls was skipped), go to default routine
	54	b .exit @ zero order filter isn't possible, exit function
	55	b .order1
	56	b .order2
	57	b .order3
	58	b .order4
	59	b .order5
	60	b .order6
	61	b .order7
	62	b .order8
	63
	64	@ last jump table entry coincides with target, so leave it out
	65	.order9:
	66	ldmia r4, { r5-r12, r14 } @ fetch coefs: coeffs[0..7] in r5-r12, coeffs[8] in r14
	67	.loop9:
	68	ldr r4, [r3], #4 @ load first (oldest) history sample
	69	mul r2, r4, r14 @ multiply with last coef
	70	ldr r4, [r3], #4 @ rinse and repeat while accumulating sum in r2
	71	mla r2, r4, r12, r2
	72	ldr r4, [r3], #4
	73	mla r2, r4, r11, r2
	74	ldr r4, [r3], #4
	75	mla r2, r4, r10, r2
	76	ldr r4, [r3], #4
	77	mla r2, r4, r9, r2
	78	ldr r4, [r3], #4
	79	mla r2, r4, r8, r2
	80	ldr r4, [r3], #4
	81	mla r2, r4, r7, r2
	82	ldr r4, [r3], #4
	83	mla r2, r4, r6, r2
	84	ldr r4, [r3], #4 @ newest history sample, r3 now points to residual
	85	mla r2, r4, r5, r2 @ ... which pairs with coeffs[0]
	86	ldr r4, [r3] @ r4 = residual
	87	add r2, r4, r2, asr r1 @ shift sum by qlevel bits and add residual
	88	str r2, [r3], #-8*4 @ save result, then step back 8 words: one sample past the previous history start
	89	subs r0, r0, #1 @ check if we're done
	90	bne .loop9 @ nope, jump back
	91	b .exit
	92
	93	.order8:
	94	ldmia r4, { r5-r12 } @ coeffs[0..7] in r5-r12
	95	.loop8:
	96	@ we have more registers to spare here, so start block reading
	97	ldmia r3!, { r4, r14 } @ two history samples per load, oldest first
	98	mul r2, r4, r12 @ oldest sample pairs with the last coef
	99	mla r2, r14, r11, r2
	100	ldmia r3!, { r4, r14 }
	101	mla r2, r4, r10, r2
	102	mla r2, r14, r9, r2
	103	ldmia r3!, { r4, r14 }
	104	mla r2, r4, r8, r2
	105	mla r2, r14, r7, r2
	106	ldmia r3!, { r4, r14 }
	107	mla r2, r4, r6, r2
	108	mla r2, r14, r5, r2
	109	ldr r4, [r3] @ r4 = residual
	110	add r2, r4, r2, asr r1 @ shift sum by qlevel bits and add residual
	111	str r2, [r3], #-7*4 @ save result and wrap history pointer back
	112	subs r0, r0, #1
	113	bne .loop8
	114	b .exit
	115
	116	.order7:
	117	ldmia r4, { r5-r11 } @ coeffs[0..6] in r5-r11
	118	.loop7:
	119	ldmia r3!, { r4, r12, r14 }
	120	mul r2, r4, r11
	121	mla r2, r12, r10, r2
	122	mla r2, r14, r9, r2
	123	ldmia r3!, { r4, r12, r14 }
	124	mla r2, r4, r8, r2
	125	mla r2, r12, r7, r2
	126	mla r2, r14, r6, r2
	127	ldr r4, [r3], #4 @ seventh (newest) history sample
	128	mla r2, r4, r5, r2
	129	ldr r4, [r3] @ r4 = residual
	130	add r2, r4, r2, asr r1
	131	str r2, [r3], #-6*4 @ save result and wrap history pointer back
	132	subs r0, r0, #1
	133	bne .loop7
	134	b .exit
	135
	136	.order6:
	137	ldmia r4, { r5-r10 } @ coeffs[0..5] in r5-r10
	138	.loop6:
	139	ldmia r3!, { r4, r11-r12, r14 }
	140	mul r2, r4, r10
	141	mla r2, r11, r9, r2
	142	mla r2, r12, r8, r2
	143	mla r2, r14, r7, r2
	144	ldmia r3!, { r4, r11 }
	145	mla r2, r4, r6, r2
	146	mla r2, r11, r5, r2
	147	ldr r4, [r3] @ r4 = residual
	148	add r2, r4, r2, asr r1
	149	str r2, [r3], #-5*4 @ save result and wrap history pointer back
	150	subs r0, r0, #1
	151	bne .loop6
	152	b .exit
	153
	154	.order5:
	155	ldmia r4, { r5-r9 } @ coeffs[0..4] in r5-r9
	156	.loop5:
	157	ldmia r3!, { r4, r10-r12, r14 } @ all five history samples in one load
	158	mul r2, r4, r9
	159	mla r2, r10, r8, r2
	160	mla r2, r11, r7, r2
	161	mla r2, r12, r6, r2
	162	mla r2, r14, r5, r2
	163	ldr r4, [r3] @ r4 = residual
	164	add r2, r4, r2, asr r1
	165	str r2, [r3], #-4*4 @ save result and wrap history pointer back
	166	subs r0, r0, #1
	167	bne .loop5
	168	b .exit
	169
	170	.order4:
	171	ldmia r4, { r5-r8 } @ coeffs[0..3] in r5-r8
	172	.loop4:
	173	ldmia r3!, { r4, r11-r12, r14 }
	174	mul r2, r4, r8
	175	mla r2, r11, r7, r2
	176	mla r2, r12, r6, r2
	177	mla r2, r14, r5, r2
	178	ldr r4, [r3] @ r4 = residual
	179	add r2, r4, r2, asr r1
	180	str r2, [r3], #-3*4 @ save result and wrap history pointer back
	181	subs r0, r0, #1
	182	bne .loop4
	183	b .exit
	184
	185	.order3:
	186	ldmia r4, { r5-r7 } @ coeffs[0..2] in r5-r7
	187	.loop3:
	188	ldmia r3!, { r4, r12, r14 }
	189	mul r2, r4, r7
	190	mla r2, r12, r6, r2
	191	mla r2, r14, r5, r2
	192	ldr r4, [r3] @ r4 = residual
	193	add r2, r4, r2, asr r1
	194	str r2, [r3], #-2*4 @ save result and wrap history pointer back
	195	subs r0, r0, #1
	196	bne .loop3
	197	b .exit
	198
	199	.order2:
	200	ldmia r4, { r5-r6 } @ coeffs[0..1] in r5-r6
	201	.loop2:
	202	ldmia r3!, { r4, r14 }
	203	mul r2, r4, r6
	204	mla r2, r14, r5, r2
	205	ldr r4, [r3] @ r4 = residual
	206	add r2, r4, r2, asr r1
	207	str r2, [r3], #-1*4 @ save result and wrap history pointer back
	208	subs r0, r0, #1
	209	bne .loop2
	210	b .exit
	211
	212	.order1:
	213	ldr r5, [r4] @ load the one coef we need
	214	ldr r4, [r3], #4 @ load one history sample, r3 now points to residual
	215	.loop1:
	216	mul r2, r4, r5 @ multiply coef by history sample
	217	ldr r4, [r3] @ load residual
	218	add r4, r4, r2, asr r1 @ add result to residual
	219	str r4, [r3], #4 @ place r3 at next residual, we already have
	220	subs r0, r0, #1 @ the current sample in r4 for the next iteration
	221	bne .loop1
	222	b .exit
	223
	224	.default:
	225	/* we do the filtering in an unrolled by 4 loop as far as we can, and then
	226	do the rest by jump table. only entered with an order above 9, so the by-4 loop always has at least two passes. */
	227	add r5, r4, r2, lsl #2 @ need to start in the other end of coefs
	228	mov r7, r2, lsr #2 @ r7 = coefs/4
	229	mov r14, #0 @ init accumulator
	230	.dloop1:
	231	ldmdb r5!, { r8-r11 } @ next four coefs, walking down towards coeffs[0]
	232	ldmia r3!, { r6, r12 } @ next two history samples, walking up
	233	mla r14, r6, r11, r14 @ older sample pairs with the higher coef
	234	mla r14, r12, r10, r14
	235	ldmia r3!, { r6, r12 }
	236	mla r14, r6, r9, r14
	237	mla r14, r12, r8, r14
	238	subs r7, r7, #1
	239	bne .dloop1
	240
	241	and r7, r2, #3 @ get remaining samples to be filtered
	242	add pc, pc, r7, lsl #2 @ jump into accumulator chain (pc reads as this insn + 8)
	243	@ jumptable:
	244	b .dsave @ padding, never executed: a remainder of 0 lands on the next entry
	245	b .dsave
	246	b .oneleft
	247	b .twoleft
	248	@ implicit .threeleft
	249	ldr r12, [r5, #-4]! @ r12 = next coef down
	250	ldr r8, [r3], #4 @ r8 = next history sample up
	251	mla r14, r12, r8, r14
	252	.twoleft:
	253	ldr r12, [r5, #-4]!
	254	ldr r8, [r3], #4
	255	mla r14, r12, r8, r14
	256	.oneleft:
	257	ldr r12, [r5, #-4]!
	258	ldr r8, [r3], #4
	259	mla r14, r12, r8, r14
	260
	261	.dsave:
	262	ldr r12, [r3] @ load residual
	263	add r14, r12, r14, asr r1 @ shift sum by qlevel bits and add residual
	264	str r14, [r3], #4 @ store result
	265	sub r3, r3, r2, lsl #2 @ and wrap history pointer back to next first pos
	266	subs r0, r0, #1 @ are we done?
	267	bne .default @ no, prepare for next sample
	268
	269	.exit: @ ldmpc is a macro not defined in this file (presumably from config.h); expected to restore r4-r11 and return - confirm
	270	ldmpc regs=r4-r11
	271