libmusepack: ARMv6 assembler for mpc_decoder_windowing_D, speeds up decoding of a 128kbps sample file by 2MHz, or 8%, on the Gigabeat S. The output differs from the C implementation and the other ARM implementation by +/-1 in less than 0.1% of the output samples.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@28487 a1c6a512-1295-4272-9138-f99709370657
author: Nils Wallménius <nils@rockbox.org> 2010-11-05 11:20:50 +0000
committer: Nils Wallménius <nils@rockbox.org> 2010-11-05 11:20:50 +0000
commit: dbdc0a8a8cbfa4e6b72e5f6fb643f5b0ef4afc27 (patch)
tree: 7f18e2158a33462af7eae0d068e437be01ce5805
parent: 8404c53ee6d2f828fb7ea2b0713d2cd0afcfeeca (diff)
download: rockbox-dbdc0a8a8cbfa4e6b72e5f6fb643f5b0ef4afc27.tar.gz
rockbox-dbdc0a8a8cbfa4e6b72e5f6fb643f5b0ef4afc27.zip
2 files changed, 205 insertions, 5 deletions
diff --git a/apps/codecs/libmusepack/synth_filter.c b/apps/codecs/libmusepack/synth_filter.c
index 9a79328106..94c57eb213 100644
--- a/apps/codecs/libmusepack/synth_filter.c
+++ b/apps/codecs/libmusepack/synth_filter.c
@@ -46,10 +46,16 @@
 #if defined(MPC_FIXED_POINT)
    #if defined(CPU_ARM)
-      // do not up-scale D-values to achieve higher speed in smull/mlal
+      #if ARM_ARCH >= 6
-      // operations. saves ~14/8 = 1.75 cycles per multiplication
+        // on ARMv6 we use 32*32=64>>32 multiplies (smmul/smmla) so we need to scale up the D coefficients
-      #define D(value)  (value)
+        // the ARM11 multiplier doesn't have early termination so the magnitude of the multiplicands does not
-      
+        // matter for speed.
+        #define D(value)  (value << (14))
+      #else
+        // do not up-scale D-values to achieve higher speed in smull/mlal
+        // operations. saves ~14/8 = 1.75 cycles per multiplication
+        #define D(value)  (value)
+      #endif
      // in this configuration a post-shift by >>16 is needed after synthesis
    #else
      // saturate to +/- 2^31 (= value << (31-17)), D-values are +/- 2^17
diff --git a/apps/codecs/libmusepack/synth_filter_arm.S b/apps/codecs/libmusepack/synth_filter_arm.S
index b44e029a43..598f218e45 100644
--- a/apps/codecs/libmusepack/synth_filter_arm.S
+++ b/apps/codecs/libmusepack/synth_filter_arm.S
@@ -296,7 +296,7 @@ mpc_decoder_windowing_D:
    add r1, r1, #4              /* V++ */
    
    ldmpc regs=r4-r11
-#else /* arm9 and above */
+#elif ARM_ARCH < 6 /* arm9 and above */
    mpc_decoder_windowing_D:
    /* r0 = Data[] */
    /* r1 = V[] */
@@ -501,6 +501,200 @@ mpc_decoder_windowing_D:
    add r1, r1, #4              /* V++ */
    
    ldmpc regs=r4-r11
+#else
+    mpc_decoder_windowing_D:
+    /* r0 = Data[]: output pointer, results are stored through it */
+    /* r1 = V[]: input pointer, read at word offsets 0..992 */
+    /* r2 = D[]: window coefficients, consumed in order via ldmia */
+    /* lr = counter: loop count, also byte offset of the mirrored store */
+    /************************************************************************
+     * Further speed up through making use of symmetries within D[]-window.
+     * The row V[00] can be extracted as it has symmetries within this single
+     * row. 8 smull/mlal and 8 ldr's can be saved at the cost of 2 add's.
+     * The rows V[01..15] are symmetric to V[31..17]. 15 x 16 ldr's can be
+     * saved at the cost of 15 x 4 + 1 add's.
+     * The row V[16] can be extracted as it has symmetries within this single
+     * row. 8 smull/mlal and 8 ldr's can be saved.
+     * On armv6 use smmulr/smmlar which are faster than smull/smlal and only
+     * accumulate the top 32 bits of the result so that frees up 2
+     * registers so we can ldm larger blocks.
+     ***********************************************************************/
+    stmfd   sp!, {r4-r11, lr}   /* save callee-saved regs and return address */
+    
+    /******************************************
+     * row 0 with internal symmetry
+     *****************************************/
+    add r2, r2, #4          /* D+=1, r2 = D[01] as D[00] = 0 */
+    ldmia r2!, { r3-r6 }    /* load D[01..04] */
+    ldr r7 , [r1, #96*4]    /*  1 */
+    ldr r10, [r1, #992*4]   /* 15 */
+    ldr r11, [r1, #128*4]   /*  2 */
+    rsb r10, r10, r7        /* V[01] - V[15] */
+    ldr r12, [r1, #896*4]   /* 14 */
+    smmulr r9, r10, r3      /* r9 = rounded top 32 bits of the product */
+    ldr r7 , [r1, #224*4]   /*  3 */
+    add r12, r12, r11       /* V[02] + V[14] */
+    ldr r10, [r1, #864*4]   /* 13 */
+    smmlar r9, r12, r4, r9  /* r9 += rounded top 32 bits of the product */
+    ldr r11, [r1, #256*4]   /*  4 */
+    rsb r10, r10, r7        /* V[03] - V[13] */
+    ldr r12, [r1, #768*4]   /* 12 */
+    smmlar r9, r10, r5, r9
+    ldr r7 , [r1, #352*4]   /*  5 */
+    add r12, r12, r11       /* V[04] + V[12] */
+    ldr r10, [r1, #736*4]   /* 11 */
+    smmlar r9, r12, r6, r9
+    ldmia r2!, { r3-r6 }    /* load D[05..08] */
+    ldr r11, [r1, #384*4]   /*  6 */
+    rsb r10, r10, r7        /* V[05] - V[11] */
+    ldr r12, [r1, #640*4]   /* 10 */
+    smmlar r9, r10, r3, r9
+    ldr r7 , [r1, #480*4]   /*  7 */
+    add r12, r12, r11       /* V[06] + V[10] */
+    ldr r10, [r1, #608*4]   /*  9 */
+    smmlar r9, r12, r4, r9
+    rsb r10, r10, r7        /* V[07] - V[09] */
+    ldr r11, [r1, #512*4]   /*  8 */
+    smmlar r9, r10, r5, r9
+    add r1, r1, #4          /* V+=1, r1 = V[01] */
+    smmlar r9, r11, r6, r9
+    add r2, r2, #7*4        /* D+=7, r2 = D[16] */
+    mov r9, r9, lsl #2      /* D() is pre-shifted <<14 (synth_filter.c) and smm* drops 32 bits, so <<2 leaves a net >>16 */
+    str r9, [r0], #4        /* store Data */
+    /******************************************
+     * rows 01..15 are symmetric to rows 31..17
+     * r9  = acc of 01..15
+     * r1  = V[01..15]
+     * r11 = acc of 31..17
+     * r12 = V[31..17]
+     *****************************************/
+    mov lr, #15*8               /* 15 rows; lr is also the byte offset from Data[n] to Data[32-n] */
+    add r12, r1, #30*4          /* r12 = V[31] */
+.loop15:
+    ldmia r2!, { r3-r6 }        /* load D[00..03] */
+    ldr r7, [r12, #896*4]       /* 14 */
+    ldr r8, [r12, #992*4]       /* 15 */
+    smmulr r11, r7, r4          /* mirrored row pairs the coefficients in reverse order */
+    ldr r7, [r1]                /*  0 */
+    smmlar r11, r8, r3, r11
+    ldr r8, [r1, #96*4]         /*  1 */
+    smmulr r9, r7, r3
+    ldr r7, [r12, #768*4]       /* 12 */
+    smmlar r9, r8, r4, r9
+    ldr r8, [r12, #864*4]       /* 13 */
+    smmlar r11, r7, r6, r11
+    ldr r7, [r1, #128*4]        /*  2 */
+    smmlar r11, r8, r5, r11
+    ldr r8, [r1, #224*4]        /*  3 */
+    smmlar r9, r7, r5, r9
+    ldr r7, [r1, #256*4]        /*  4 */
+    smmlar r9, r8, r6, r9
+    ldmia r2!, { r3-r6 }        /* load D[04..07] */
+    ldr r8, [r1, #352*4]        /*  5 */
+    smmlar r9, r7, r3, r9
+    ldr r7, [r12, #640*4]       /* 10 */
+    smmlar r9, r8, r4, r9
+    ldr r8, [r12, #736*4]       /* 11 */
+    smmlar r11, r7, r4, r11
+    ldr r7, [r1, #384*4]        /*  6 */
+    smmlar r11, r8, r3, r11
+    ldr r8, [r1, #480*4]        /*  7 */
+    smmlar r9, r7, r5, r9
+    ldr r7, [r12, #512*4]       /*  8 */
+    smmlar r9, r8, r6, r9
+    ldr r8, [r12, #608*4]       /*  9 */
+    smmlar r11, r7, r6, r11
+    ldr r7, [r12, #384*4]       /*  6 */
+    smmlar r11, r8, r5, r11
+    ldmia r2!, { r3-r6 }        /* load D[08..11] */
+    ldr r8, [r12, #480*4]       /*  7 */
+    smmlar r11, r7, r4, r11
+    ldr r7, [r1, #512*4]        /*  8 */
+    smmlar r11, r8, r3, r11
+    ldr r8, [r1, #608*4]        /*  9 */
+    smmlar r9, r7, r3, r9
+    ldr r7, [r1, #640*4]        /* 10 */
+    smmlar r9, r8, r4, r9
+    ldr r8, [r1, #736*4]        /* 11 */
+    smmlar r9, r7, r5, r9
+    ldr r7, [r12, #256*4]       /*  4 */
+    smmlar r9, r8, r6, r9
+    ldr r8, [r12, #352*4]       /*  5 */
+    smmlar r11, r7, r6, r11
+    ldr r7, [r1, #768*4]        /* 12 */
+    smmlar r11, r8, r5, r11
+    ldmia r2!, { r3-r6 }        /* load D[12..15] */
+    ldr r8, [r1, #864*4]        /* 13 */
+    smmlar r9, r7, r3, r9
+    ldr r7, [r12, #128*4]       /*  2 */
+    smmlar r9, r8, r4, r9
+    ldr r8, [r12, #224*4]       /*  3 */
+    smmlar r11, r7, r4, r11
+    ldr r7, [r12]               /*  0 */
+    smmlar r11, r8, r3, r11
+    ldr r8, [r12, #96*4]        /*  1 */
+    smmlar r11, r7, r6, r11
+    ldr r7, [r1, #896*4]        /* 14 */
+    smmlar r11, r8, r5, r11
+    ldr r8, [r1, #992*4]        /* 15 */
+    smmlar r9, r7, r5, r9
+    sub r12, r12, #4            /* r12 = V-- correct addresses for next loop */
+    smmlar r9, r8, r6, r9
+    add r1, r1, #4              /* r1  = V++ correct addresses for next loop */
+    rsb r11, r11, #0            /* r11 = -r11 */
+    /* store Data[01..15] */
+    mov r9, r9, lsl #2
+    str r9, [r0]                /* store Data */
+    /* store Data[31..17] */
+    mov r11, r11, lsl #2
+    str r11, [r0, lr]           /* store Data */
+    add r0, r0, #4              /* r0++ */
+    /* next loop */
+    subs lr, lr, #8             /* mirror offset shrinks by two words per row */
+    bgt .loop15
+    
+    /******************************************
+     * V[16] with internal symmetry
+     *****************************************/
+    ldmia r2!, { r3-r6 }        /* load D[00..03] */
+    ldr r7 , [r1]               /*  0 */
+    ldr r10, [r1, #992*4]       /* 15 */
+    ldr r11, [r1, #96*4]        /*  1 */
+    rsb r10, r10, r7            /* V[00] - V[15] */
+    ldr r12, [r1, #896*4]       /* 14 */
+    smmulr r9, r10, r3
+    ldr r7 , [r1, #128*4]       /*  2 */
+    rsb r12, r12, r11           /* V[01] - V[14] */
+    ldr r10, [r1, #864*4]       /* 13 */
+    smmlar r9, r12, r4, r9
+    ldr r11, [r1, #224*4]       /*  3 */
+    rsb r10, r10, r7            /* V[02] - V[13] */
+    ldr r12, [r1, #768*4]       /* 12 */
+    smmlar r9, r10, r5, r9
+    ldr r7 , [r1, #256*4]       /*  4 */
+    rsb r12, r12, r11           /* V[03] - V[12] */
+    ldr r10, [r1, #736*4]       /* 11 */
+    smmlar r9, r12, r6, r9
+    ldmia r2!, { r3-r6 }        /* load D[04..07] */
+    ldr r11, [r1, #352*4]       /*  5 */
+    rsb r10, r10, r7            /* V[04] - V[11] */
+    ldr r12, [r1, #640*4]       /* 10 */
+    smmlar r9, r10, r3, r9
+    ldr r7 , [r1, #384*4]       /*  6 */
+    rsb r12, r12, r11           /* V[05] - V[10] */
+    ldr r10, [r1, #608*4]       /*  9 */
+    smmlar r9, r12, r4, r9
+    ldr r11, [r1, #480*4]       /*  7 */
+    rsb r10, r10, r7            /* V[06] - V[09] */
+    ldr r12, [r1, #512*4]       /*  8 */
+    smmlar r9, r10, r5, r9
+    rsb r12, r12, r11           /* V[07] - V[08] */
+    smmlar r9, r12, r6, r9
+    mov r9, r9, lsl #2
+    str r9, [r0], #4            /* store Data */
+    
+    ldmpc regs=r4-r11           /* macro defined outside this patch; presumably restores r4-r11 and returns - confirm */
 #endif
 .mpc_dewindowing_end:
    .size   mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D
author	Nils Wallménius <nils@rockbox.org>	2010-11-05 11:20:50 +0000
committer	Nils Wallménius <nils@rockbox.org>	2010-11-05 11:20:50 +0000
commit	dbdc0a8a8cbfa4e6b72e5f6fb643f5b0ef4afc27 (patch)
tree	7f18e2158a33462af7eae0d068e437be01ce5805
parent	8404c53ee6d2f828fb7ea2b0713d2cd0afcfeeca (diff)
download	rockbox-dbdc0a8a8cbfa4e6b72e5f6fb643f5b0ef4afc27.tar.gz rockbox-dbdc0a8a8cbfa4e6b72e5f6fb643f5b0ef4afc27.zip

diff --git a/apps/codecs/libmusepack/synth_filter.c b/apps/codecs/libmusepack/synth_filter.c index 9a79328106..94c57eb213 100644 --- a/apps/codecs/libmusepack/synth_filter.c +++ b/apps/codecs/libmusepack/synth_filter.c
@@ -46,10 +46,16 @@
46		46
47	#if defined(MPC_FIXED_POINT)	47	#if defined(MPC_FIXED_POINT)
48	#if defined(CPU_ARM)	48	#if defined(CPU_ARM)
49	// do not up-scale D-values to achieve higher speed in smull/mlal	49	#if ARM_ARCH >= 6
50	// operations. saves ~14/8 = 1.75 cycles per multiplication	50	// on ARMv6 we use 32*32=64>>32 multiplies (smmul/smmla) so we need to scale up the D coefficients
51	#define D(value) (value)	51	// the ARM11 multiplier doesn't have early termination so the magnitude of the multiplicands does not
52		52	// matter for speed.
		53	#define D(value) (value << (14))
		54	#else
		55	// do not up-scale D-values to achieve higher speed in smull/mlal
		56	// operations. saves ~14/8 = 1.75 cycles per multiplication
		57	#define D(value) (value)
		58	#endif
53	// in this configuration a post-shift by >>16 is needed after synthesis	59	// in this configuration a post-shift by >>16 is needed after synthesis
54	#else	60	#else
55	// saturate to +/- 2^31 (= value << (31-17)), D-values are +/- 2^17	61	// saturate to +/- 2^31 (= value << (31-17)), D-values are +/- 2^17


diff --git a/apps/codecs/libmusepack/synth_filter_arm.S b/apps/codecs/libmusepack/synth_filter_arm.S index b44e029a43..598f218e45 100644 --- a/apps/codecs/libmusepack/synth_filter_arm.S +++ b/apps/codecs/libmusepack/synth_filter_arm.S
@@ -296,7 +296,7 @@ mpc_decoder_windowing_D:
296	add r1, r1, #4 /* V++ */	296	add r1, r1, #4 /* V++ */
297		297
298	ldmpc regs=r4-r11	298	ldmpc regs=r4-r11
299	#else /* arm9 and above */	299	#elif ARM_ARCH < 6 /* arm9 and above */
300	mpc_decoder_windowing_D:	300	mpc_decoder_windowing_D:
301	/* r0 = Data[] */	301	/* r0 = Data[] */
302	/* r1 = V[] */	302	/* r1 = V[] */
@@ -501,6 +501,200 @@ mpc_decoder_windowing_D:
501	add r1, r1, #4 /* V++ */	501	add r1, r1, #4 /* V++ */
502		502
503	ldmpc regs=r4-r11	503	ldmpc regs=r4-r11
		504	#else
		505	mpc_decoder_windowing_D:
		506	/* r0 = Data[] */
		507	/* r1 = V[] */
		508	/* r2 = D[] */
		509	/* lr = counter */
		510	/************************************************************************
		511	* Further speed up through making use of symmetries within D[]-window.
		512	* The row V[00] can be extracted as it has symmetries within this single
		513	* row. 8 smull/mlal and 8 ldr's can be saved at the cost of 2 add's.
		514	* The rows V[01..15] are symmetric to V[31..17]. 15 x 16 ldr's can be
		515	* saved at the cost of 15 x 4 + 1 add's.
		516	* The row V[16] can be extracted as it has symmetries within this single
		517	* row. 8 smull/mlal and 8 ldr's can be saved.
		518	* On armv6 use smmulr/smmlar which are faster than smull/smlal and only
		519	* accumulate the top 32 bits of the result so that frees up 2
		520	* registers so we can ldm larger blocks.
		521	***********************************************************************/
		522	stmfd sp!, {r4-r11, lr}
		523
		524	/******************************************
		525	* row 0 with internal symmetry
		526	*****************************************/
		527	add r2, r2, #4 /* D+=1, r2 = D[01] as D[00] = 0 */
		528	ldmia r2!, { r3-r6 } /* load D[01..04] */
		529	ldr r7 , [r1, #96*4] /* 1 */
		530	ldr r10, [r1, #992*4] /* 15 */
		531	ldr r11, [r1, #128*4] /* 2 */
		532	rsb r10, r10, r7 /* V[01] - V[15] */
		533	ldr r12, [r1, #896*4] /* 14 */
		534	smmulr r9, r10, r3
		535	ldr r7 , [r1, #224*4] /* 3 */
		536	add r12, r12, r11 /* V[02] + V[14] */
		537	ldr r10, [r1, #864*4] /* 13 */
		538	smmlar r9, r12, r4, r9
		539	ldr r11, [r1, #256*4] /* 4 */
		540	rsb r10, r10, r7 /* V[03] - V[13] */
		541	ldr r12, [r1, #768*4] /* 12 */
		542	smmlar r9, r10, r5, r9
		543	ldr r7 , [r1, #352*4] /* 5 */
		544	add r12, r12, r11 /* V[04] + V[12] */
		545	ldr r10, [r1, #736*4] /* 11 */
		546	smmlar r9, r12, r6, r9
		547	ldmia r2!, { r3-r6 } /* load D[05..08] */
		548	ldr r11, [r1, #384*4] /* 6 */
		549	rsb r10, r10, r7 /* V[05] - V[11] */
		550	ldr r12, [r1, #640*4] /* 10 */
		551	smmlar r9, r10, r3, r9
		552	ldr r7 , [r1, #480*4] /* 7 */
		553	add r12, r12, r11 /* V[06] + V[10] */
		554	ldr r10, [r1, #608*4] /* 9 */
		555	smmlar r9, r12, r4, r9
		556	rsb r10, r10, r7 /* V[07] - V[09] */
		557	ldr r11, [r1, #512*4] /* 8 */
		558	smmlar r9, r10, r5, r9
		559	add r1, r1, #4 /* V+=1, r1 = V[01] */
		560	smmlar r9, r11, r6, r9
		561	add r2, r2, #7*4 /* D+=7, r2 = D[16] */
		562	mov r9, r9, lsl #2
		563	str r9, [r0], #4 /* store Data */
		564
		565	/******************************************
		566	* rows 01..15 are symmetric to rows 31..17
		567	* r9 = acc of 01..15
		568	* r1 = V[01..15]
		569	* r11 = acc of 31..17
		570	* r12 = V[31..17]
		571	*****************************************/
		572	mov lr, #15*8
		573	add r12, r1, #30*4 /* r12 = V[31] */
		574	.loop15:
		575	ldmia r2!, { r3-r6 } /* load D[00..03] */
		576	ldr r7, [r12, #896*4] /* 14 */
		577	ldr r8, [r12, #992*4] /* 15 */
		578	smmulr r11, r7, r4
		579	ldr r7, [r1] /* 0 */
		580	smmlar r11, r8, r3, r11
		581	ldr r8, [r1, #96*4] /* 1 */
		582	smmulr r9, r7, r3
		583	ldr r7, [r12, #768*4] /* 12 */
		584	smmlar r9, r8, r4, r9
		585	ldr r8, [r12, #864*4] /* 13 */
		586	smmlar r11, r7, r6, r11
		587	ldr r7, [r1, #128*4] /* 2 */
		588	smmlar r11, r8, r5, r11
		589	ldr r8, [r1, #224*4] /* 3 */
		590	smmlar r9, r7, r5, r9
		591	ldr r7, [r1, #256*4] /* 4 */
		592	smmlar r9, r8, r6, r9
		593	ldmia r2!, { r3-r6 } /* load D[04..07] */
		594	ldr r8, [r1, #352*4] /* 5 */
		595	smmlar r9, r7, r3, r9
		596	ldr r7, [r12, #640*4] /* 10 */
		597	smmlar r9, r8, r4, r9
		598	ldr r8, [r12, #736*4] /* 11 */
		599	smmlar r11, r7, r4, r11
		600	ldr r7, [r1, #384*4] /* 6 */
		601	smmlar r11, r8, r3, r11
		602	ldr r8, [r1, #480*4] /* 7 */
		603	smmlar r9, r7, r5, r9
		604	ldr r7, [r12, #512*4] /* 8 */
		605	smmlar r9, r8, r6, r9
		606	ldr r8, [r12, #608*4] /* 9 */
		607	smmlar r11, r7, r6, r11
		608	ldr r7, [r12, #384*4] /* 6 */
		609	smmlar r11, r8, r5, r11
		610	ldmia r2!, { r3-r6 } /* load D[08..11] */
		611	ldr r8, [r12, #480*4] /* 7 */
		612	smmlar r11, r7, r4, r11
		613	ldr r7, [r1, #512*4] /* 8 */
		614	smmlar r11, r8, r3, r11
		615	ldr r8, [r1, #608*4] /* 9 */
		616	smmlar r9, r7, r3, r9
		617	ldr r7, [r1, #640*4] /* 10 */
		618	smmlar r9, r8, r4, r9
		619	ldr r8, [r1, #736*4] /* 11 */
		620	smmlar r9, r7, r5, r9
		621	ldr r7, [r12, #256*4] /* 4 */
		622	smmlar r9, r8, r6, r9
		623	ldr r8, [r12, #352*4] /* 5 */
		624	smmlar r11, r7, r6, r11
		625	ldr r7, [r1, #768*4] /* 12 */
		626	smmlar r11, r8, r5, r11
		627	ldmia r2!, { r3-r6 } /* load D[12..15] */
		628	ldr r8, [r1, #864*4] /* 13 */
		629	smmlar r9, r7, r3, r9
		630	ldr r7, [r12, #128*4] /* 2 */
		631	smmlar r9, r8, r4, r9
		632	ldr r8, [r12, #224*4] /* 3 */
		633	smmlar r11, r7, r4, r11
		634	ldr r7, [r12] /* 0 */
		635	smmlar r11, r8, r3, r11
		636	ldr r8, [r12, #96*4] /* 1 */
		637	smmlar r11, r7, r6, r11
		638	ldr r7, [r1, #896*4] /* 14 */
		639	smmlar r11, r8, r5, r11
		640	ldr r8, [r1, #992*4] /* 15 */
		641	smmlar r9, r7, r5, r9
		642	sub r12, r12, #4 /* r12 = V-- correct addresses for next loop */
		643	smmlar r9, r8, r6, r9
		644	add r1, r1, #4 /* r1 = V++ correct addresses for next loop */
		645	rsb r11, r11, #0 /* r11 = -r11 */
		646	/* store Data[01..15] */
		647	mov r9, r9, lsl #2
		648	str r9, [r0] /* store Data */
		649	/* store Data[31..17] */
		650	mov r11, r11, lsl #2
		651	str r11, [r0, lr] /* store Data */
		652	add r0, r0, #4 /* r0++ */
		653	/* next loop */
		654	subs lr, lr, #8
		655	bgt .loop15
		656
		657	/******************************************
		658	* V[16] with internal symmetry
		659	*****************************************/
		660	ldmia r2!, { r3-r6 } /* load D[00..03] */
		661	ldr r7 , [r1] /* 0 */
		662	ldr r10, [r1, #992*4] /* 15 */
		663	ldr r11, [r1, #96*4] /* 1 */
		664	rsb r10, r10, r7 /* V[00] - V[15] */
		665	ldr r12, [r1, #896*4] /* 14 */
		666	smmulr r9, r10, r3
		667	ldr r7 , [r1, #128*4] /* 2 */
		668	rsb r12, r12, r11 /* V[01] - V[14] */
		669	ldr r10, [r1, #864*4] /* 13 */
		670	smmlar r9, r12, r4, r9
		671	ldr r11, [r1, #224*4] /* 3 */
		672	rsb r10, r10, r7 /* V[02] - V[13] */
		673	ldr r12, [r1, #768*4] /* 12 */
		674	smmlar r9, r10, r5, r9
		675	ldr r7 , [r1, #256*4] /* 4 */
		676	rsb r12, r12, r11 /* V[03] - V[12] */
		677	ldr r10, [r1, #736*4] /* 11 */
		678	smmlar r9, r12, r6, r9
		679	ldmia r2!, { r3-r6 } /* load D[04..07] */
		680	ldr r11, [r1, #352*4] /* 5 */
		681	rsb r10, r10, r7 /* V[04] - V[11] */
		682	ldr r12, [r1, #640*4] /* 10 */
		683	smmlar r9, r10, r3, r9
		684	ldr r7 , [r1, #384*4] /* 6 */
		685	rsb r12, r12, r11 /* V[05] - V[10] */
		686	ldr r10, [r1, #608*4] /* 9 */
		687	smmlar r9, r12, r4, r9
		688	ldr r11, [r1, #480*4] /* 7 */
		689	rsb r10, r10, r7 /* V[06] - V[09] */
		690	ldr r12, [r1, #512*4] /* 8 */
		691	smmlar r9, r10, r5, r9
		692	rsb r12, r12, r11 /* V[07] - V[08] */
		693	smmlar r9, r12, r6, r9
		694	mov r9, r9, lsl #2
		695	str r9, [r0], #4 /* store Data */
		696
		697	ldmpc regs=r4-r11
504	#endif	698	#endif
505	.mpc_dewindowing_end:	699	.mpc_dewindowing_end:
506	.size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D	700	.size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D