From c13eba29ff5615cc74a7818e42cc9d464a7c7075 Mon Sep 17 00:00:00 2001 From: Tomasz Malesinski Date: Thu, 27 Sep 2007 21:58:51 +0000 Subject: FS #7833: Optimizations to the Vorbis codec: - ARM assembly version of parts of mdct, - special case for vorbis_book_decodevv_add for 2 channels and even book->dim, - store the output in vb->pcm if possible, as it is usually in IRAM as opposed to v->pcm. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@14875 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/Tremor/SOURCES | 3 + apps/codecs/Tremor/block.c | 20 +- apps/codecs/Tremor/codebook.c | 54 ++++- apps/codecs/Tremor/ivorbiscodec.h | 1 + apps/codecs/Tremor/mdct.c | 15 ++ apps/codecs/Tremor/mdct_arm.S | 419 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 502 insertions(+), 10 deletions(-) create mode 100644 apps/codecs/Tremor/mdct_arm.S diff --git a/apps/codecs/Tremor/SOURCES b/apps/codecs/Tremor/SOURCES index 0877941808..9b8c05e340 100644 --- a/apps/codecs/Tremor/SOURCES +++ b/apps/codecs/Tremor/SOURCES @@ -7,6 +7,9 @@ framing.c info.c mapping0.c mdct.c +#ifdef CPU_ARM +mdct_arm.S +#endif registry.c res012.c sharedbook.c diff --git a/apps/codecs/Tremor/block.c b/apps/codecs/Tremor/block.c index 80cbb7809c..e609fc44f7 100644 --- a/apps/codecs/Tremor/block.c +++ b/apps/codecs/Tremor/block.c @@ -171,6 +171,7 @@ static int _vds_init(vorbis_dsp_state *v,vorbis_info *vi){ v->pcm_storage=ci->blocksizes[1]; v->pcm=(ogg_int32_t **)_ogg_malloc(vi->channels*sizeof(*v->pcm)); + v->pcmb=(ogg_int32_t **)_ogg_malloc(vi->channels*sizeof(*v->pcmb)); v->pcmret=(ogg_int32_t **)_ogg_malloc(vi->channels*sizeof(*v->pcmret)); for(i=0;ichannels;i++) @@ -308,25 +309,28 @@ int vorbis_synthesis_blockin(vorbis_dsp_state *v,vorbis_block *vb){ /* large/large */ ogg_int32_t *pcm=v->pcm[j]+prevCenter; ogg_int32_t *p=vb->pcm[j]; - vect_add(pcm, p, n1); + vect_add(p, pcm, n1); + v->pcmb[j]=p; }else{ /* large/small */ ogg_int32_t *pcm=v->pcm[j]+prevCenter+n1/2-n0/2; ogg_int32_t *p=vb->pcm[j]; vect_add(pcm, p, n0); + v->pcmb[j]=v->pcm[j]+prevCenter; } }else{ if(v->W){ /* small/large */ ogg_int32_t *pcm=v->pcm[j]+prevCenter; ogg_int32_t *p=vb->pcm[j]+n1/2-n0/2; - vect_add(pcm, p, n0); - vect_copy(&pcm[n0], &p[n0], n1/2-n0/2); + vect_add(p, pcm, n0); + v->pcmb[j]=p; }else{ /* small/small */ ogg_int32_t *pcm=v->pcm[j]+prevCenter; ogg_int32_t *p=vb->pcm[j]; - vect_add(pcm, p, n0); + vect_add(p, pcm, n0); + v->pcmb[j]=p; } } @@ -351,10 +355,8 @@ int vorbis_synthesis_blockin(vorbis_dsp_state *v,vorbis_block *vb){ v->pcm_returned=thisCenter; v->pcm_current=thisCenter; }else{ - v->pcm_returned=prevCenter; - v->pcm_current=prevCenter+ - ci->blocksizes[v->lW]/4+ - ci->blocksizes[v->W]/4; + v->pcm_returned=0; + v->pcm_current=ci->blocksizes[v->lW]/4+ci->blocksizes[v->W]/4; } } @@ -436,7 +438,7 @@ int vorbis_synthesis_pcmout(vorbis_dsp_state *v,ogg_int32_t ***pcm){ if(pcm){ int i; for(i=0;ichannels;i++) - v->pcmret[i]=v->pcm[i]+v->pcm_returned; + v->pcmret[i]=v->pcmb[i]+v->pcm_returned; *pcm=v->pcmret; } return(v->pcm_current-v->pcm_returned); diff --git a/apps/codecs/Tremor/codebook.c b/apps/codecs/Tremor/codebook.c index 1287a95011..8c319ab49e 100644 --- a/apps/codecs/Tremor/codebook.c +++ b/apps/codecs/Tremor/codebook.c @@ -199,7 +199,7 @@ STIN long decode_packed_entry_number(codebook *book, return(-1); } -static inline long decode_packed_block(codebook *book, oggpack_buffer *b, +static long decode_packed_block(codebook *book, oggpack_buffer *b, long *buf, int n){ long *bufptr = buf; long *bufend = buf + n; @@ -399,6 +399,55 @@ long vorbis_book_decodev_set(codebook *book,ogg_int32_t *a, return(0); } +static long vorbis_book_decodevv_add_2ch_even(codebook *book,ogg_int32_t **a, + long offset,oggpack_buffer *b, + int n,int point){ + long i,k,chunk,read; + int shift=point-book->binarypoint; + long entries[32]; + ogg_int32_t *p0 = &(a[0][offset]); + ogg_int32_t *p1 = &(a[1][offset]); + + if(shift>=0){ + + for(i=0;idim>(n-i)*2) + chunk=((n-i)*2+book->dim-1)/book->dim; + read = decode_packed_block(book,b,entries,chunk); + for(k=0;kvaluelist+entries[k]*book->dim; + const ogg_int32_t *u = t+book->dim; + do{ + *p0++ += *t++>>shift; + *p1++ += *t++>>shift; + }while(tdim/2; + } + }else{ + shift = -shift; + for(i=0;idim>(n-i)*2) + chunk=((n-i)*2+book->dim-1)/book->dim; + read = decode_packed_block(book,b,entries,chunk); + for(k=0;kvaluelist+entries[k]*book->dim; + const ogg_int32_t *u = t+book->dim; + do{ + *p0++ += *t++<dim/2; + } + } + return(0); +} + long vorbis_book_decodevv_add(codebook *book,ogg_int32_t **a, long offset,int ch, oggpack_buffer *b,int n,int point){ @@ -408,6 +457,9 @@ long vorbis_book_decodevv_add(codebook *book,ogg_int32_t **a, int shift=point-book->binarypoint; long entries[32]; + if (!(book->dim&1) && ch==2) + return vorbis_book_decodevv_add_2ch_even(book,a,offset,b,n,point); + if(shift>=0){ for(i=offset;i>1), + sincos_lookup0, step, sincos_lookup0+1024); +} + +#else /* 8 point butterfly (in place) */ STIN void mdct_butterfly_8(DATA_TYPE *x){ @@ -225,6 +238,8 @@ void mdct_butterfly_generic(DATA_TYPE *x,int points, int step){ }while(T>sincos_lookup0); } +#endif /* CPU_ARM */ + STIN void mdct_butterflies(DATA_TYPE *x,int points,int shift) { int stages=8-shift; diff --git a/apps/codecs/Tremor/mdct_arm.S b/apps/codecs/Tremor/mdct_arm.S new file mode 100644 index 0000000000..495e6a17c9 --- /dev/null +++ b/apps/codecs/Tremor/mdct_arm.S @@ -0,0 +1,419 @@ +/*************************************************************************** + * __________ __ ___. + * Open \______ \ ____ ____ | | _\_ |__ _______ ___ + * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / + * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < + * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ + * \/ \/ \/ \/ \/ + * $Id: $ + * + * Copyright (C) 2007 by Tomasz Malesinski + * + * All files in this archive are subject to the GNU General Public License. + * See the file COPYING in the source tree root for full license agreement. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ****************************************************************************/ + +#define cPI3_8 (0x30fbc54d) +#define cPI2_8 (0x5a82799a) +#define cPI1_8 (0x7641af3d) + + .section .icode,"ax",%progbits + .align + + .global mdct_butterfly_32 + .global mdct_butterfly_generic_loop + +mdct_butterfly_8: + add r9, r5, r1 @ x4 + x0 + sub r5, r5, r1 @ x4 - x0 + add r7, r6, r2 @ x5 + x1 + sub r6, r6, r2 @ x5 - x1 + add r8, r10, r3 @ x6 + x2 + sub r10, r10, r3 @ x6 - x2 + add r12, r11, r4 @ x7 + x3 + sub r11, r11, r4 @ x7 - x3 + + add r1, r10, r6 @ y0 = (x6 - x2) + (x5 - x1) + sub r2, r11, r5 @ y1 = (x7 - x3) - (x4 - x0) + sub r3, r10, r6 @ y2 = (x6 - x2) - (x5 - x1) + add r4, r11, r5 @ y3 = (x7 - x3) + (x4 - x0) + sub r5, r8, r9 @ y4 = (x6 + x2) - (x4 + x0) + sub r6, r12, r7 @ y5 = (x7 + x3) - (x5 + x1) + add r10, r8, r9 @ y6 = (x6 + x2) + (x4 + x0) + add r11, r12, r7 @ y7 = (x7 + x3) + (x5 + x1) + stmia r0, {r1, r2, r3, r4, r5, r6, r10, r11} + + mov pc, lr + +mdct_butterfly_16: + str lr, [sp, #-4]! + add r1, r0, #8*4 + + ldmia r0, {r2, r3, r4, r5} + ldmia r1, {r6, r7, r8, r9} + add r6, r6, r2 @ y8 = x8 + x0 + rsb r2, r6, r2, asl #1 @ x0 - x8 + add r7, r7, r3 @ y9 = x9 + x1 + rsb r3, r7, r3, asl #1 @ x1 - x9 + add r8, r8, r4 @ y10 = x10 + x2 + sub r11, r8, r4, asl #1 @ x10 - x2 + add r9, r9, r5 @ y11 = x11 + x3 + rsb r10, r9, r5, asl #1 @ x3 - x11 + + stmia r1!, {r6, r7, r8, r9} + + add r2, r2, r3 @ (x0 - x8) + (x1 - x9) + rsb r3, r2, r3, asl #1 @ (x1 - x9) - (x0 - x8) + + ldr r12, =cPI2_8 + smull r8, r5, r2, r12 + mov r5, r5, asl #1 + smull r8, r6, r3, r12 + mov r6, r6, asl #1 + + stmia r0!, {r5, r6, r10, r11} + + ldmia r0, {r2, r3, r4, r5} + ldmia r1, {r6, r7, r8, r9} + add r6, r6, r2 @ y12 = x12 + x4 + sub r2, r6, r2, asl #1 @ x12 - x4 + add r7, r7, r3 @ y13 = x13 + x5 + sub r3, r7, r3, asl #1 @ x13 - x5 + add r8, r8, r4 @ y10 = x14 + x6 + sub r10, r8, r4, asl #1 @ x14 - x6 + add r9, r9, r5 @ y11 = x15 + x7 + sub r11, r9, r5, asl #1 @ x15 - x7 + + stmia r1, {r6, r7, r8, r9} + + sub r2, r2, r3 @ (x12 - x4) - (x13 - x5) + add r3, r2, r3, asl #1 @ (x12 - x4) + (x13 - x5) + + smull r8, r5, r2, r12 + mov r5, r5, asl #1 + smull r8, r6, r3, r12 + mov r6, r6, asl #1 + @ no stmia here, r5, r6, r10, r11 are passed to mdct_butterfly_8 + + sub r0, r0, #4*4 + ldmia r0, {r1, r2, r3, r4} + bl mdct_butterfly_8 + add r0, r0, #8*4 + ldmia r0, {r1, r2, r3, r4, r5, r6, r10, r11} + bl mdct_butterfly_8 + + ldr pc, [sp], #4 + +mdct_butterfly_32: + stmdb sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr} + + add r1, r0, #16*4 + + ldmia r0, {r2, r3, r4, r5} + ldmia r1, {r6, r7, r8, r9} + add r6, r6, r2 @ y16 = x16 + x0 + rsb r2, r6, r2, asl #1 @ x0 - x16 + add r7, r7, r3 @ y17 = x17 + x1 + rsb r3, r7, r3, asl #1 @ x1 - x17 + add r8, r8, r4 @ y18 = x18 + x2 + rsb r4, r8, r4, asl #1 @ x2 - x18 + add r9, r9, r5 @ y19 = x19 + x3 + rsb r5, r9, r5, asl #1 @ x3 - x19 + + stmia r1!, {r6, r7, r8, r9} + + ldr r12, =cPI1_8 + ldr lr, =cPI3_8 + smull r10, r6, r2, r12 + smlal r10, r6, r3, lr + rsb r2, r2, #0 + smull r10, r7, r3, r12 + smlal r10, r7, r2, lr + mov r6, r6, asl #1 + mov r7, r7, asl #1 + + add r4, r4, r5 @ (x3 - x19) + (x2 - x18) + rsb r5, r4, r5, asl #1 @ (x3 - x19) - (x2 - x18) + + ldr r11, =cPI2_8 + smull r10, r8, r4, r11 + mov r8, r8, asl #1 + smull r10, r9, r5, r11 + mov r9, r9, asl #1 + + stmia r0!, {r6, r7, r8, r9} + + ldmia r0, {r2, r3, r4, r5} + ldmia r1, {r6, r7, r8, r9} + add r6, r6, r2 @ y20 = x20 + x4 + rsb r2, r6, r2, asl #1 @ x4 - x20 + add r7, r7, r3 @ y21 = x21 + x5 + rsb r3, r7, r3, asl #1 @ x5 - x21 + add r8, r8, r4 @ y22 = x22 + x6 + sub r4, r8, r4, asl #1 @ x22 - x6 + add r9, r9, r5 @ y23 = x23 + x7 + rsb r5, r9, r5, asl #1 @ x7 - x23 + + stmia r1!, {r6, r7, r8, r9} + + smull r10, r6, r2, lr + smlal r10, r6, r3, r12 + rsb r2, r2, #0 + smull r10, r7, r3, lr + smlal r10, r7, r2, r12 + mov r6, r6, asl #1 + mov r7, r7, asl #1 + + mov r8, r5 + mov r9, r4 + stmia r0!, {r6, r7, r8, r9} + + ldmia r0, {r2, r3, r4, r5} + ldmia r1, {r6, r7, r8, r9} + add r6, r6, r2 @ y24 = x24 + x8 + sub r2, r6, r2, asl #1 @ x24 - x8 + add r7, r7, r3 @ y25 = x25 + x9 + sub r3, r7, r3, asl #1 @ x25 - x9 + add r8, r8, r4 @ y26 = x26 + x10 + sub r4, r8, r4, asl #1 @ x26 - x10 + add r9, r9, r5 @ y27 = x27 + x11 + sub r5, r9, r5, asl #1 @ x27 - x11 + + stmia r1!, {r6, r7, r8, r9} + + smull r10, r7, r2, r12 + smlal r10, r7, r3, lr + rsb r3, r3, #0 + smull r10, r6, r3, r12 + smlal r10, r6, r2, lr + mov r6, r6, asl #1 + mov r7, r7, asl #1 + + sub r4, r4, r5 @ (x26 - x10) - (x27 - x11) + add r5, r4, r5, asl #1 @ (x26 - x10) + (x27 - x11) + + ldr r11, =cPI2_8 + smull r10, r8, r4, r11 + mov r8, r8, asl #1 + smull r10, r9, r5, r11 + mov r9, r9, asl #1 + + stmia r0!, {r6, r7, r8, r9} + + ldmia r0, {r2, r3, r4, r5} + ldmia r1, {r6, r7, r8, r9} + add r6, r6, r2 @ y28 = x28 + x12 + sub r2, r6, r2, asl #1 @ x28 - x12 + add r7, r7, r3 @ y29 = x29 + x13 + sub r3, r7, r3, asl #1 @ x29 - x13 + add r8, r8, r4 @ y30 = x30 + x14 + sub r4, r8, r4, asl #1 @ x30 - x14 + add r9, r9, r5 @ y31 = x31 + x15 + sub r5, r9, r5, asl #1 @ x31 - x15 + + stmia r1, {r6, r7, r8, r9} + + smull r10, r7, r2, lr + smlal r10, r7, r3, r12 + rsb r3, r3, #0 + smull r10, r6, r3, lr + smlal r10, r6, r2, r12 + mov r6, r6, asl #1 + mov r7, r7, asl #1 + + mov r8, r4 + mov r9, r5 + stmia r0, {r6, r7, r8, r9} + + sub r0, r0, #12*4 + str r0, [sp, #-4]! + bl mdct_butterfly_16 + + ldr r0, [sp], #4 + add r0, r0, #16*4 + bl mdct_butterfly_16 + + ldmia sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc} + + @ mdct_butterfly_generic_loop(x1, x2, T0, step, Ttop) +mdct_butterfly_generic_loop: + stmdb sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr} + str r2, [sp, #-4] + ldr r4, [sp, #40] +1: + ldmdb r0, {r6, r7, r8, r9} + ldmdb r1, {r10, r11, r12, r14} + + add r6, r6, r10 + sub r10, r6, r10, asl #1 + add r7, r7, r11 + rsb r11, r7, r11, asl #1 + add r8, r8, r12 + sub r12, r8, r12, asl #1 + add r9, r9, r14 + rsb r14, r9, r14, asl #1 + + stmdb r0!, {r6, r7, r8, r9} + + ldmia r2, {r6, r7} + smull r5, r8, r14, r6 + smlal r5, r8, r12, r7 + rsb r14, r14, #0 + smull r5, r9, r12, r6 + smlal r5, r9, r14, r7 + + mov r8, r8, asl #1 + mov r9, r9, asl #1 + stmdb r1!, {r8, r9} + add r2, r2, r3, asl #2 + + ldmia r2, {r6, r7} + smull r5, r8, r11, r6 + smlal r5, r8, r10, r7 + rsb r11, r11, #0 + smull r5, r9, r10, r6 + smlal r5, r9, r11, r7 + + mov r8, r8, asl #1 + mov r9, r9, asl #1 + stmdb r1!, {r8, r9} + add r2, r2, r3, asl #2 + + cmp r2, r4 + blo 1b + + ldr r4, [sp, #-4] +1: + ldmdb r0, {r6, r7, r8, r9} + ldmdb r1, {r10, r11, r12, r14} + + add r6, r6, r10 + sub r10, r6, r10, asl #1 + add r7, r7, r11 + sub r11, r7, r11, asl #1 + add r8, r8, r12 + sub r12, r8, r12, asl #1 + add r9, r9, r14 + sub r14, r9, r14, asl #1 + + stmdb r0!, {r6, r7, r8, r9} + + ldmia r2, {r6, r7} + smull r5, r9, r14, r6 + smlal r5, r9, r12, r7 + rsb r14, r14, #0 + smull r5, r8, r12, r6 + smlal r5, r8, r14, r7 + + mov r8, r8, asl #1 + mov r9, r9, asl #1 + stmdb r1!, {r8, r9} + sub r2, r2, r3, asl #2 + + ldmia r2, {r6, r7} + smull r5, r9, r11, r6 + smlal r5, r9, r10, r7 + rsb r11, r11, #0 + smull r5, r8, r10, r6 + smlal r5, r8, r11, r7 + + mov r8, r8, asl #1 + mov r9, r9, asl #1 + stmdb r1!, {r8, r9} + sub r2, r2, r3, asl #2 + + cmp r2, r4 + bhi 1b + + ldr r4, [sp, #40] +1: + ldmdb r0, {r6, r7, r8, r9} + ldmdb r1, {r10, r11, r12, r14} + + add r6, r6, r10 + rsb r10, r6, r10, asl #1 + add r7, r7, r11 + rsb r11, r7, r11, asl #1 + add r8, r8, r12 + rsb r12, r8, r12, asl #1 + add r9, r9, r14 + rsb r14, r9, r14, asl #1 + + stmdb r0!, {r6, r7, r8, r9} + + ldmia r2, {r6, r7} + smull r5, r8, r12, r6 + smlal r5, r8, r14, r7 + rsb r12, r12, #0 + smull r5, r9, r14, r6 + smlal r5, r9, r12, r7 + + mov r8, r8, asl #1 + mov r9, r9, asl #1 + stmdb r1!, {r8, r9} + add r2, r2, r3, asl #2 + + ldmia r2, {r6, r7} + smull r5, r8, r10, r6 + smlal r5, r8, r11, r7 + rsb r10, r10, #0 + smull r5, r9, r11, r6 + smlal r5, r9, r10, r7 + + mov r8, r8, asl #1 + mov r9, r9, asl #1 + stmdb r1!, {r8, r9} + add r2, r2, r3, asl #2 + + cmp r2, r4 + blo 1b + + ldr r4, [sp, #-4] +1: + ldmdb r0, {r6, r7, r8, r9} + ldmdb r1, {r10, r11, r12, r14} + + add r6, r6, r10 + sub r10, r6, r10, asl #1 + add r7, r7, r11 + rsb r11, r7, r11, asl #1 + add r8, r8, r12 + sub r12, r8, r12, asl #1 + add r9, r9, r14 + rsb r14, r9, r14, asl #1 + + stmdb r0!, {r6, r7, r8, r9} + + ldmia r2, {r6, r7} + smull r5, r9, r12, r6 + smlal r5, r9, r14, r7 + rsb r12, r12, #0 + smull r5, r8, r14, r6 + smlal r5, r8, r12, r7 + + mov r8, r8, asl #1 + mov r9, r9, asl #1 + stmdb r1!, {r8, r9} + sub r2, r2, r3, asl #2 + + ldmia r2, {r6, r7} + smull r5, r9, r10, r6 + smlal r5, r9, r11, r7 + rsb r10, r10, #0 + smull r5, r8, r11, r6 + smlal r5, r8, r10, r7 + + mov r8, r8, asl #1 + mov r9, r9, asl #1 + stmdb r1!, {r8, r9} + sub r2, r2, r3, asl #2 + + cmp r2, r4 + bhi 1b + + ldmia sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc} + -- cgit v1.2.3