From 67fb5415f78a3198030a6285d1ccc641044f149b Mon Sep 17 00:00:00 2001 From: Dave Hooper Date: Sat, 25 Apr 2009 11:25:13 +0000 Subject: Commit FS#9882 - make better use of iram at different quality encodings, remove redundant memsets, implement doublebuffer if it will fit in iram to save a mempcy each frame, and some alignment fixes for coldfire git-svn-id: svn://svn.rockbox.org/rockbox/trunk@20783 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/lib/asm_mcf5249.h | 8 +- apps/codecs/libtremor/asm_arm.h | 134 +++++++++++--------- apps/codecs/libtremor/asm_mcf5249.h | 21 ++-- apps/codecs/libtremor/block.c | 222 ++++++++++++++++++++++------------ apps/codecs/libtremor/config-tremor.h | 29 ++++- apps/codecs/libtremor/ivorbiscodec.h | 4 + apps/codecs/libtremor/mapping0.c | 2 - apps/codecs/libtremor/misc.h | 20 +-- apps/codecs/libtremor/oggmalloc.c | 24 ++++ apps/codecs/libtremor/os_types.h | 4 + apps/codecs/libtremor/res012.c | 2 - apps/codecs/libtremor/synthesis.c | 36 +++--- apps/codecs/libtremor/window.c | 12 +- apps/codecs/libtremor/window_lookup.h | 4 +- 14 files changed, 335 insertions(+), 187 deletions(-) diff --git a/apps/codecs/lib/asm_mcf5249.h b/apps/codecs/lib/asm_mcf5249.h index 20899f0a5b..e3dc8dd684 100644 --- a/apps/codecs/lib/asm_mcf5249.h +++ b/apps/codecs/lib/asm_mcf5249.h @@ -143,7 +143,7 @@ static inline void vect_add(int32_t *x, int32_t *y, int n) { /* align to 16 bytes */ - while(n>0 && (int)x&16) { + while(n>0 && (int)x&15) { *x++ += *y++; n--; } @@ -177,7 +177,7 @@ static inline void vect_copy(int32_t *x, int32_t *y, int n) { /* align to 16 bytes */ - while(n>0 && (int)x&16) { + while(n>0 && (int)x&15) { *x++ = *y++; n--; } @@ -204,7 +204,7 @@ static inline void vect_mult_fw(int32_t *data, int32_t *window, int n) { /* ensure data is aligned to 16-bytes */ - while(n>0 && (int)data%16) { + while(n>0 && (int)data&15) { *data = MULT31(*data, *window); data++; window++; @@ -258,7 +258,7 @@ static inline void vect_mult_bw(int32_t *data, int32_t *window, int n) { /* ensure at least data is aligned to 16-bytes */ - while(n>0 && (int)data%16) { + while(n>0 && (int)data&15) { *data = MULT31(*data, *window); data++; window--; diff --git a/apps/codecs/libtremor/asm_arm.h b/apps/codecs/libtremor/asm_arm.h index bc09ac5170..5a8109841f 100644 --- a/apps/codecs/libtremor/asm_arm.h +++ b/apps/codecs/libtremor/asm_arm.h @@ -99,104 +99,120 @@ static inline void XNPROD31(ogg_int32_t a, ogg_int32_t b, #define _V_VECT_OPS /* asm versions of vector operations for block.c, window.c */ +/* SOME IMPORTANT NOTES: this implementation of vect_mult_bw does + NOT do a final shift, meaning that the result of vect_mult_bw is + only 31 bits not 32. This is so that we can do the shift in-place + in vect_add_xxxx instead to save one instruction for each mult on arm */ static inline -void vect_add(ogg_int32_t *x, ogg_int32_t *y, int n) +void vect_add_right_left(ogg_int32_t *x, const ogg_int32_t *y, int n) { - while (n>=4) { - asm volatile ("ldmia %[x], {r0, r1, r2, r3};" + /* first arg is right subframe of previous frame and second arg + is left subframe of current frame. overlap left onto right overwriting + the right subframe */ + + do{ + asm volatile ( + "ldmia %[x], {r0, r1, r2, r3};" "ldmia %[y]!, {r4, r5, r6, r7};" - "add r0, r0, r4;" - "add r1, r1, r5;" - "add r2, r2, r6;" - "add r3, r3, r7;" + "add r0, r4, r0, lsl #1;" + "add r1, r5, r1, lsl #1;" + "add r2, r6, r2, lsl #1;" + "add r3, r7, r3, lsl #1;" + "stmia %[x]!, {r0, r1, r2, r3};" + "ldmia %[x], {r0, r1, r2, r3};" + "ldmia %[y]!, {r4, r5, r6, r7};" + "add r0, r4, r0, lsl #1;" + "add r1, r5, r1, lsl #1;" + "add r2, r6, r2, lsl #1;" + "add r3, r7, r3, lsl #1;" "stmia %[x]!, {r0, r1, r2, r3};" : [x] "+r" (x), [y] "+r" (y) : : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "memory"); - n -= 4; - } - /* add final elements */ - while (n>0) { - *x++ += *y++; - n--; - } + n -= 8; + } while (n); } static inline -void vect_copy(ogg_int32_t *x, ogg_int32_t *y, int n) +void vect_add_left_right(ogg_int32_t *x, const ogg_int32_t *y, int n) { - while (n>=4) { - asm volatile ("ldmia %[y]!, {r0, r1, r2, r3};" + /* first arg is left subframe of current frame and second arg + is right subframe of previous frame. overlap right onto left overwriting + the LEFT subframe */ + do{ + asm volatile ( + "ldmia %[x], {r0, r1, r2, r3};" + "ldmia %[y]!, {r4, r5, r6, r7};" + "add r0, r0, r4, lsl #1;" + "add r1, r1, r5, lsl #1;" + "add r2, r2, r6, lsl #1;" + "add r3, r3, r7, lsl #1;" + "stmia %[x]!, {r0, r1, r2, r3};" + "ldmia %[x], {r0, r1, r2, r3};" + "ldmia %[y]!, {r4, r5, r6, r7};" + "add r0, r0, r4, lsl #1;" + "add r1, r1, r5, lsl #1;" + "add r2, r2, r6, lsl #1;" + "add r3, r3, r7, lsl #1;" "stmia %[x]!, {r0, r1, r2, r3};" : [x] "+r" (x), [y] "+r" (y) : : "r0", "r1", "r2", "r3", + "r4", "r5", "r6", "r7", "memory"); - n -= 4; - } - /* copy final elements */ - while (n>0) { - *x++ = *y++; - n--; - } + n -= 8; + } while (n); } static inline void vect_mult_fw(ogg_int32_t *data, LOOKUP_T *window, int n) { - while (n>=4) { - asm volatile ("ldmia %[d], {r0, r1, r2, r3};" + /* Note, mult_fw uses MULT31 */ + do{ + asm volatile ( + "ldmia %[d], {r0, r1, r2, r3};" "ldmia %[w]!, {r4, r5, r6, r7};" - "smull r8, r9, r0, r4;" - "mov r0, r9, lsl #1;" - "smull r8, r9, r1, r5;" - "mov r1, r9, lsl #1;" - "smull r8, r9, r2, r6;" - "mov r2, r9, lsl #1;" - "smull r8, r9, r3, r7;" - "mov r3, r9, lsl #1;" + "smull r8, r0, r4, r0;" + "mov r0, r0, lsl #1;" + "smull r8, r1, r5, r1;" + "mov r1, r1, lsl #1;" + "smull r8, r2, r6, r2;" + "mov r2, r2, lsl #1;" + "smull r8, r3, r7, r3;" + "mov r3, r3, lsl #1;" "stmia %[d]!, {r0, r1, r2, r3};" : [d] "+r" (data), [w] "+r" (window) : : "r0", "r1", "r2", "r3", - "r4", "r5", "r6", "r7", "r8", "r9", + "r4", "r5", "r6", "r7", "r8", "memory", "cc"); n -= 4; - } - while(n>0) { - *data = MULT31(*data, *window); - data++; - window++; - n--; - } + } while (n); } static inline void vect_mult_bw(ogg_int32_t *data, LOOKUP_T *window, int n) { - while (n>=4) { + /* NOTE mult_bw uses MULT_32 i.e. doesn't shift result left at end */ + /* On ARM, we can do the shift at the same time as the overlap-add */ + do{ asm volatile ("ldmia %[d], {r0, r1, r2, r3};" "ldmda %[w]!, {r4, r5, r6, r7};" - "smull r8, r9, r0, r7;" - "mov r0, r9, lsl #1;" - "smull r8, r9, r1, r6;" - "mov r1, r9, lsl #1;" - "smull r8, r9, r2, r5;" - "mov r2, r9, lsl #1;" - "smull r8, r9, r3, r4;" - "mov r3, r9, lsl #1;" + "smull r8, r0, r7, r0;" + "smull r7, r1, r6, r1;" + "smull r6, r2, r5, r2;" + "smull r5, r3, r4, r3;" "stmia %[d]!, {r0, r1, r2, r3};" : [d] "+r" (data), [w] "+r" (window) : : "r0", "r1", "r2", "r3", - "r4", "r5", "r6", "r7", "r8", "r9", + "r4", "r5", "r6", "r7", "r8", "memory", "cc"); n -= 4; - } - while(n>0) { - *data = MULT31(*data, *window); - data++; - window--; - n--; - } + } while (n); +} + +static inline void vect_copy(ogg_int32_t *x, const ogg_int32_t *y, int n) +{ + memcpy(x,y,n*sizeof(ogg_int32_t)); } #endif diff --git a/apps/codecs/libtremor/asm_mcf5249.h b/apps/codecs/libtremor/asm_mcf5249.h index 64dfb1b785..224a861afd 100644 --- a/apps/codecs/libtremor/asm_mcf5249.h +++ b/apps/codecs/libtremor/asm_mcf5249.h @@ -140,10 +140,10 @@ void XNPROD31(ogg_int32_t a, ogg_int32_t b, /* asm versions of vector operations for block.c, window.c */ /* assumes MAC is initialized & accumulators cleared */ static inline -void vect_add(ogg_int32_t *x, ogg_int32_t *y, int n) +void vect_add_right_left(ogg_int32_t *x, const ogg_int32_t *y, int n) { /* align to 16 bytes */ - while(n>0 && (int)x&16) { + while(n>0 && (int)x&15) { *x++ += *y++; n--; } @@ -172,12 +172,20 @@ void vect_add(ogg_int32_t *x, ogg_int32_t *y, int n) n--; } } +static inline +void vect_add_left_right(ogg_int32_t *x, const ogg_int32_t *y, int n) +{ + /* coldfire asm has symmetrical versions of vect_add_right_left + and vect_add_left_right (since symmetrical versions of + vect_mult_fw and vect_mult_bw i.e. both use MULT31) */ + vect_add_right_left(x, y, n ); +} static inline -void vect_copy(ogg_int32_t *x, ogg_int32_t *y, int n) +void vect_copy(ogg_int32_t *x, const ogg_int32_t *y, int n) { /* align to 16 bytes */ - while(n>0 && (int)x&16) { + while(n>0 && (int)x&15) { *x++ = *y++; n--; } @@ -199,12 +207,11 @@ void vect_copy(ogg_int32_t *x, ogg_int32_t *y, int n) } } - static inline void vect_mult_fw(ogg_int32_t *data, LOOKUP_T *window, int n) { /* ensure data is aligned to 16-bytes */ - while(n>0 && (int)data%16) { + while(n>0 && (int)data&15) { *data = MULT31(*data, *window); data++; window++; @@ -258,7 +265,7 @@ static inline void vect_mult_bw(ogg_int32_t *data, LOOKUP_T *window, int n) { /* ensure at least data is aligned to 16-bytes */ - while(n>0 && (int)data%16) { + while(n>0 && (int)data&15) { *data = MULT31(*data, *window); data++; window--; diff --git a/apps/codecs/libtremor/block.c b/apps/codecs/libtremor/block.c index e609fc44f7..eb087e12a9 100644 --- a/apps/codecs/libtremor/block.c +++ b/apps/codecs/libtremor/block.c @@ -36,6 +36,13 @@ static int ilog(unsigned int v){ return(ret); } +static ogg_int32_t* _pcmp [CHANNELS] IBSS_ATTR; +static ogg_int32_t* _pcmbp[CHANNELS] IBSS_ATTR; +static ogg_int32_t* _pcmret[CHANNELS] IBSS_ATTR; +/* if true, we have both pcm buffers in iram and we use a bufferflip. + if false, we have one in iram and one in mem, and we use a memcpy */ +static bool iram_pcm_doublebuffer IBSS_ATTR; + /* pcm accumulator examples (not exhaustive): <-------------- lW ----------------> @@ -145,18 +152,44 @@ int vorbis_block_clear(vorbis_block *vb){ static int _vds_init(vorbis_dsp_state *v,vorbis_info *vi){ int i; + long b_size[2]; + LOOKUP_TNC *iramposw; + ogg_int32_t *internal_pcm=NULL; + codec_setup_info *ci=(codec_setup_info *)vi->codec_setup; private_state *b=NULL; memset(v,0,sizeof(*v)); + v->reset_pcmb=true; b=(private_state *)(v->backend_state=_ogg_calloc(1,sizeof(*b))); v->vi=vi; b->modebits=ilog(ci->modes); - + + /* allocate IRAM buffer for the PCM data generated by synthesis */ + iram_malloc_init(); + v->iram_pcm=(ogg_int32_t *)iram_malloc(vi->channels*ci->blocksizes[1]*sizeof(ogg_int32_t)); + if(v->iram_pcm != NULL) v->iram_pcm_storage=ci->blocksizes[1]; + else v->iram_pcm_storage=0; + + v->centerW=0; + /* Vorbis I uses only window type 0 */ - b->window[0]=_vorbis_window(0,ci->blocksizes[0]/2); - b->window[1]=_vorbis_window(0,ci->blocksizes[1]/2); + b_size[0]=ci->blocksizes[0]/2; + b_size[1]=ci->blocksizes[1]/2; + b->window[0]=_vorbis_window(0,b_size[0]); + b->window[1]=_vorbis_window(0,b_size[1]); + + /* allocate IRAM buffer for window tables too, if sufficient iram available */ + /* give preference to the larger window over the smaller window + (on the assumption that both windows are equally likely used) */ + for(i=1; i>=0; i--){ + iramposw=(LOOKUP_TNC *)iram_malloc(b_size[i]*sizeof(LOOKUP_TNC)); + if(iramposw!=NULL) { + memcpy(iramposw, b->window[i], b_size[i]*sizeof(LOOKUP_TNC)); + b->window[i]=iramposw; + } + } /* finish the codebooks */ if(!ci->fullbooks){ @@ -169,14 +202,34 @@ static int _vds_init(vorbis_dsp_state *v,vorbis_info *vi){ } } + /* if we can get away with it, put a double buffer into IRAM too, so that + overlap-add runs iram-to-iram and we avoid needing to memcpy */ v->pcm_storage=ci->blocksizes[1]; - v->pcm=(ogg_int32_t **)_ogg_malloc(vi->channels*sizeof(*v->pcm)); - v->pcmb=(ogg_int32_t **)_ogg_malloc(vi->channels*sizeof(*v->pcmb)); - v->pcmret=(ogg_int32_t **)_ogg_malloc(vi->channels*sizeof(*v->pcmret)); - - for(i=0;ichannels;i++) - v->pcm[i]=(ogg_int32_t *)_ogg_calloc(v->pcm_storage,sizeof(*v->pcm[i])); - + v->pcm=_pcmp; + v->pcmret=_pcmret; + v->pcmb=_pcmbp; + + _pcmp[0]=NULL; + _pcmp[1]=NULL; + _pcmbp[0]=NULL; + _pcmbp[1]=NULL; + + if(NULL != (internal_pcm = iram_malloc(vi->channels*v->pcm_storage*sizeof(ogg_int32_t)))) + { + /* one-time initialisation at codec start or on switch from + blocksizes greater than IRAM_PCM_END to sizes that fit */ + for(i=0;ichannels;i++) + v->pcm[i]=&internal_pcm[i*v->pcm_storage]; + iram_pcm_doublebuffer = true; + } + else + { + /* one-time initialisation at codec start or on switch from + blocksizes that fit in IRAM_PCM_END to those that don't */ + for(i=0;ichannels;i++) + v->pcm[i]=(ogg_int32_t *)_ogg_calloc(v->pcm_storage,sizeof(*v->pcm[i])); + iram_pcm_doublebuffer = false; + } /* all 1 (large block) or 0 (small block) */ /* explicitly set for the sake of clarity */ @@ -203,13 +256,17 @@ int vorbis_synthesis_restart(vorbis_dsp_state *v){ ci=vi->codec_setup; if(!ci)return -1; - v->centerW=ci->blocksizes[1]/2; - v->pcm_current=v->centerW; + v->centerW=0; + v->pcm_current=0; v->pcm_returned=-1; v->granulepos=-1; v->sequence=-1; ((private_state *)(v->backend_state))->sample_count=-1; + + /* indicate to synthesis code that buffer pointers no longer valid + (if we're using double pcm buffer) and will need to reset them */ + v->reset_pcmb = true; return(0); } @@ -228,11 +285,12 @@ void vorbis_dsp_clear(vorbis_dsp_state *v){ codec_setup_info *ci=(codec_setup_info *)(vi?vi->codec_setup:NULL); private_state *b=(private_state *)v->backend_state; - if(v->pcm){ - for(i=0;ichannels;i++) - if(v->pcm[i])_ogg_free(v->pcm[i]); - _ogg_free(v->pcm); - if(v->pcmret)_ogg_free(v->pcmret); + if(!iram_pcm_doublebuffer) + { + if(v->pcm){ + for(i=0;ichannels;i++) + if(v->pcm[i])_ogg_free(v->pcm[i]); + } } /* free mode lookups; these are actually vorbis_look_mapping structs */ @@ -258,7 +316,7 @@ void vorbis_dsp_clear(vorbis_dsp_state *v){ calling (as it relies on the previous block). */ int vorbis_synthesis_blockin(vorbis_dsp_state *v,vorbis_block *vb) - ICODE_ATTR_TREMOR_NOT_MDCT; + ICODE_ATTR; int vorbis_synthesis_blockin(vorbis_dsp_state *v,vorbis_block *vb){ vorbis_info *vi=v->vi; codec_setup_info *ci=(codec_setup_info *)vi->codec_setup; @@ -278,85 +336,91 @@ int vorbis_synthesis_blockin(vorbis_dsp_state *v,vorbis_block *vb){ } v->sequence=vb->sequence; + int n=ci->blocksizes[v->W]/2; + int ln=ci->blocksizes[v->lW]/2; - if(vb->pcm){ /* no pcm to process if vorbis_synthesis_trackonly - was called on block */ - int n=ci->blocksizes[v->W]/2; + if(LIKELY(vb->pcm)){ /* no pcm to process if vorbis_synthesis_trackonly + was called on block */ + int prevCenter; int n0=ci->blocksizes[0]/2; int n1=ci->blocksizes[1]/2; - - int thisCenter; - int prevCenter; - - if(v->centerW){ - thisCenter=n1; - prevCenter=0; - }else{ - thisCenter=0; - prevCenter=n1; + + if(iram_pcm_doublebuffer) + { + prevCenter = ln; + } + else + { + prevCenter = v->centerW; + v->centerW = n1 - v->centerW; } - - /* v->pcm is now used like a two-stage double buffer. We don't want - to have to constantly shift *or* adjust memory usage. Don't - accept a new block until the old is shifted out */ /* overlap/add PCM */ - - for(j=0;jchannels;j++){ - /* the overlap/add section */ - if(v->lW){ - if(v->W){ - /* large/large */ - ogg_int32_t *pcm=v->pcm[j]+prevCenter; - ogg_int32_t *p=vb->pcm[j]; - vect_add(p, pcm, n1); - v->pcmb[j]=p; - }else{ - /* large/small */ - ogg_int32_t *pcm=v->pcm[j]+prevCenter+n1/2-n0/2; - ogg_int32_t *p=vb->pcm[j]; - vect_add(pcm, p, n0); - v->pcmb[j]=v->pcm[j]+prevCenter; + /* nb nothing to overlap with on first block so don't bother */ + if(LIKELY(v->pcm_returned!=-1)) + { + for(j=0;jchannels;j++) + { + ogg_int32_t *pcm=v->pcm[j]+prevCenter; + ogg_int32_t *p=vb->pcm[j]; + + /* the overlap/add section */ + if(v->lW == v->W) + { + /* large/large or small/small */ + vect_add_right_left(pcm,p,n); + v->pcmb[j]=pcm; } - }else{ - if(v->W){ - /* small/large */ - ogg_int32_t *pcm=v->pcm[j]+prevCenter; - ogg_int32_t *p=vb->pcm[j]+n1/2-n0/2; - vect_add(p, pcm, n0); - v->pcmb[j]=p; - }else{ - /* small/small */ - ogg_int32_t *pcm=v->pcm[j]+prevCenter; - ogg_int32_t *p=vb->pcm[j]; - vect_add(p, pcm, n0); - v->pcmb[j]=p; + else if (!v->W) + { + /* large/small */ + vect_add_right_left(pcm + (n1-n0)/2, p, n0); + v->pcmb[j]=pcm; + } + else + { + /* small/large */ + p += (n1-n0)/2; + vect_add_left_right(p,pcm,n0); + v->pcmb[j]=p; } } - - /* the copy section */ + } + + /* the copy section */ + if(iram_pcm_doublebuffer) + { + /* just flip the pointers over as we have a double buffer in iram */ + ogg_int32_t *p; + p=v->pcm[0]; + v->pcm[0]=vb->pcm[0]; + vb->pcm[0] = p; + p=v->pcm[1]; + v->pcm[1]=vb->pcm[1]; + vb->pcm[1] = p; + } + else + { + for(j=0;jchannels;j++) { - ogg_int32_t *pcm=v->pcm[j]+thisCenter; - ogg_int32_t *p=vb->pcm[j]+n; - vect_copy(pcm, p, n); + /* at best only vb->pcm is in iram, and that's where we do the + synthesis, so we copy out the right-hand subframe of last + synthesis into (noniram) local buffer so we can still do + synth in iram */ + vect_copy(v->pcm[j]+v->centerW, vb->pcm[j]+n, n); } } - if(v->centerW) - v->centerW=0; - else - v->centerW=n1; - /* deal with initial packet state; we do this using the explicit pcm_returned==-1 flag otherwise we're sensitive to first block being short or long */ if(v->pcm_returned==-1){ - v->pcm_returned=thisCenter; - v->pcm_current=thisCenter; + v->pcm_returned=0; + v->pcm_current=0; }else{ v->pcm_returned=0; - v->pcm_current=ci->blocksizes[v->lW]/4+ci->blocksizes[v->W]/4; + v->pcm_current=(n+ln)/2; } } @@ -375,7 +439,7 @@ int vorbis_synthesis_blockin(vorbis_dsp_state *v,vorbis_block *vb){ if(b->sample_count==-1){ b->sample_count=0; }else{ - b->sample_count+=ci->blocksizes[v->lW]/4+ci->blocksizes[v->W]/4; + b->sample_count+=(n+ln)/2; } if(v->granulepos==-1){ @@ -406,7 +470,7 @@ int vorbis_synthesis_blockin(vorbis_dsp_state *v,vorbis_block *vb){ } }else{ - v->granulepos+=ci->blocksizes[v->lW]/4+ci->blocksizes[v->W]/4; + v->granulepos+=(n+ln)/2; if(vb->granulepos!=-1 && v->granulepos!=vb->granulepos){ if(v->granulepos>vb->granulepos){ diff --git a/apps/codecs/libtremor/config-tremor.h b/apps/codecs/libtremor/config-tremor.h index 7cfcb7e35d..d360b5fd73 100644 --- a/apps/codecs/libtremor/config-tremor.h +++ b/apps/codecs/libtremor/config-tremor.h @@ -13,10 +13,6 @@ #define BIG_ENDIAN 0 #endif -#ifndef ICONST_ATTR_TREMOR_WINDOW -#define ICONST_ATTR_TREMOR_WINDOW ICONST_ATTR -#endif - #ifndef ICODE_ATTR_TREMOR_MDCT #define ICODE_ATTR_TREMOR_MDCT ICODE_ATTR #endif @@ -25,4 +21,29 @@ #define ICODE_ATTR_TREMOR_NOT_MDCT ICODE_ATTR #endif +/* Define CPU of large IRAM (MCF5250) */ +#if (CONFIG_CPU == MCF5250) +/* PCM_BUFFER : 32768 Byte (4096*2*4) * + * WINDOW_LOOKUP : 4608 Byte (128*4 + 1024*4) * + * TOTAL : 37376 */ +#define IRAM_IBSS_SIZE 37376 + +/* Define CPU of large IRAM (PP5022/5024) */ +#elif (CONFIG_CPU == PP5022) || (CONFIG_CPU == PP5024) +/* PCM_BUFFER : 32768 byte (4096*2*4 or 2048*4*4) * + * WINDOW_LOOKUP : 9216 Byte (256*4 + 2048*4) * + * TOTAL : 41984 */ +#define IRAM_IBSS_SIZE 41984 + +/* Define CPU of Normal IRAM (96KB) (and SIM also) */ +#else +/* PCM_BUFFER : 16384 Byte (2048*2*4) * + * WINDOW_LOOKUP : 4608 Byte (128*4 + 1024*4) * + * TOTAL : 20992 */ +#define IRAM_IBSS_SIZE 20992 +#endif + +/* max 2 channels */ +#define CHANNELS 2 + // #define _LOW_ACCURACY_ diff --git a/apps/codecs/libtremor/ivorbiscodec.h b/apps/codecs/libtremor/ivorbiscodec.h index 2574a11f2a..a9526d56a6 100644 --- a/apps/codecs/libtremor/ivorbiscodec.h +++ b/apps/codecs/libtremor/ivorbiscodec.h @@ -76,6 +76,10 @@ typedef struct vorbis_dsp_state{ ogg_int64_t sequence; void *backend_state; + + ogg_int32_t *iram_pcm; /* IRAM PCM buffer */ + int iram_pcm_storage; /* size of IRAM PCM buffer */ + bool reset_pcmb; } vorbis_dsp_state; typedef struct vorbis_block{ diff --git a/apps/codecs/libtremor/mapping0.c b/apps/codecs/libtremor/mapping0.c index 2bb7a46d79..8b2343c56f 100644 --- a/apps/codecs/libtremor/mapping0.c +++ b/apps/codecs/libtremor/mapping0.c @@ -182,8 +182,6 @@ static vorbis_info_mapping *mapping0_unpack(vorbis_info *vi,oggpack_buffer *opb) static int seq = 0; -#define CHANNELS 2 /* max 2 channels on the ihp-1xx (stereo) */ - static int mapping0_inverse(vorbis_block *vb,vorbis_look_mapping *l){ vorbis_dsp_state *vd=vb->vd; vorbis_info *vi=vd->vi; diff --git a/apps/codecs/libtremor/misc.h b/apps/codecs/libtremor/misc.h index e94236c2a8..59ce6dbb74 100644 --- a/apps/codecs/libtremor/misc.h +++ b/apps/codecs/libtremor/misc.h @@ -155,8 +155,11 @@ static inline void XNPROD31(ogg_int32_t a, ogg_int32_t b, #ifndef _V_VECT_OPS #define _V_VECT_OPS +/* generic misc.h has symmetrical versions of vect_add_right_left + and vect_add_left_right (since symmetrical versions of + vect_mult_fw and vect_mult_bw i.e. both use MULT31) */ static inline -void vect_add(ogg_int32_t *x, ogg_int32_t *y, int n) +void vect_add_right_left(ogg_int32_t *x, const ogg_int32_t *y, int n) { while (n>0) { *x++ += *y++; @@ -164,13 +167,10 @@ void vect_add(ogg_int32_t *x, ogg_int32_t *y, int n) } } -static inline -void vect_copy(ogg_int32_t *x, ogg_int32_t *y, int n) +static inline +void vect_add_left_right(ogg_int32_t *x, const ogg_int32_t *y, int n) { - while (n>0) { - *x++ = *y++; - n--; - } + vect_add_right_left(x,y,n); } static inline @@ -194,6 +194,12 @@ void vect_mult_bw(ogg_int32_t *data, LOOKUP_T *window, int n) n--; } } + +/* generic memcpy is probably optimal */ +static inline void vect_copy(ogg_int32_t *x, const ogg_int32_t *y, int n) +{ + memcpy(x,y,n*sizeof(ogg_int32_t)); +} #endif #endif diff --git a/apps/codecs/libtremor/oggmalloc.c b/apps/codecs/libtremor/oggmalloc.c index 4aa2760629..6da7cfcedc 100644 --- a/apps/codecs/libtremor/oggmalloc.c +++ b/apps/codecs/libtremor/oggmalloc.c @@ -81,3 +81,27 @@ void ogg_tmpmalloc_free(long pos) { tmp_ptr = pos; } + +/* Allocate IRAM buffer */ +static unsigned char iram_buff[IRAM_IBSS_SIZE] IBSS_ATTR __attribute__ ((aligned (16))); +static size_t iram_remain; + +void iram_malloc_init(void){ + iram_remain=IRAM_IBSS_SIZE; +} + +void *iram_malloc(size_t size){ + void* x; + + /* always ensure 16-byte aligned */ + if(size&0x0f) + size=(size-(size&0x0f))+16; + + if(size>iram_remain) + return NULL; + + x = &iram_buff[IRAM_IBSS_SIZE-iram_remain]; + iram_remain-=size; + + return x; +} diff --git a/apps/codecs/libtremor/os_types.h b/apps/codecs/libtremor/os_types.h index 5738ef4911..4c7d17ef3a 100644 --- a/apps/codecs/libtremor/os_types.h +++ b/apps/codecs/libtremor/os_types.h @@ -25,9 +25,11 @@ #ifdef _LOW_ACCURACY_ # define X(n) (((((n)>>22)+1)>>1) - ((((n)>>22)+1)>>9)) # define LOOKUP_T const unsigned char +# define LOOKUP_TNC unsigned char #else # define X(n) (n) # define LOOKUP_T const ogg_int32_t +# define LOOKUP_TNC ogg_int32_t #endif /* make it easy on the folks that want to compile the libs with a @@ -46,6 +48,8 @@ void *ogg_tmpcalloc(size_t nmemb, size_t size); void *ogg_realloc(void *ptr, size_t size); long ogg_tmpmalloc_pos(void); void ogg_tmpmalloc_free(long pos); +void iram_malloc_init(void); +void *iram_malloc(size_t size); typedef short ogg_int16_t; typedef int ogg_int32_t; diff --git a/apps/codecs/libtremor/res012.c b/apps/codecs/libtremor/res012.c index 46b782def1..a42660a065 100644 --- a/apps/codecs/libtremor/res012.c +++ b/apps/codecs/libtremor/res012.c @@ -172,8 +172,6 @@ static vorbis_look_residue *res0_look(vorbis_dsp_state *vd,vorbis_info_mode *vm, return(look); } -#define CHANNELS 2 - /* a truncated packet here just means 'stop working'; it's not an error */ static int _01inverse(vorbis_block *vb,vorbis_look_residue *vl, ogg_int32_t **in,int ch, diff --git a/apps/codecs/libtremor/synthesis.c b/apps/codecs/libtremor/synthesis.c index cef240e796..b1c5eeccef 100644 --- a/apps/codecs/libtremor/synthesis.c +++ b/apps/codecs/libtremor/synthesis.c @@ -25,15 +25,7 @@ #include "os.h" -/* IRAM buffer keep the block pcm data; only for windows size upto 2048 - for space restrictions. - libVorbis 1.1 Oggenc doesn't use larger windows anyway. */ -/* max 2 channels on the ihp-1xx (stereo), 2048 samples (2*2048*4=16Kb) */ -#define IRAM_PCM_END 2048 -#define CHANNELS 2 - static ogg_int32_t *ipcm_vect[CHANNELS] IBSS_ATTR; -static ogg_int32_t ipcm_buff[CHANNELS*IRAM_PCM_END] IBSS_ATTR LINE_ATTR; int vorbis_synthesis(vorbis_block *vb,ogg_packet *op,int decodep) ICODE_ATTR_TREMOR_NOT_MDCT; @@ -76,23 +68,33 @@ int vorbis_synthesis(vorbis_block *vb,ogg_packet *op,int decodep){ vb->eofflag=op->e_o_s; if(decodep && vi->channels<=CHANNELS){ + vb->pcm = ipcm_vect; + /* alloc pcm passback storage */ vb->pcmend=ci->blocksizes[vb->W]; - if (vb->pcmend<=IRAM_PCM_END) { + if (vd->iram_pcm_storage >= vb->pcmend) { /* use statically allocated iram buffer */ - vb->pcm = ipcm_vect; - for(i=0; ipcm[i] = &ipcm_buff[i*IRAM_PCM_END]; + if(vd->reset_pcmb || vb->pcm[0]==NULL) + { + /* one-time initialisation at codec start + NOT for every block synthesis start + allows us to flip between buffers once initialised + by simply flipping pointers */ + for(i=0; ichannels; i++) + vb->pcm[i] = &vd->iram_pcm[i*vd->iram_pcm_storage]; + } } else { - /* dynamic allocation (slower) */ - vb->pcm=(ogg_int32_t **)_vorbis_block_alloc(vb,sizeof(*vb->pcm)*vi->channels); - for(i=0;ichannels;i++) - vb->pcm[i]=(ogg_int32_t *)_vorbis_block_alloc(vb,vb->pcmend*sizeof(*vb->pcm[i])); + if(vd->reset_pcmb || vb->pcm[0]==NULL) + { + /* dynamic allocation (slower) */ + for(i=0;ichannels;i++) + vb->pcm[i]=(ogg_int32_t *)_vorbis_block_alloc(vb,vb->pcmend*sizeof(*vb->pcm[i])); + } } + vd->reset_pcmb = false; /* unpack_header enforces range checking */ type=ci->map_type[ci->mode_param[mode]->mapping]; - return(_mapping_P[type]->inverse(vb,b->mode[mode])); }else{ /* no pcm */ diff --git a/apps/codecs/libtremor/window.c b/apps/codecs/libtremor/window.c index 14d97cf6ac..7b48886939 100644 --- a/apps/codecs/libtremor/window.c +++ b/apps/codecs/libtremor/window.c @@ -68,11 +68,15 @@ void _vorbis_apply_window(ogg_int32_t *d,const void *window_p[2], long rightbegin=n/2+n/4-rn/4; long rightend=rightbegin+rn/2; - memset((void *)&d[0], 0, sizeof(ogg_int32_t)*leftbegin); - /* mcf5249_vect_zero(&d[0], leftbegin); */ + /* Following memset is not required - we are careful to only overlap/add the + regions that geniunely overlap in the window region, and the portions + outside that region are not added (so don't need to be zerod). see block.c + memset((void *)&d[0], 0, sizeof(ogg_int32_t)*leftbegin); */ + vect_mult_fw(&d[leftbegin], &window[lW][0], leftend-leftbegin); vect_mult_bw(&d[rightbegin], &window[nW][rn/2-1], rightend-rightbegin); - memset((void *)&d[rightend], 0, sizeof(ogg_int32_t)*(n-rightend)); - /* mcf5249_vect_zero(&d[rightend], n-rightend); */ + + /* Again - memset not needed + memset((void *)&d[rightend], 0, sizeof(ogg_int32_t)*(n-rightend)); */ } diff --git a/apps/codecs/libtremor/window_lookup.h b/apps/codecs/libtremor/window_lookup.h index ccf316e227..5363b81042 100644 --- a/apps/codecs/libtremor/window_lookup.h +++ b/apps/codecs/libtremor/window_lookup.h @@ -51,7 +51,7 @@ static LOOKUP_T vwin128[64] = { X(0x7ffdcf39), X(0x7fff6dac), X(0x7fffed01), X(0x7fffffc4), }; -static LOOKUP_T vwin256[128] ICONST_ATTR_TREMOR_WINDOW = { +static LOOKUP_T vwin256[128] = { X(0x0001f018), X(0x00117066), X(0x00306e9e), X(0x005ee5f1), X(0x009ccf26), X(0x00ea208b), X(0x0146cdea), X(0x01b2c87f), X(0x022dfedf), X(0x02b85ced), X(0x0351cbbd), X(0x03fa317f), @@ -284,7 +284,7 @@ static LOOKUP_T vwin1024[512] = { X(0x7fffffdd), X(0x7ffffff7), X(0x7fffffff), X(0x7fffffff), }; -static LOOKUP_T vwin2048[1024] ICONST_ATTR_TREMOR_WINDOW = { +static LOOKUP_T vwin2048[1024] = { X(0x000007c0), X(0x000045c4), X(0x0000c1ca), X(0x00017bd3), X(0x000273de), X(0x0003a9eb), X(0x00051df9), X(0x0006d007), X(0x0008c014), X(0x000aee1e), X(0x000d5a25), X(0x00100428), -- cgit v1.2.3