From fe04e40be7a26c758a82e410e58be63c1f3d571c Mon Sep 17 00:00:00 2001 From: Jens Arnold Date: Wed, 5 Nov 2008 00:10:05 +0000 Subject: Further optimised (vs. libgcc) unsigned 32 bit division for ARMv4 (based on the ARMv5(+) version from libgcc), in IRAM on PP for better performance on PP5002, and put into the codeclib for possible reuse. APE -c1000 is now usable on both PP502x and PP5002 (~138% realtime, they're on par now). Gigabeat F/X should also see an APE speedup. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19009 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/demac/libdemac/rangecoding.h | 14 +++- apps/codecs/lib/SOURCES | 3 + apps/codecs/lib/codeclib.h | 9 +++ apps/codecs/lib/udiv32_armv4.S | 114 +++++++++++++++++++++++++++++++ 4 files changed, 137 insertions(+), 3 deletions(-) create mode 100644 apps/codecs/lib/udiv32_armv4.S diff --git a/apps/codecs/demac/libdemac/rangecoding.h b/apps/codecs/demac/libdemac/rangecoding.h index c96886e32b..645fd1ad92 100644 --- a/apps/codecs/demac/libdemac/rangecoding.h +++ b/apps/codecs/demac/libdemac/rangecoding.h @@ -49,6 +49,14 @@ removing the rc parameter from each function (and the RNGC macro)). 
*/ +#ifdef ROCKBOX +#include "../lib/codeclib.h" +/* for UDIV32() */ +#endif + +#ifndef UDIV32 +#define UDIV32(a, b) ((a) / (b)) +#endif /* BITSTREAM READING FUNCTIONS */ @@ -121,15 +129,15 @@ static inline void range_dec_normalize(void) static inline int range_decode_culfreq(int tot_f) { range_dec_normalize(); - rc.help = rc.range / tot_f; - return rc.low / rc.help; + rc.help = UDIV32(rc.range, tot_f); + return UDIV32(rc.low, rc.help); } static inline int range_decode_culshift(int shift) { range_dec_normalize(); rc.help = rc.range >> shift; - return rc.low / rc.help; + return UDIV32(rc.low, rc.help); } diff --git a/apps/codecs/lib/SOURCES b/apps/codecs/lib/SOURCES index 9c6d4e7ff6..8099620098 100644 --- a/apps/codecs/lib/SOURCES +++ b/apps/codecs/lib/SOURCES @@ -5,6 +5,9 @@ codeclib.c mdct2.c #ifdef CPU_ARM mdct_arm.S +#if ARM_ARCH == 4 +udiv32_armv4.S +#endif #endif #elif defined(SIMULATOR) && defined(__APPLE__) diff --git a/apps/codecs/lib/codeclib.h b/apps/codecs/lib/codeclib.h index 744accb8aa..477818a23d 100644 --- a/apps/codecs/lib/codeclib.h +++ b/apps/codecs/lib/codeclib.h @@ -57,6 +57,15 @@ void qsort(void *base, size_t nmemb, size_t size, int(*compar)(const void *, con extern void mdct_backward(int n, int32_t *in, int32_t *out); +#if defined(CPU_ARM) && (ARM_ARCH == 4) +/* optimised unsigned integer division for ARMv4, in IRAM */ +unsigned udiv32_arm(unsigned a, unsigned b); +#define UDIV32(a, b) udiv32_arm(a, b) +#else +/* default: plain C division, arguments parenthesised so expressions expand safely */ +#define UDIV32(a, b) ((a) / (b)) +#endif + /* Various codec helper functions */ int codec_init(void); diff --git a/apps/codecs/lib/udiv32_armv4.S b/apps/codecs/lib/udiv32_armv4.S new file mode 100644 index 0000000000..a659a9eb8e --- /dev/null +++ b/apps/codecs/lib/udiv32_armv4.S @@ -0,0 +1,114 @@ +/*************************************************************************** + * __________ __ ___.
+ * Open \______ \ ____ ____ | | _\_ |__ _______ ___ + * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / + * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < + * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ + * \/ \/ \/ \/ \/ + * $Id$ + * + * Copyright (C) 2008 by Jens Arnold + * + * Optimised unsigned integer division for ARMv4 + * + * Based on: libgcc routines for ARM cpu. + * Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk) + * Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005 + * Free Software Foundation, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ****************************************************************************/ + +#include "config.h" +/* Codecs should not normally do this, but we need to check a macro, and + * codecs.h would confuse the assembler. 
*/ + +.macro ARM_DIV_BODY dividend, divisor, result, curbit + mov \result, \dividend @ result doubles as scratch while locating the top quotient bit + mov \curbit, #93 @ 3 * 31, (calculating branch dest) + cmp \divisor, \result, lsr #16 + movls \result,\result, lsr #16 + subls \curbit, \curbit, #48 + cmp \divisor, \result, lsr #8 + movls \result,\result, lsr #8 + subls \curbit, \curbit, #24 + cmp \divisor, \result, lsr #4 + movls \result,\result, lsr #4 + subls \curbit, \curbit, #12 + cmp \divisor, \result, lsr #2 + movls \result,\result, lsr #2 + subls \curbit, \curbit, #6 + cmp \divisor, \result, lsr #1 @ the start shift must be exact: one step too high + subls \curbit, \curbit, #3 @ lets the shifted divisor overflow for large operands + mov \result, #0 + add pc, pc, \curbit, lsl #2 @ pc reads 8 bytes ahead, i.e. at the first cmp below + nop + .set shift, 32 + .rept 32 + .set shift, shift - 1 + cmp \dividend, \divisor, lsl #shift + adc \result, \result, \result @ shift the carry in as the next quotient bit + subcs \dividend, \dividend, \divisor, lsl #shift + .endr +.endm + +.macro ARM_DIV2_ORDER divisor, order + @ order = log2(divisor) for a power-of-two divisor, divisor is shifted down (clobbered) + cmp \divisor, #(1 << 16) + movhs \divisor, \divisor, lsr #16 + movhs \order, #16 + movlo \order, #0 + + cmp \divisor, #(1 << 8) + movhs \divisor, \divisor, lsr #8 + addhs \order, \order, #8 + + cmp \divisor, #(1 << 4) + movhs \divisor, \divisor, lsr #4 + addhs \order, \order, #4 + + cmp \divisor, #(1 << 2) + addhi \order, \order, #3 @ remaining divisor is 8 + addls \order, \order, \divisor, lsr #1 @ remaining 4, 2, 1 add 2, 1, 0 +.endm + + +#ifdef USE_IRAM + .section .icode,"ax",%progbits +#else + .text +#endif + .align + .global udiv32_arm + .type udiv32_arm,%function + @ r0 = dividend, r1 = divisor, returns the quotient in r0 (0 for a zero divisor), r1, r2 and r3 may be clobbered +udiv32_arm: + subs r2, r1, #1 @ r2 = divisor - 1, Z set for divisor 1, C clear for divisor 0 + bxeq lr @ divisor 1: the dividend in r0 is already the result + bcc 20f @ divisor 0: NE still holds, so label 20 returns 0 + cmp r0, r1 + bls 10f @ dividend <= divisor: result is 1 if equal, else 0 + tst r1, r2 @ divisor & (divisor - 1) is zero only for powers of two + beq 30f + + ARM_DIV_BODY r0, r1, r2, r3 + mov r0, r2 + bx lr + +10: + moveq r0, #1 +20: + movne r0, #0 + bx lr + +30: + ARM_DIV2_ORDER r1, r2 + mov r0, r0, lsr r2 + bx lr -- cgit v1.2.3