From b61cf76aba768513ab2a1fa9e7cc80f59dbce02f Mon Sep 17 00:00:00 2001
From: Jörg Hohensohn
Date: Thu, 18 Mar 2004 22:06:36 +0000
Subject: patch #917153: faster memset()/memcpy()

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@4406 a1c6a512-1295-4272-9138-f99709370657
---
 firmware/Makefile        |   7 +-
 firmware/common/memcpy.S | 171 +++++++++++++++++++++++++++++++++++++++++++++++
 firmware/common/memcpy.c | 117 --------------------------------
 firmware/common/memset.S | 108 ++++++++++++++++++++++++++++++
 firmware/common/memset.c | 109 ------------------------------
 5 files changed, 283 insertions(+), 229 deletions(-)
 create mode 100644 firmware/common/memcpy.S
 delete mode 100644 firmware/common/memcpy.c
 create mode 100644 firmware/common/memset.S
 delete mode 100644 firmware/common/memset.c

diff --git a/firmware/Makefile b/firmware/Makefile
index 93ee38ac78..38bcd4cc86 100644
--- a/firmware/Makefile
+++ b/firmware/Makefile
@@ -25,16 +25,17 @@ endif
 ifdef DEBUG
 CFLAGS += -g -DDEBUG
 else
-CFLAGS += -fomit-frame-pointer -fschedule-insns 
+CFLAGS += -fomit-frame-pointer -fschedule-insns
 endif
 
 SRC := $(wildcard drivers/*.c common/*.c *.c)
+SRC_S := $(wildcard drivers/*.S common/*.S *.S)
 
-OBJS := $(SRC:%.c=$(OBJDIR)/%.o) $(OBJDIR)/crt0.o $(OBJDIR)/bitswap.o $(OBJDIR)/descramble.o
+OBJS := $(SRC:%.c=$(OBJDIR)/%.o) $(SRC_S:%.S=$(OBJDIR)/%.o)
 
 DEPS:=.deps
 DEPDIRS:=$(DEPS) $(DEPS)/drivers $(DEPS)/common $(DEPS)/malloc
-DIRS = $(subst $(DEPS),".",$(DEPDIRS)) 
+DIRS = $(subst $(DEPS),".",$(DEPDIRS))
 
 OUTPUT = $(OBJDIR)/librockbox.a
 
diff --git a/firmware/common/memcpy.S b/firmware/common/memcpy.S
new file mode 100644
index 0000000000..2fb9f6a5a7
--- /dev/null
+++ b/firmware/common/memcpy.S
@@ -0,0 +1,171 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |    _//  _ \_/ ___\|  |/ /| __ \ /  _ \ \/  /
+ *   Jukebox    |    |   ( <_> )  \___|    < | \_\ ( <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2004 by Jens Arnold
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+    .section    .icode,"ax",@progbits
+
+    .align      2
+    .global     _memcpy
+    .type       _memcpy,@function
+
+/* Copies <length> bytes of data in memory from <source> to <dest>
+ * This version is optimized for speed
+ *
+ * arguments:
+ *  r4 - destination address
+ *  r5 - source address
+ *  r6 - length
+ *
+ * return value:
+ *  r0 - destination address (like ANSI version)
+ *
+ * register usage:
+ *  r0 - data / temporary
+ *  r1 - bit mask for rounding to long bounds / 2nd data
+ *  r2 - first long bound (only if >= 12 bytes)
+ *  r3 - last long bound (-4) (only if >= 12 bytes)
+ *  r4 - current dest address
+ *  r5 - current source address
+ *  r6 - source end address
+ *  r7 - stored dest start address
+ *
+ * The instruction order below is devised in a way to utilize the pipelining
+ * of the SH1 to the max. The routine also tries to utilize fast page mode.
+ */
+
+_memcpy:
+    add     r5,r6       /* r6 = source_end */
+    mov     r4,r7       /* store for returning */
+    add     #-8,r4      /* adjust for early increments (max. 2 longs) */
+
+    mov     r6,r0
+    add     #-12,r0     /* r0 = r6 - 12; don't go below 12 here! */
+    cmp/hs  r5,r0       /* >= 12 bytes to copy? */
+    bf      .start_b2   /* no, jump into byte loop */
+
+    mov     #-4,r1      /* r1 = 0xFFFFFFFC */
+
+    mov     r5,r2
+    add     #3,r2
+    and     r1,r2       /* r2 = first source long bound */
+    mov     r6,r3
+    add     #-4,r3      /* end offset for copying 2 longs per pass */
+    bra     .start_b1   /* jump into leading byte loop */
+    and     r1,r3       /* r3 = last source long bound - 4 */
+
+    /* leading byte loop: copies 0..3 bytes */
+    .align  2
+.loop_b1:
+    mov.b   @r5+,r0     /* load byte & increment source addr */
+    add     #1,r4       /* increment dest addr */
+    mov.b   r0,@(7,r4)  /* store byte */
+.start_b1:
+    cmp/hi  r5,r2       /* runs r5 up to first long bound */
+    bt      .loop_b1
+    /* now r5 is always at a long boundary */
+    /* -> memory reading is done in longs for all dest alignments */
+
+    /* selector for main copy loop */
+    mov     r4,r0
+    tst     #3,r0       /* dest now also at long bound? */
+    bt      .loop2_l    /* yes, do long copy */
+    tst     #1,r0       /* dest now at least at word bound? */
+    bt      .start4_w   /* yes, do word copy */
+
+    /* main loop for byte aligned destination (fast) */
+    /* copies 1 long per pass */
+    add     #4,r3       /* reset end offset */
+    add     #-1,r4      /* adjust to word alignment for word write+ */
+
+.loop4_b:
+    mov.l   @r5+,r0     /* load a long & increment source addr */
+    add     #4,r4       /* increment dest addr */
+    mov.b   r0,@(8,r4)  /* store low byte */
+    shlr8   r0          /* get middle 2 bytes */
+    mov.w   r0,@(6,r4)  /* store as word+ */
+    shlr16  r0          /* get upper byte */
+    mov.b   r0,@(5,r4)  /* and store */
+    cmp/hi  r5,r3       /* runs r5 up to last long bound */
+    bt      .loop4_b
+
+    bra     .start_b2   /* jump to trailing byte loop */
+    add     #1,r4       /* readjust */
+
+    /* main loop for word aligned destination (faster) */
+    /* copies 2 longs per pass, utilizing fast page mode */
+.start4_w:
+    add     #-2,r4      /* adjust to long alignment for long write+ */
+
+.loop4_w:
+    mov.l   @r5+,r1     /* load first long & increment source addr */
+    add     #8,r4       /* increment dest addr */
+    mov.l   @r5+,r0     /* load second long & increment source addr */
+    cmp/hi  r5,r3       /* runs r5 up to last or second last long bound */
+    mov.w   r0,@(8,r4)  /* store low word of second long */
+    xtrct   r1,r0       /* extract low word of first long & high word of second long */
+    mov.l   r0,@(4,r4)  /* and store as long+ */
+    swap.w  r1,r0       /* get high word of first long */
+    mov.w   r0,@(2,r4)  /* and store it */
+    bt      .loop4_w
+
+    add     #2,r4       /* readjust destination */
+    add     #4,r3       /* reset end offset */
+    cmp/hi  r5,r3       /* one long left? */
+    bf      .start_b2   /* no, jump to trailing byte loop */
+
+    mov.l   @r5+,r0     /* load last long & increment source addr */
+    add     #4,r4       /* increment dest addr */
+    mov.w   r0,@(6,r4)  /* store low word */
+    shlr16  r0          /* get high word */
+    bra     .start_b2   /* jump to trailing byte loop */
+    mov.w   r0,@(4,r4)  /* and store it */
+
+    /* main loop for long aligned destination (fastest) */
+    /* copies 2 longs per pass, utilizing fast page mode */
+.loop2_l:
+    mov.l   @r5+,r1     /* load first long & increment source addr */
+    add     #8,r4       /* increment dest addr */
+    mov.l   @r5+,r0     /* load second long & increment source addr */
+    cmp/hi  r5,r3       /* runs r5 up to last or second last long bound */
+    mov.l   r1,@r4      /* store first long */
+    mov.l   r0,@(4,r4)  /* store second long; NOT ALIGNED - no speed loss here! */
+    bt      .loop2_l
+
+    add     #4,r3       /* reset end offset */
+    cmp/hi  r5,r3       /* one long left? */
+    bf      .start_b2   /* no, jump to trailing byte loop */
+
+    mov.l   @r5+,r0     /* load last long & increment source addr */
+    add     #4,r4       /* increment dest addr */
+    bra     .start_b2   /* jump to trailing byte loop */
+    mov.l   r0,@(4,r4)  /* store last long */
+
+    /* trailing byte loop: copies 0..3 bytes (or all for < 12 in total) */
+.loop_b2:
+    mov.b   @r5+,r0     /* load byte & increment source addr */
+    add     #1,r4       /* increment dest addr */
+    mov.b   r0,@(7,r4)  /* store byte */
+.start_b2:
+    cmp/hi  r5,r6       /* runs r5 up to end address */
+    bt      .loop_b2
+
+    rts
+    mov     r7,r0       /* return dest start address */
+.end:
+    .size   _memcpy,.end-_memcpy
+
diff --git a/firmware/common/memcpy.c b/firmware/common/memcpy.c
deleted file mode 100644
index 49678920fa..0000000000
--- a/firmware/common/memcpy.c
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
-FUNCTION
-        <<memcpy>>---copy memory regions
-
-ANSI_SYNOPSIS
-        #include <string.h>
-        void* memcpy(void *<[out]>, const void *<[in]>, size_t <[n]>);
-
-TRAD_SYNOPSIS
-        void *memcpy(<[out]>, <[in]>, <[n]>
-        void *<[out]>;
-        void *<[in]>;
-        size_t <[n]>;
-
-DESCRIPTION
-        This function copies <[n]> bytes from the memory region
-        pointed to by <[in]> to the memory region pointed to by
-        <[out]>.
-
-        If the regions overlap, the behavior is undefined.
-
-RETURNS
-        <<memcpy>> returns a pointer to the first byte of the <[out]>
-        region.
-
-PORTABILITY
-<<memcpy>> is ANSI C.
-
-<<memcpy>> requires no supporting OS subroutines.
-
-QUICKREF
-        memcpy ansi pure
- */
-
-#include <_ansi.h>
-#include <stddef.h>
-#include <limits.h>
-
-/* Nonzero if either X or Y is not aligned on a "long" boundary. */
-#define UNALIGNED(X, Y) \
-  (((long)X & (sizeof (long) - 1)) | ((long)Y & (sizeof (long) - 1)))
-
-/* How many bytes are copied each iteration of the 4X unrolled loop. */
-#define BIGBLOCKSIZE    (sizeof (long) << 2)
-
-/* How many bytes are copied each iteration of the word copy loop. */
-#define LITTLEBLOCKSIZE (sizeof (long))
-
-/* Threshhold for punting to the byte copier. */
-#define TOO_SMALL(LEN)  ((LEN) < BIGBLOCKSIZE)
-
-_PTR
-_DEFUN (memcpy, (dst0, src0, len0),
-        _PTR dst0 _AND
-        _CONST _PTR src0 _AND
-        size_t len0) __attribute__ ((section (".icode")));
-
-_PTR
-_DEFUN (memcpy, (dst0, src0, len0),
-        _PTR dst0 _AND
-        _CONST _PTR src0 _AND
-        size_t len0)
-{
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
-  char *dst = (char *) dst0;
-  char *src = (char *) src0;
-
-  _PTR save = dst0;
-
-  while (len0--)
-    {
-      *dst++ = *src++;
-    }
-
-  return save;
-#else
-  char *dst = dst0;
-  _CONST char *src = src0;
-  long *aligned_dst;
-  _CONST long *aligned_src;
-  unsigned int len = len0;
-
-  /* If the size is small, or either SRC or DST is unaligned,
-     then punt into the byte copy loop. This should be rare. */
-  if (!TOO_SMALL(len) && !UNALIGNED (src, dst))
-    {
-      aligned_dst = (long*)dst;
-      aligned_src = (long*)src;
-
-      /* Copy 4X long words at a time if possible. */
-      while (len >= BIGBLOCKSIZE)
-        {
-          *aligned_dst++ = *aligned_src++;
-          *aligned_dst++ = *aligned_src++;
-          *aligned_dst++ = *aligned_src++;
-          *aligned_dst++ = *aligned_src++;
-          len -= BIGBLOCKSIZE;
-        }
-
-      /* Copy one long word at a time if possible. */
-      while (len >= LITTLEBLOCKSIZE)
-        {
-          *aligned_dst++ = *aligned_src++;
-          len -= LITTLEBLOCKSIZE;
-        }
-
-      /* Pick up any residual with a byte copier. */
-      dst = (char*)aligned_dst;
-      src = (char*)aligned_src;
-    }
-
-  while (len--)
-    *dst++ = *src++;
-
-  return dst0;
-#endif /* not PREFER_SIZE_OVER_SPEED */
-}
diff --git a/firmware/common/memset.S b/firmware/common/memset.S
new file mode 100644
index 0000000000..038915c475
--- /dev/null
+++ b/firmware/common/memset.S
@@ -0,0 +1,108 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |    _//  _ \_/ ___\|  |/ /| __ \ /  _ \ \/  /
+ *   Jukebox    |    |   ( <_> )  \___|    < | \_\ ( <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2004 by Jens Arnold
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+    .section    .icode,"ax",@progbits
+
+    .align      2
+    .global     _memset
+    .type       _memset,@function
+
+/* Fills a memory region with specified byte value
+ * This version is optimized for speed
+ *
+ * arguments:
+ *  r4 - start address
+ *  r5 - data
+ *  r6 - length
+ *
+ * return value:
+ *  r0 - start address (like ANSI version)
+ *
+ * register usage:
+ *  r0 - temporary
+ *  r1 - bit mask for rounding to long bounds
+ *  r2 - last / first long bound (only if >= 12 bytes)
+ *  r4 - start address
+ *  r5 - data (spread to all 4 bytes if >= 12 bytes)
+ *  r6 - current address (runs down from end to start)
+ *
+ * The instruction order below is devised in a way to utilize the pipelining
+ * of the SH1 to the max. The routine fills memory from end to start in
+ * order to utilize the auto-decrementing store instructions.
+ */
+
+_memset:
+    add     r4,r6       /* r6 = end_address */
+
+    mov     r6,r0
+    add     #-12,r0     /* r0 = r6 - 12; don't go below 12 here! */
+    cmp/hs  r4,r0       /* >= 12 bytes to fill? */
+    bf      .start_b2   /* no, jump directly to byte loop */
+
+    extu.b  r5,r5       /* start: spread data to all 4 bytes */
+    swap.b  r5,r0
+    or      r0,r5       /* data now in 2 lower bytes of r5 */
+    swap.w  r5,r0
+    or      r0,r5       /* data now in all 4 bytes of r5 */
+
+    mov     #-4,r1      /* r1 = 0xFFFFFFFC */
+
+    mov     r6,r2
+    bra     .start_b1
+    and     r1,r2       /* r2 = last long bound */
+
+    /* leading byte loop: sets 0..3 bytes */
+.loop_b1:
+    mov.b   r5,@-r6     /* store byte */
+.start_b1:
+    cmp/hi  r2,r6       /* runs r6 down to last long bound */
+    bt      .loop_b1
+
+    mov     r4,r2
+    add     #11,r2      /* combined for rounding and offset */
+    and     r1,r2       /* r2 = first long bound + 8 */
+
+    /* main loop: set 2 longs per pass */
+.loop2_l:
+    mov.l   r5,@-r6     /* store first long */
+    cmp/hi  r2,r6       /* runs r6 down to first or second long bound */
+    mov.l   r5,@-r6     /* store second long */
+    bt      .loop2_l
+
+    add     #-8,r2      /* correct offset */
+    cmp/hi  r2,r6       /* 1 long left? */
+    bf      .start_b2   /* no, jump to trailing byte loop */
+
+    bra     .start_b2   /* jump to trailing byte loop */
+    mov.l   r5,@-r6     /* store last long */
+
+    /* trailing byte loop */
+    .align  2
+.loop_b2:
+    mov.b   r5,@-r6     /* store byte */
+.start_b2:
+    cmp/hi  r4,r6       /* runs r6 down to the start address */
+    bt      .loop_b2
+
+    rts
+    mov     r4,r0       /* return start address */
+
+.end:
+    .size   _memset,.end-_memset
+
diff --git a/firmware/common/memset.c b/firmware/common/memset.c
deleted file mode 100644
index c370191cda..0000000000
--- a/firmware/common/memset.c
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
-FUNCTION
-        <<memset>>---set an area of memory
-
-INDEX
-        memset
-
-ANSI_SYNOPSIS
-        #include <string.h>
-        void *memset(const void *<[dst]>, int <[c]>, size_t <[length]>);
-
-TRAD_SYNOPSIS
-        #include <string.h>
-        void *memset(<[dst]>, <[c]>, <[length]>)
-        void *<[dst]>;
-        int <[c]>;
-        size_t <[length]>;
-
-DESCRIPTION
-        This function converts the argument <[c]> into an unsigned
-        char and fills the first <[length]> characters of the array
-        pointed to by <[dst]> to the value.
-
-RETURNS
-        <<memset>> returns the value of <[m]>.
-
-PORTABILITY
-<<memset>> is ANSI C.
-
-    <<memset>> requires no supporting OS subroutines.
-
-QUICKREF
-        memset ansi pure
-*/
-
-#include <string.h>
-
-#define LBLOCKSIZE (sizeof(long))
-#define UNALIGNED(X)   ((long)X & (LBLOCKSIZE - 1))
-#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
-
-_PTR
-_DEFUN (memset, (m, c, n),
-        _PTR m _AND
-        int c _AND
-        size_t n)
-{
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
-  char *s = (char *) m;
-
-  while (n-- != 0)
-    {
-      *s++ = (char) c;
-    }
-
-  return m;
-#else
-  char *s = (char *) m;
-  unsigned int i;
-  unsigned long buffer;
-  unsigned long *aligned_addr;
-
-  if (!TOO_SMALL (n) && !UNALIGNED (m))
-    {
-      /* If we get this far, we know that n is large and m is word-aligned. */
-
-      aligned_addr = (unsigned long*)m;
-
-      /* Store C into each char sized location in BUFFER so that
-         we can set large blocks quickly. */
-      c &= 0xff;
-      if (LBLOCKSIZE == 4)
-        {
-          buffer = (c << 8) | c;
-          buffer |= (buffer << 16);
-        }
-      else
-        {
-          buffer = 0;
-          for (i = 0; i < LBLOCKSIZE; i++)
-            buffer = (buffer << 8) | c;
-        }
-
-      while (n >= LBLOCKSIZE*4)
-        {
-          *aligned_addr++ = buffer;
-          *aligned_addr++ = buffer;
-          *aligned_addr++ = buffer;
-          *aligned_addr++ = buffer;
-          n -= 4*LBLOCKSIZE;
-        }
-
-      while (n >= LBLOCKSIZE)
-        {
-          *aligned_addr++ = buffer;
-          n -= LBLOCKSIZE;
-        }
-      /* Pick up the remainder with a bytewise loop. */
-      s = (char*)aligned_addr;
-    }
-
-  while (n--)
-    {
-      *s++ = (char)c;
-    }
-
-  return m;
-#endif /* not PREFER_SIZE_OVER_SPEED */
-}
--
cgit v1.2.3
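
Note for readers new to SH-1: the extu.b / swap.b / or / swap.w / or sequence at the top of
_memset replicates the fill byte into all four bytes of a register before the long-store loops
run. A C sketch of the same computation; the helper name spread_byte is made up for
illustration and is not part of the patch:

    #include <stdint.h>

    /* Illustrative only: the value left in r5 by the spreading sequence in _memset.
     * extu.b r5,r5             -> 0x000000dd  (zero-extend the data byte)
     * swap.b r5,r0; or r0,r5   -> 0x0000dddd  (fill the low word)
     * swap.w r5,r0; or r0,r5   -> 0xdddddddd  (fill the whole long)
     */
    static uint32_t spread_byte(uint8_t data)
    {
        uint32_t v = data;
        v |= v << 8;
        v |= v << 16;
        return v;               /* spread_byte(0x5A) == 0x5A5A5A5A */
    }

Spreading the byte once up front is what lets the main loop write four bytes per
mov.l r5,@-r6 store.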
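
The new routines special-case lengths below 12 bytes and handle byte-, word- and long-aligned
destinations separately, so a sweep over all source/destination offsets 0..3 and lengths around
that threshold exercises every path. A hypothetical sanity-check harness along these lines (not
part of the patch; it assumes the ANSI names memcpy()/memset() resolve to _memcpy/_memset via
the toolchain's usual leading-underscore symbol mapping when built for the target):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        unsigned char src[64], dst[64], ref[64];
        unsigned i, so, dofs, len, failures = 0;

        for (i = 0; i < sizeof(src); i++)
            src[i] = (unsigned char)(i * 7 + 3);

        for (so = 0; so < 4; so++)              /* source alignment */
            for (dofs = 0; dofs < 4; dofs++)    /* destination alignment */
                for (len = 0; len <= 32; len++) {
                    /* memcpy: compare against a plain byte loop */
                    for (i = 0; i < sizeof(dst); i++)
                        dst[i] = ref[i] = 0xAA; /* poison both buffers */
                    memcpy(dst + dofs, src + so, len);
                    for (i = 0; i < len; i++)
                        ref[dofs + i] = src[so + i];
                    if (memcmp(dst, ref, sizeof(dst)) != 0)
                        failures++;

                    /* memset: same idea with a fill value */
                    for (i = 0; i < sizeof(dst); i++)
                        dst[i] = ref[i] = 0xAA;
                    memset(dst + dofs, 0x5A, len);
                    for (i = 0; i < len; i++)
                        ref[dofs + i] = 0x5A;
                    if (memcmp(dst, ref, sizeof(dst)) != 0)
                        failures++;
                }

        printf("%u failure(s)\n", failures);
        return failures != 0;
    }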