From b61cf76aba768513ab2a1fa9e7cc80f59dbce02f Mon Sep 17 00:00:00 2001 From: Jörg Hohensohn Date: Thu, 18 Mar 2004 22:06:36 +0000 Subject: patch #917153: faster memset()/memcpy() git-svn-id: svn://svn.rockbox.org/rockbox/trunk@4406 a1c6a512-1295-4272-9138-f99709370657 --- firmware/common/memset.S | 108 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 firmware/common/memset.S (limited to 'firmware/common/memset.S') diff --git a/firmware/common/memset.S b/firmware/common/memset.S new file mode 100644 index 0000000000..038915c475 --- /dev/null +++ b/firmware/common/memset.S @@ -0,0 +1,108 @@ +/*************************************************************************** + * __________ __ ___. + * Open \______ \ ____ ____ | | _\_ |__ _______ ___ + * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / + * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < + * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ + * \/ \/ \/ \/ \/ + * $Id$ + * + * Copyright (C) 2004 by Jens Arnold + * + * All files in this archive are subject to the GNU General Public License. + * See the file COPYING in the source tree root for full license agreement. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. 
****************************************************************************/

    .section    .icode,"ax",@progbits

    .align      2
    .global     _memset
    .type       _memset,@function

/* Fills a memory region with the specified byte value.
 * This version is optimized for speed.
 *
 * arguments:
 *  r4 - start address
 *  r5 - data (only the low byte is used)
 *  r6 - length in bytes (treated as unsigned)
 *
 * return value:
 *  r0 - start address (like ANSI version)
 *
 * register usage:
 *  r0 - temporary
 *  r1 - bit mask for rounding to long bounds (only if >= 12 bytes)
 *  r2 - last / first long bound (only if >= 12 bytes)
 *  r4 - start address (never modified)
 *  r5 - data (spread to all 4 bytes if >= 12 bytes)
 *  r6 - current address (runs down from end to start)
 *
 * clobbers: r0, r6 and the T bit always; r1, r2, r5 if >= 12 bytes.
 *
 * Strategy for fills of 12 bytes or more:
 *  1. byte stores from the end address down to the last long bound,
 *  2. long stores, two per pass, down to the first long bound
 *     (plus one single long store if an odd number of longs remains),
 *  3. byte stores down to the start address.
 * Shorter fills use step 3 only.
 *
 * The routine fills memory from end to start in order to utilize the
 * auto-decrementing store instructions. The instruction order is chosen
 * with the SH1 pipeline in mind; keep that in mind when rearranging.
 */

_memset:
    /* Decide on the length itself, not on end_address - 12: that
     * subtraction wraps around for regions ending below address 12 and
     * would send short fills into the long path, which stores two longs
     * unconditionally. The main loop needs at least two aligned longs
     * inside the region; 12 bytes guarantee that for every alignment,
     * so don't go below 12 here! */
    mov     #12,r0
    cmp/hs  r0,r6           /* >= 12 bytes to fill? (unsigned) */
    add     r4,r6           /* r6 = end_address; add leaves T untouched */
    bf      .start_b2       /* no, jump directly to byte loop */

    extu.b  r5,r5           /* start: spread data to all 4 bytes */
    swap.b  r5,r0
    or      r0,r5           /* data now in 2 lower bytes of r5 */
    swap.w  r5,r0
    or      r0,r5           /* data now in all 4 bytes of r5 */

    mov     #-4,r1          /* r1 = 0xFFFFFFFC */

    mov     r6,r2
    bra     .start_b1
    and     r1,r2           /* (delay slot) r2 = last long bound */

    /* leading byte loop: sets 0..3 bytes */
.loop_b1:
    mov.b   r5,@-r6         /* store byte */
.start_b1:
    cmp/hi  r2,r6           /* runs r6 down to last long bound */
    bt      .loop_b1

    mov     r4,r2
    add     #11,r2          /* combined for rounding and offset */
    and     r1,r2           /* r2 = first long bound + 8 */

    /* main loop: set 2 longs per pass */
.loop2_l:
    mov.l   r5,@-r6         /* store first long */
    cmp/hi  r2,r6           /* runs r6 down to first or second long bound */
    mov.l   r5,@-r6         /* store second long (does not change T) */
    bt      .loop2_l

    add     #-8,r2          /* correct offset: r2 = first long bound */
    cmp/hi  r2,r6           /* 1 long left? */
    bf      .start_b2       /* no, jump to trailing byte loop */

    bra     .start_b2       /* jump to trailing byte loop */
    mov.l   r5,@-r6         /* (delay slot) store last long */

    /* trailing byte loop; the alignment padding is never executed because
     * it follows an unconditional branch and its delay slot */
    .align      2
.loop_b2:
    mov.b   r5,@-r6         /* store byte */
.start_b2:
    cmp/hi  r4,r6           /* runs r6 down to the start address */
    bt      .loop_b2

    rts
    mov     r4,r0           /* (delay slot) return start address */

.end:
    .size   _memset,.end-_memset