From a035261089403de259e74ce4dd196e2715138ed2 Mon Sep 17 00:00:00 2001 From: Thomas Martitz Date: Sat, 7 Jan 2012 19:56:09 +0100 Subject: Move optimized memcpy and friends and strlen to firmware/asm, using the new automatic-asm-picking infrastructure. --- firmware/asm/mips/memcpy.S | 143 +++++++++++++++++++++++++++ firmware/asm/mips/memset.S | 239 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 382 insertions(+) create mode 100644 firmware/asm/mips/memcpy.S create mode 100644 firmware/asm/mips/memset.S (limited to 'firmware/asm/mips') diff --git a/firmware/asm/mips/memcpy.S b/firmware/asm/mips/memcpy.S new file mode 100644 index 0000000000..2e7f245c69 --- /dev/null +++ b/firmware/asm/mips/memcpy.S @@ -0,0 +1,143 @@ +/*************************************************************************** + * __________ __ ___. + * Open \______ \ ____ ____ | | _\_ |__ _______ ___ + * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / + * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < + * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ + * \/ \/ \/ \/ \/ + * $Id$ + * + * Copyright (C) 2002, 2003 Free Software Foundation, Inc. + * This file was originally part of the GNU C Library + * Contributed to glibc by Hartvig Ekner , 2002 + * Adapted for Rockbox by Maurus Cuelenaere, 2009 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. 
+ *
+ ****************************************************************************/
+
+#include "config.h"
+#include "mips.h"
+
+/* void *memcpy(void *s1, const void *s2, size_t n);
+ *
+ * Register use in the code below:
+ *   a0 = destination cursor (s1), a1 = source cursor (s2),
+ *   a2 = number of bytes still to copy, v0 = original a0 (return value).
+ *
+ * Strategy:
+ *   - fewer than 8 bytes: copy byte by byte (last8).
+ *   - a0 and a1 have the same offset within a word: copy the leading
+ *     partial word, then 32-byte blocks (lop8w), then single words
+ *     (lop1w), then the remaining bytes (last8).
+ *   - otherwise (shift): bring a0 to a word boundary, then fetch every
+ *     source word with an LWHI/LWLO pair and store it with a plain sw.
+ *
+ * Everything is assembled under ".set noreorder": the instruction written
+ * directly after a branch or jump is its delay slot and is executed
+ * whether or not the branch is taken.
+ */
+
+#ifdef ROCKBOX_BIG_ENDIAN
+# define LWHI lwl /* high part is left in big-endian */
+# define SWHI swl /* high part is left in big-endian */
+# define LWLO lwr /* low part is right in big-endian */
+# define SWLO swr /* low part is right in big-endian */
+#else
+# define LWHI lwr /* high part is right in little-endian */
+# define SWHI swr /* high part is right in little-endian */
+# define LWLO lwl /* low part is left in little-endian */
+# define SWLO swl /* low part is left in little-endian */
+#endif
+
+    .section .icode, "ax", %progbits
+
+    .global memcpy
+    .type memcpy, %function
+
+    .set noreorder
+
+memcpy:
+    slti t0, a2, 8 # Less than 8?
+    bne t0, zero, last8
+    move v0, a0 # Setup exit value before too late
+
+    xor t0, a1, a0 # Find a0/a1 displacement
+    andi t0, 0x3 /* non-zero: the two pointers differ in their low two bits */
+    bne t0, zero, shift # Go handle the unaligned case
+    subu t1, zero, a1 /* delay slot: t1 = -a1 */
+    andi t1, 0x3 # a0/a1 are aligned, but are we
+    beq t1, zero, chk8w # starting in the middle of a word?
+    subu a2, t1 /* delay slot: take the t1 head bytes off the count (t1 is 0 if the branch is taken) */
+    LWHI t0, 0(a1) # Yes we are... take care of that
+    addu a1, t1 /* step the source to the next word boundary */
+    SWHI t0, 0(a0) /* partial-word store of the head bytes */
+    addu a0, t1 /* step the destination to the next word boundary */
+
+chk8w:
+    andi t0, a2, 0x1f # 32 or more bytes left?
+    beq t0, a2, chk1w
+    subu a3, a2, t0 # Yes
+    addu a3, a1 # a3 = end address of loop
+    move a2, t0 # a2 = what will be left after loop
+lop8w:
+    lw t0, 0(a1) # Loop taking 8 words at a time
+    lw t1, 4(a1)
+    lw t2, 8(a1)
+    lw t3, 12(a1)
+    lw t4, 16(a1)
+    lw t5, 20(a1)
+    lw t6, 24(a1)
+    lw t7, 28(a1)
+    addiu a0, 32 /* pointers are advanced first, so the stores use negative offsets */
+    addiu a1, 32
+    sw t0, -32(a0)
+    sw t1, -28(a0)
+    sw t2, -24(a0)
+    sw t3, -20(a0)
+    sw t4, -16(a0)
+    sw t5, -12(a0)
+    sw t6, -8(a0)
+    bne a1, a3, lop8w
+    sw t7, -4(a0) /* delay slot: eighth store of the block */
+
+chk1w:
+    andi t0, a2, 0x3 # 4 or more bytes left?
+    beq t0, a2, last8
+    subu a3, a2, t0 # Yes, handle them one word at a time
+    addu a3, a1 # a3 again end address
+    move a2, t0 /* a2 = bytes left once the word loop is done */
+lop1w:
+    lw t0, 0(a1)
+    addiu a0, 4
+    addiu a1, 4
+    bne a1, a3, lop1w
+    sw t0, -4(a0) /* delay slot: store the word just loaded */
+
+last8:
+    blez a2, lst8e # Handle last 8 bytes, one at a time
+    addu a3, a2, a1 /* delay slot: a3 = source address where the byte loop stops */
+lst8l:
+    lb t0, 0(a1)
+    addiu a0, 1
+    addiu a1, 1
+    bne a1, a3, lst8l
+    sb t0, -1(a0) /* delay slot: store the byte just loaded */
+lst8e:
+    jr ra # Bye, bye
+    nop
+
+shift:
+    subu a3, zero, a0 # Src and Dest unaligned
+    andi a3, 0x3 # (unoptimized case...)
+    beq a3, zero, shft1
+    subu a2, a3 # a2 = bytes left
+    LWHI t0, 0(a1) # Take care of first odd part
+    LWLO t0, 3(a1)
+    addu a1, a3
+    SWHI t0, 0(a0) /* partial-word store that brings a0 up to a word boundary */
+    addu a0, a3
+shft1:
+    andi t0, a2, 0x3 /* t0 = bytes left over after the whole words */
+    subu a3, a2, t0
+    addu a3, a1 /* a3 = source address where the word loop stops */
+shfth:
+    LWHI t1, 0(a1) # Limp through, word by word
+    LWLO t1, 3(a1)
+    addiu a0, 4
+    addiu a1, 4
+    bne a1, a3, shfth
+    sw t1, -4(a0) /* delay slot: a0 is word aligned here, so a plain sw is used */
+    b last8 # Handle anything which may be left
+    move a2, t0 /* delay slot: a2 = leftover byte count for last8 */
+
+    .set reorder
diff --git a/firmware/asm/mips/memset.S b/firmware/asm/mips/memset.S
new file mode 100644
index 0000000000..8db76d9123
--- /dev/null
+++ b/firmware/asm/mips/memset.S
@@ -0,0 +1,239 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * This file was originally part of the Linux/MIPS GNU C Library
+ * Copyright (C) 1998 by Ralf Baechle
+ * Adapted for Rockbox by Maurus Cuelenaere, 2009
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#include "config.h"
+#include "mips.h"
+
+/*
+ * FILLn(dst, offset, val): n/4 word stores of val covering the n bytes
+ * that start at dst + offset. Each sw is one instruction and fills four
+ * bytes; memset_partial relies on that when it jumps into the middle of
+ * the expansion.
+ */
+#define FILL256(dst, offset, val) \
+    sw val, (offset + 0x00)(dst); \
+    sw val, (offset + 0x04)(dst); \
+    sw val, (offset + 0x08)(dst); \
+    sw val, (offset + 0x0c)(dst); \
+    sw val, (offset + 0x10)(dst); \
+    sw val, (offset + 0x14)(dst); \
+    sw val, (offset + 0x18)(dst); \
+    sw val, (offset + 0x1c)(dst); \
+    sw val, (offset + 0x20)(dst); \
+    sw val, (offset + 0x24)(dst); \
+    sw val, (offset + 0x28)(dst); \
+    sw val, (offset + 0x2c)(dst); \
+    sw val, (offset + 0x30)(dst); \
+    sw val, (offset + 0x34)(dst); \
+    sw val, (offset + 0x38)(dst); \
+    sw val, (offset + 0x3c)(dst); \
+    sw val, (offset + 0x40)(dst); \
+    sw val, (offset + 0x44)(dst); \
+    sw val, (offset + 0x48)(dst); \
+    sw val, (offset + 0x4c)(dst); \
+    sw val, (offset + 0x50)(dst); \
+    sw val, (offset + 0x54)(dst); \
+    sw val, (offset + 0x58)(dst); \
+    sw val, (offset + 0x5c)(dst); \
+    sw val, (offset + 0x60)(dst); \
+    sw val, (offset + 0x64)(dst); \
+    sw val, (offset + 0x68)(dst); \
+    sw val, (offset + 0x6c)(dst); \
+    sw val, (offset + 0x70)(dst); \
+    sw val, (offset + 0x74)(dst); \
+    sw val, (offset + 0x78)(dst); \
+    sw val, (offset + 0x7c)(dst); \
+    sw val, (offset + 0x80)(dst); \
+    sw val, (offset + 0x84)(dst); \
+    sw val, (offset + 0x88)(dst); \
+    sw val, (offset + 0x8c)(dst); \
+    sw val, (offset + 0x90)(dst); \
+    sw val, (offset + 0x94)(dst); \
+    sw val, (offset + 0x98)(dst); \
+    sw val, (offset + 0x9c)(dst); \
+    sw val, (offset + 0xa0)(dst); \
+    sw val, (offset + 0xa4)(dst); \
+    sw val, (offset + 0xa8)(dst); \
+    sw val, (offset + 0xac)(dst); \
+    sw val, (offset + 0xb0)(dst); \
+    sw val, (offset + 0xb4)(dst); \
+    sw val, (offset + 0xb8)(dst); \
+    sw val, (offset + 0xbc)(dst); \
+    sw val, (offset + 0xc0)(dst); \
+    sw val, (offset + 0xc4)(dst); \
+    sw val, (offset + 0xc8)(dst); \
+    sw val, (offset + 0xcc)(dst); \
+    sw val, (offset + 0xd0)(dst); \
+    sw val, (offset + 0xd4)(dst); \
+    sw val, (offset + 0xd8)(dst); \
+    sw val, (offset + 0xdc)(dst); \
+    sw val, (offset + 0xe0)(dst); \
+    sw val, (offset + 0xe4)(dst); \
+    sw val, (offset + 0xe8)(dst); \
+    sw val, (offset + 0xec)(dst); \
+    sw val, (offset + 0xf0)(dst); \
+    sw val, (offset + 0xf4)(dst); \
+    sw val, (offset + 0xf8)(dst); \
+    sw val, (offset + 0xfc)(dst);
+
+#define FILL128(dst, offset, val) \
+    sw val, (offset + 0x00)(dst); \
+    sw val, (offset + 0x04)(dst); \
+    sw val, (offset + 0x08)(dst); \
+    sw val, (offset + 0x0c)(dst); \
+    sw val, (offset + 0x10)(dst); \
+    sw val, (offset + 0x14)(dst); \
+    sw val, (offset + 0x18)(dst); \
+    sw val, (offset + 0x1c)(dst); \
+    sw val, (offset + 0x20)(dst); \
+    sw val, (offset + 0x24)(dst); \
+    sw val, (offset + 0x28)(dst); \
+    sw val, (offset + 0x2c)(dst); \
+    sw val, (offset + 0x30)(dst); \
+    sw val, (offset + 0x34)(dst); \
+    sw val, (offset + 0x38)(dst); \
+    sw val, (offset + 0x3c)(dst); \
+    sw val, (offset + 0x40)(dst); \
+    sw val, (offset + 0x44)(dst); \
+    sw val, (offset + 0x48)(dst); \
+    sw val, (offset + 0x4c)(dst); \
+    sw val, (offset + 0x50)(dst); \
+    sw val, (offset + 0x54)(dst); \
+    sw val, (offset + 0x58)(dst); \
+    sw val, (offset + 0x5c)(dst); \
+    sw val, (offset + 0x60)(dst); \
+    sw val, (offset + 0x64)(dst); \
+    sw val, (offset + 0x68)(dst); \
+    sw val, (offset + 0x6c)(dst); \
+    sw val, (offset + 0x70)(dst); \
+    sw val, (offset + 0x74)(dst); \
+    sw val, (offset + 0x78)(dst); \
+    sw val, (offset + 0x7c)(dst);
+
+#define FILL64(dst, offset, val) \
+    sw val, (offset + 0x00)(dst); \
+    sw val, (offset + 0x04)(dst); \
+    sw val, (offset + 0x08)(dst); \
+    sw val, (offset + 0x0c)(dst); \
+    sw val, (offset + 0x10)(dst); \
+    sw val, (offset + 0x14)(dst); \
+    sw val, (offset + 0x18)(dst); \
+    sw val, (offset + 0x1c)(dst); \
+    sw val, (offset + 0x20)(dst); \
+    sw val, (offset + 0x24)(dst); \
+    sw val, (offset + 0x28)(dst); \
+    sw val, (offset + 0x2c)(dst); \
+    sw val, (offset + 0x30)(dst); \
+    sw val, (offset + 0x34)(dst); \
+    sw val, (offset + 0x38)(dst); \
+    sw val, (offset + 0x3c)(dst);
+
+#define FILL32(dst, offset, val) \
+    sw val, (offset + 0x00)(dst); \
+    sw val, (offset + 0x04)(dst); \
+    sw val, (offset + 0x08)(dst); \
+    sw val, (offset + 0x0c)(dst); \
+    sw val, (offset + 0x10)(dst); \
+    sw val, (offset + 0x14)(dst); \
+    sw val, (offset + 0x18)(dst); \
+    sw val, (offset + 0x1c)(dst);
+
+/* Block size in bytes used by the main loop, and the macro that fills it. */
+#define FILL 64
+#define F_FILL FILL64
+
+
+/*
+ * Partial-word stores.
+ *   SWHI writes from the addressed byte up to the end of its word; it is
+ *   used for the unaligned head of the region.
+ *   SWLO writes from the start of the word up to and including the
+ *   addressed byte; it is used for the 1-3 byte tail.
+ */
+#ifdef ROCKBOX_BIG_ENDIAN
+# define SWHI swl /* high part is left in big-endian */
+# define SWLO swr /* low part is right in big-endian */
+#else
+# define SWHI swr /* high part is right in little-endian */
+# define SWLO swl /* low part is left in little-endian */
+#endif
+
+/*
+ * memset(void *s, int c, size_t n)
+ *
+ * a0: start of area to clear
+ * a1: char to fill with
+ * a2: size of area to clear
+ *
+ * Returns the original a0 in v0; a2 is zero on return.
+ *
+ * Regions shorter than 4 bytes are filled bytewise (small_memset).
+ * Otherwise the low byte of a1 is replicated into all four bytes of a1,
+ * an unaligned head is filled with one partial-word store, whole
+ * FILL-byte blocks are filled in a loop, the remaining whole words are
+ * filled by jumping into the middle of one more F_FILL expansion, and the
+ * last 1-3 bytes are filled with one partial-word store.
+ *
+ * Under ".set noreorder" the instruction following a branch or jump is
+ * its delay slot and is executed whether or not the branch is taken.
+ */
+    .section .icode, "ax", %progbits
+
+    .global memset
+    .type memset, %function
+
+    .set noreorder
+    .align 5
+memset:
+    beqz a1, 1f /* a zero fill value needs no replication */
+    move v0, a0 /* result */
+
+    andi a1, 0xff /* spread fillword */
+    sll t1, a1, 8
+    or a1, t1
+    sll t1, a1, 16
+    or a1, t1
+1:
+
+    sltiu t0, a2, 4 /* very small region? */
+    bnez t0, small_memset
+    andi t0, a0, 3 /* aligned? */
+
+    beqz t0, 1f
+    subu t0, 4 /* alignment in bytes */
+
+    SWHI a1, (a0) /* make word aligned */
+    subu a0, t0 /* word align ptr */
+    addu a2, t0 /* correct size */
+
+1:  ori t1, a2, (FILL-1) /* # of full blocks */
+    xori t1, (FILL-1)
+    beqz t1, memset_partial /* no block to fill */
+    andi t0, a2, (FILL-4) /* delay slot: t0 = bytes in whole words beyond the full blocks */
+
+    addu t1, a0 /* end address */
+    .set reorder
+1:  addiu a0, FILL
+    F_FILL( a0, -FILL, a1 )
+    bne t1, a0, 1b
+    .set noreorder
+
+memset_partial:
+    la t1, 2f /* where to start */
+    subu t1, t0 /* back up one sw (4 bytes of code) per word still to fill */
+    jr t1
+    addu a0, t0 /* dest ptr */
+
+    F_FILL( a0, -FILL, a1 ) /* ... but first do words ... */
+2:  andi a2, 3 /* 0 <= n <= 3 to go */
+
+    beqz a2, 1f
+    addu a0, a2 /* What's left */
+    /*
+     * a0 - 1 is the last byte to fill and the tail starts on a word
+     * boundary, so the store must cover word start .. a0 - 1. That is
+     * SWLO. SWHI would instead cover a0 - 1 .. word end, writing past
+     * the end of the region and skipping the first tail bytes.
+     */
+    SWLO a1, -1(a0)
+1:  jr ra
+    move a2, zero
+
+small_memset:
+    beqz a2, 2f
+    addu t1, a0, a2 /* delay slot: t1 = one past the last byte to fill */
+
+1:  addiu a0, 1 /* fill bytewise */
+    bne t1, a0, 1b
+    sb a1, -1(a0) /* delay slot: store to the byte just stepped over */
+
+2:  jr ra /* done */
+    move a2, zero
+
+    .set reorder
-- 
cgit v1.2.3