From a035261089403de259e74ce4dd196e2715138ed2 Mon Sep 17 00:00:00 2001 From: Thomas Martitz Date: Sat, 7 Jan 2012 19:56:09 +0100 Subject: Move optimized memcpy and friends and strlen to firmware/asm, using the new automatic-asm-picking infrastructure. --- firmware/asm/arm/memcpy.S | 176 ++++++++++++++++++++++++++++++++++++++++ firmware/asm/arm/memmove.S | 190 ++++++++++++++++++++++++++++++++++++++++++++ firmware/asm/arm/memset.S | 98 +++++++++++++++++++++++ firmware/asm/arm/memset16.S | 82 +++++++++++++++++++ 4 files changed, 546 insertions(+) create mode 100644 firmware/asm/arm/memcpy.S create mode 100644 firmware/asm/arm/memmove.S create mode 100644 firmware/asm/arm/memset.S create mode 100644 firmware/asm/arm/memset16.S (limited to 'firmware/asm/arm') diff --git a/firmware/asm/arm/memcpy.S b/firmware/asm/arm/memcpy.S new file mode 100644 index 0000000000..2a55fb5656 --- /dev/null +++ b/firmware/asm/arm/memcpy.S @@ -0,0 +1,176 @@ +/*************************************************************************** + * __________ __ ___. + * Open \______ \ ____ ____ | | _\_ |__ _______ ___ + * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / + * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < + * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ + * \/ \/ \/ \/ \/ + * $Id$ + * + * Copyright (C) 2006 Free Software Foundation, Inc. + * This file was originally part of the GNU C Library + * Contributed to glibc by MontaVista Software, Inc. (written by Nicolas Pitre) + * Adapted for Rockbox by Daniel Ankers + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. 
+ *
+ ****************************************************************************/
+
+#include "config.h"
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__
+#define pull            lsr
+#define push            lsl
+#else
+#define pull            lsl
+#define push            lsr
+#endif
+
+/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
+
+        .section .icode,"ax",%progbits
+
+        .align  2
+        .global memcpy
+        .type   memcpy,%function
+
+memcpy:
+        stmfd   sp!, {r0, r4, lr}       @ save dest (r0), r4 and return address
+
+        subs    r2, r2, #4              @ r2 = n - 4
+        blt     8f                      @ fewer than 4 bytes: byte tail only
+        ands    ip, r0, #3              @ ip = dest misalignment
+        bne     9f                      @ dest is not word aligned
+        ands    ip, r1, #3              @ ip = src misalignment
+        bne     10f                     @ src is not word aligned
+
+1:      subs    r2, r2, #(28)           @ r2 = bytes left - 32
+        stmfd   sp!, {r5 - r8}          @ more scratch registers for the block copy
+        blt     5f                      @ fewer than 32 bytes left
+
+2:
+3:
+4:      ldmia   r1!, {r3, r4, r5, r6, r7, r8, ip, lr}  @ copy 32 bytes per pass
+        subs    r2, r2, #32
+        stmia   r0!, {r3, r4, r5, r6, r7, r8, ip, lr}
+        bge     3b                      @ loop while 32 or more bytes remain
+
+5:      ands    ip, r2, #28             @ ip = bytes left in whole words (0..28)
+        rsb     ip, ip, #32             @ ip = 32 - ip, the offset added to pc below
+        addne   pc, pc, ip              @ C is always clear here
+        b       7f                      @ no whole words left
+6:      nop                             @ pad slot: pc reads two instructions ahead
+        ldr     r3, [r1], #4            @ one load per remaining word, entered part way
+        ldr     r4, [r1], #4
+        ldr     r5, [r1], #4
+        ldr     r6, [r1], #4
+        ldr     r7, [r1], #4
+        ldr     r8, [r1], #4
+        ldr     lr, [r1], #4
+
+        add     pc, pc, ip              @ same offset again, into the stores
+        nop                             @ pad slots for the pc-relative skip
+        nop
+        str     r3, [r0], #4            @ one store per word loaded above
+        str     r4, [r0], #4
+        str     r5, [r0], #4
+        str     r6, [r0], #4
+        str     r7, [r0], #4
+        str     r8, [r0], #4
+        str     lr, [r0], #4
+
+7:      ldmfd   sp!, {r5 - r8}          @ restore the extra scratch registers
+
+8:      movs    r2, r2, lsl #31         @ Z clear if bit 0 of r2 set, C = bit 1 of r2
+        ldrneb  r3, [r1], #1            @ bit 0 set: copy one byte
+        ldrcsb  r4, [r1], #1            @ bit 1 set: copy two more bytes
+        ldrcsb  ip, [r1]
+        strneb  r3, [r0], #1
+        strcsb  r4, [r0], #1
+        strcsb  ip, [r0]
+
+        ldmpc   regs="r0, r4"           @ macro defined elsewhere; presumably pops r0, r4 and pc
+
+9:      rsb     ip, ip, #4              @ ip = bytes needed to word align dest (1..3)
+        cmp     ip, #2
+        ldrgtb  r3, [r1], #1            @ third byte, only when ip is 3
+        ldrgeb  r4, [r1], #1            @ second byte, when ip is 2 or 3
+        ldrb    lr, [r1], #1            @ always at least one byte
+        strgtb  r3, [r0], #1
+        strgeb  r4, [r0], #1
+        subs    r2, r2, ip              @ account for the bytes just copied
+        strb    lr, [r0], #1            @ (strb leaves the flags from subs intact)
+        blt     8b                      @ fewer than 4 bytes left: byte tail
+        ands    ip, r1, #3              @ ip = src misalignment
+        beq     1b                      @ src is word aligned too: plain word copy
+
+10:     bic     r1, r1, #3              @ round src down to a word boundary
+        cmp     ip, #2
+        ldr     lr, [r1], #4            @ preload the first source word
+        beq     17f                     @ src offset 2: 16/16 variant
+        bgt     18f                     @ src offset 3: 24/8 variant; offset 1 falls through
+
+
+        .macro  forward_copy_shift pull push
+
+        subs    r2, r2, #28             @ r2 = bytes left - 32
+        blt     14f                     @ fewer than 32 bytes left
+
+11:     stmfd   sp!, {r5 - r9}          @ more scratch registers for the block loop
+
+12:
+13:     ldmia   r1!, {r4, r5, r6, r7}   @ next four source words
+        mov     r3, lr, pull #\pull     @ bytes still pending from the previous word
+        subs    r2, r2, #32             @ sets the flags used by bge below
+        ldmia   r1!, {r8, r9, ip, lr}   @ four more; lr is carried into the next pass
+        orr     r3, r3, r4, push #\push @ complete the word with bytes of the next one
+        mov     r4, r4, pull #\pull
+        orr     r4, r4, r5, push #\push
+        mov     r5, r5, pull #\pull
+        orr     r5, r5, r6, push #\push
+        mov     r6, r6, pull #\pull
+        orr     r6, r6, r7, push #\push
+        mov     r7, r7, pull #\pull
+        orr     r7, r7, r8, push #\push
+        mov     r8, r8, pull #\pull
+        orr     r8, r8, r9, push #\push
+        mov     r9, r9, pull #\pull
+        orr     r9, r9, ip, push #\push
+        mov     ip, ip, pull #\pull
+        orr     ip, ip, lr, push #\push
+        stmia   r0!, {r3, r4, r5, r6, r7, r8, r9, ip}  @ store 32 realigned bytes
+        bge     12b                     @ loop while 32 or more bytes remain
+
+        ldmfd   sp!, {r5 - r9}
+
+14:     ands    ip, r2, #28             @ ip = bytes left in whole words
+        beq     16f
+
+15:     mov     r3, lr, pull #\pull     @ same merge, one word per pass
+        ldr     lr, [r1], #4
+        subs    ip, ip, #4
+        orr     r3, r3, lr, push #\push
+        str     r3, [r0], #4
+        bgt     15b
+
+16:     sub     r1, r1, #(\push / 8)    @ step src back over the bytes of lr not yet stored
+        b       8b                      @ finish with the byte tail
+
+        .endm
+
+
+        forward_copy_shift      pull=8  push=24
+
+17:     forward_copy_shift      pull=16 push=16
+
+18:     forward_copy_shift      pull=24 push=8
+
diff --git a/firmware/asm/arm/memmove.S b/firmware/asm/arm/memmove.S
new file mode 100644
index 0000000000..d8cab048be
--- /dev/null
+++ b/firmware/asm/arm/memmove.S
@@ -0,0 +1,190 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006 Free Software Foundation, Inc.
+ * This file was originally part of the GNU C Library
+ * Contributed to glibc by MontaVista Software, Inc. (written by Nicolas Pitre)
+ * Adapted for Rockbox by Daniel Ankers
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#include "config.h"
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__
+#define pull            lsr
+#define push            lsl
+#else
+#define pull            lsl
+#define push            lsr
+#endif
+
+        .text
+
+/*
+ * Prototype: void *memmove(void *dest, const void *src, size_t n);
+ *
+ * Note:
+ *
+ * If the memory regions don't overlap, we simply branch to memcpy which is
+ * normally a bit faster. Otherwise the copy is done going downwards.
+ */
+
+        .section .icode,"ax",%progbits
+
+        .align  2
+        .global memmove
+        .type   memmove,%function
+
+memmove:
+
+        subs    ip, r0, r1              @ ip = dest - src
+        cmphi   r2, ip                  @ dest above src: compare n with that gap
+        bls     memcpy                  @ dest <= src, or n <= gap: tail-call memcpy
+
+        stmfd   sp!, {r0, r4, lr}       @ save dest (r0), r4 and return address
+        add     r1, r1, r2              @ r1 = one past the end of src
+        add     r0, r0, r2              @ r0 = one past the end of dest
+        subs    r2, r2, #4              @ r2 = n - 4
+        blt     8f                      @ fewer than 4 bytes: byte tail only
+        ands    ip, r0, #3              @ ip = misalignment of the dest end pointer
+        bne     9f                      @ dest end is not word aligned
+        ands    ip, r1, #3              @ ip = misalignment of the src end pointer
+        bne     10f                     @ src end is not word aligned
+
+1:      subs    r2, r2, #(28)           @ r2 = bytes left - 32
+        stmfd   sp!, {r5 - r8}          @ more scratch registers for the block copy
+        blt     5f                      @ fewer than 32 bytes left
+
+2:
+3:
+4:      ldmdb   r1!, {r3, r4, r5, r6, r7, r8, ip, lr}  @ copy 32 bytes per pass, descending
+        subs    r2, r2, #32
+        stmdb   r0!, {r3, r4, r5, r6, r7, r8, ip, lr}
+        bge     3b                      @ loop while 32 or more bytes remain
+
+5:      ands    ip, r2, #28             @ ip = bytes left in whole words (0..28)
+        rsb     ip, ip, #32             @ ip = 32 - ip, the offset added to pc below
+        addne   pc, pc, ip              @ C is always clear here
+        b       7f                      @ no whole words left
+6:      nop                             @ pad slot: pc reads two instructions ahead
+        ldr     r3, [r1, #-4]!          @ one load per remaining word, entered part way
+        ldr     r4, [r1, #-4]!
+        ldr     r5, [r1, #-4]!
+        ldr     r6, [r1, #-4]!
+        ldr     r7, [r1, #-4]!
+        ldr     r8, [r1, #-4]!
+        ldr     lr, [r1, #-4]!
+
+        add     pc, pc, ip              @ same offset again, into the stores
+        nop                             @ pad slots for the pc-relative skip
+        nop
+        str     r3, [r0, #-4]!          @ one store per word loaded above
+        str     r4, [r0, #-4]!
+        str     r5, [r0, #-4]!
+        str     r6, [r0, #-4]!
+        str     r7, [r0, #-4]!
+        str     r8, [r0, #-4]!
+        str     lr, [r0, #-4]!
+
+7:      ldmfd   sp!, {r5 - r8}          @ restore the extra scratch registers
+
+8:      movs    r2, r2, lsl #31         @ Z clear if bit 0 of r2 set, C = bit 1 of r2
+        ldrneb  r3, [r1, #-1]!          @ bit 0 set: copy one byte
+        ldrcsb  r4, [r1, #-1]!          @ bit 1 set: copy two more bytes
+        ldrcsb  ip, [r1, #-1]
+        strneb  r3, [r0, #-1]!
+        strcsb  r4, [r0, #-1]!
+        strcsb  ip, [r0, #-1]
+        ldmpc   regs="r0, r4"           @ macro defined elsewhere; presumably pops r0, r4 and pc
+
+9:      cmp     ip, #2                  @ ip = bytes to copy until dest is word aligned
+        ldrgtb  r3, [r1, #-1]!          @ third byte, only when ip is 3
+        ldrgeb  r4, [r1, #-1]!          @ second byte, when ip is 2 or 3
+        ldrb    lr, [r1, #-1]!          @ always at least one byte
+        strgtb  r3, [r0, #-1]!
+        strgeb  r4, [r0, #-1]!
+        subs    r2, r2, ip              @ account for the bytes just copied
+        strb    lr, [r0, #-1]!          @ (strb leaves the flags from subs intact)
+        blt     8b                      @ fewer than 4 bytes left: byte tail
+        ands    ip, r1, #3              @ ip = misalignment of the src pointer
+        beq     1b                      @ src is word aligned too: plain word copy
+
+10:     bic     r1, r1, #3              @ round src down to a word boundary
+        cmp     ip, #2
+        ldr     r3, [r1, #0]            @ preload the word holding the topmost src bytes
+        beq     17f                     @ src offset 2: 16/16 variant
+        blt     18f                     @ src offset 1: 24/8 variant; offset 3 falls through
+
+
+        .macro  backward_copy_shift push pull
+
+        subs    r2, r2, #28             @ r2 = bytes left - 32
+        blt     14f                     @ fewer than 32 bytes left
+
+11:     stmfd   sp!, {r5 - r9}          @ more scratch registers for the block loop
+
+12:
+13:     ldmdb   r1!, {r7, r8, r9, ip}   @ four source words below the current one
+        mov     lr, r3, push #\push     @ bytes still pending from the previous word
+        subs    r2, r2, #32             @ sets the flags used by bge below
+        ldmdb   r1!, {r3, r4, r5, r6}   @ four more; r3 is carried into the next pass
+        orr     lr, lr, ip, pull #\pull @ complete the word with bytes of the next lower one
+        mov     ip, ip, push #\push
+        orr     ip, ip, r9, pull #\pull
+        mov     r9, r9, push #\push
+        orr     r9, r9, r8, pull #\pull
+        mov     r8, r8, push #\push
+        orr     r8, r8, r7, pull #\pull
+        mov     r7, r7, push #\push
+        orr     r7, r7, r6, pull #\pull
+        mov     r6, r6, push #\push
+        orr     r6, r6, r5, pull #\pull
+        mov     r5, r5, push #\push
+        orr     r5, r5, r4, pull #\pull
+        mov     r4, r4, push #\push
+        orr     r4, r4, r3, pull #\pull
+        stmdb   r0!, {r4 - r9, ip, lr}  @ store 32 realigned bytes
+        bge     12b                     @ loop while 32 or more bytes remain
+
+        ldmfd   sp!, {r5 - r9}
+
+14:     ands    ip, r2, #28             @ ip = bytes left in whole words
+        beq     16f
+
+15:     mov     lr, r3, push #\push     @ same merge, one word per pass
+        ldr     r3, [r1, #-4]!
+        subs    ip, ip, #4
+        orr     lr, lr, r3, pull #\pull
+        str     lr, [r0, #-4]!
+        bgt     15b
+
+16:     add     r1, r1, #(\pull / 8)    @ step src forward over the bytes of r3 not yet stored
+        b       8b                      @ finish with the byte tail
+
+        .endm
+
+
+        backward_copy_shift     push=8  pull=24
+
+17:     backward_copy_shift     push=16 pull=16
+
+18:     backward_copy_shift     push=24 pull=8
+
+
diff --git a/firmware/asm/arm/memset.S b/firmware/asm/arm/memset.S
new file mode 100644
index 0000000000..682da874ce
--- /dev/null
+++ b/firmware/asm/arm/memset.S
@@ -0,0 +1,98 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006 by Thom Johansen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+#include "config.h"
+
+        .section .icode,"ax",%progbits
+
+        .align  2
+
+/* The following code is based on code found in Linux kernel version 2.6.15.3
+ * linux/arch/arm/lib/memset.S
+ *
+ * Copyright (C) 1995-2000 Russell King
+ */
+
+/* This code will align a pointer for memset, if needed */
+1:      cmp     r2, #4                  @ 1 do we have enough
+        blt     5f                      @ 1 bytes to align with?
+        cmp     r3, #2                  @ 1 r3 = end pointer & 3, set at the memset entry
+        strgtb  r1, [r0, #-1]!          @ 1 third byte, only when r3 is 3
+        strgeb  r1, [r0, #-1]!          @ 1 second byte, when r3 is 2 or 3
+        strb    r1, [r0, #-1]!          @ 1 always at least one byte
+        sub     r2, r2, r3              @ 1 r2 = r2 - r3
+        b       2f                      @ continue with the word aligned fill
+
+        .global memset
+        .type   memset,%function
+memset:                                 @ in: r0 = start address, r1 = fill byte, r2 = byte count
+        add     r0, r0, r2              @ we'll write backwards in memory
+        ands    r3, r0, #3              @ 1 unaligned?
+        bne     1b                      @ 1
+2:
+/*
+ * we know that the pointer in r0 is aligned to a word boundary.
+ */
+        orr     r1, r1, r1, lsl #8      @ NOTE(review): r1 is not masked to 8 bits first - confirm callers
+        orr     r1, r1, r1, lsl #16     @ r1 = fill pattern replicated across the word
+        mov     r3, r1                  @ second copy for the multi-register stores
+        cmp     r2, #16
+        blt     5f                      @ fewer than 16 bytes: skip the block stores
+/*
+ * We need an extra register for this loop - save the return address and
+ * use the LR
+ */
+        str     lr, [sp, #-4]!
+        mov     ip, r1
+        mov     lr, r1
+
+3:      subs    r2, r2, #64
+        stmgedb r0!, {r1, r3, ip, lr}   @ 64 bytes at a time.
+        stmgedb r0!, {r1, r3, ip, lr}
+        stmgedb r0!, {r1, r3, ip, lr}
+        stmgedb r0!, {r1, r3, ip, lr}
+        bgt     3b
+        ldrpc   cond=eq                 @ Now <64 bytes to go. (ldrpc is a macro defined elsewhere; presumably pops pc)
+/*
+ * No need to correct the count; we're only testing bits from now on
+ */
+        tst     r2, #32                 @ 32 bytes if bit 5 of the count is set
+        stmnedb r0!, {r1, r3, ip, lr}
+        stmnedb r0!, {r1, r3, ip, lr}
+        tst     r2, #16                 @ 16 bytes if bit 4 is set
+        stmnedb r0!, {r1, r3, ip, lr}
+        ldr     lr, [sp], #4            @ reload the return address saved above
+
+5:      tst     r2, #8                  @ 8 bytes if bit 3 is set
+        stmnedb r0!, {r1, r3}
+        tst     r2, #4                  @ 4 bytes if bit 2 is set
+        strne   r1, [r0, #-4]!
+/*
+ * When we get here, we've got less than 4 bytes to set.  We
+ * may have an unaligned pointer as well.
+ */
+6:      tst     r2, #2                  @ 2 bytes if bit 1 is set
+        strneb  r1, [r0, #-1]!
+        strneb  r1, [r0, #-1]!
+        tst     r2, #1                  @ 1 byte if bit 0 is set
+        strneb  r1, [r0, #-1]!
+        bx      lr                      @ r0 has been stepped back down by every store
+.end:
+        .size   memset,.end-memset
diff --git a/firmware/asm/arm/memset16.S b/firmware/asm/arm/memset16.S
new file mode 100644
index 0000000000..5c787b1bed
--- /dev/null
+++ b/firmware/asm/arm/memset16.S
@@ -0,0 +1,82 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006 by Thom Johansen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+#include "config.h"
+
+        .section .icode,"ax",%progbits
+
+        .align  2
+
+/* The following code is based on code from the Linux kernel version 2.6.15.3,
+ * linux/arch/arm/lib/memset.S
+ *
+ * Copyright (C) 1995-2000 Russell King
+ */
+
+        .global memset16
+        .type   memset16,%function
+memset16:                               @ in: r0 = start address, r1 = halfword value, r2 = count in halfwords
+        tst     r0, #2                  @ unaligned?
+        cmpne   r2, #0                  @ ... and is there anything to store at all?
+        strneh  r1, [r0], #2            @ store one halfword to align
+        subne   r2, r2, #1
+
+/*
+ * we know that the pointer in r0 is aligned to a word boundary.
+ */
+        orr     r1, r1, r1, lsl #16     @ NOTE(review): bit 0 of r0 is never tested and r1 is not masked to 16 bits - confirm callers
+        mov     r3, r1                  @ second copy for the multi-register stores
+        cmp     r2, #8
+        blt     4f                      @ fewer than 8 halfwords: skip the block stores
+/*
+ * We need an extra register for this loop - save the return address and
+ * use the LR
+ */
+        str     lr, [sp, #-4]!
+        mov     ip, r1
+        mov     lr, r1
+
+2:      subs    r2, r2, #32             @ 32 halfwords per pass
+        stmgeia r0!, {r1, r3, ip, lr}   @ 64 bytes at a time.
+        stmgeia r0!, {r1, r3, ip, lr}
+        stmgeia r0!, {r1, r3, ip, lr}
+        stmgeia r0!, {r1, r3, ip, lr}
+        bgt     2b
+        ldrpc   cond=eq                 @ Now <64 bytes to go. (ldrpc is a macro defined elsewhere; presumably pops pc)
+/*
+ * No need to correct the count; we're only testing bits from now on
+ */
+        tst     r2, #16                 @ 16 halfwords if bit 4 of the count is set
+        stmneia r0!, {r1, r3, ip, lr}
+        stmneia r0!, {r1, r3, ip, lr}
+        tst     r2, #8                  @ 8 halfwords if bit 3 is set
+        stmneia r0!, {r1, r3, ip, lr}
+        ldr     lr, [sp], #4            @ reload the return address saved above
+
+4:      tst     r2, #4                  @ 4 halfwords if bit 2 is set
+        stmneia r0!, {r1, r3}
+        tst     r2, #2                  @ 2 halfwords if bit 1 is set
+        strne   r1, [r0], #4
+
+        tst     r2, #1                  @ final single halfword if bit 0 is set
+        strneh  r1, [r0], #2
+        bx      lr                      @ r0 has been advanced by every store
+.end:
+        .size   memset16,.end-memset16
-- 
cgit v1.2.3