From a035261089403de259e74ce4dd196e2715138ed2 Mon Sep 17 00:00:00 2001
From: Thomas Martitz <kugel@rockbox.org>
Date: Sat, 7 Jan 2012 19:56:09 +0100
Subject: Move optimized memcpy and friends and strlen to firmware/asm, using
 the new automatic-asm-picking infrastructure.

---
 firmware/asm/arm/memcpy.S   | 176 ++++++++++++++++++++++++++++++++++++++++
 firmware/asm/arm/memmove.S  | 190 ++++++++++++++++++++++++++++++++++++++++++++
 firmware/asm/arm/memset.S   |  98 +++++++++++++++++++++++
 firmware/asm/arm/memset16.S |  82 +++++++++++++++++++
 4 files changed, 546 insertions(+)
 create mode 100644 firmware/asm/arm/memcpy.S
 create mode 100644 firmware/asm/arm/memmove.S
 create mode 100644 firmware/asm/arm/memset.S
 create mode 100644 firmware/asm/arm/memset16.S

(limited to 'firmware/asm/arm')

diff --git a/firmware/asm/arm/memcpy.S b/firmware/asm/arm/memcpy.S
new file mode 100644
index 0000000000..2a55fb5656
--- /dev/null
+++ b/firmware/asm/arm/memcpy.S
@@ -0,0 +1,176 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006 Free Software Foundation, Inc.
+ * This file was originally part of the GNU C Library
+ * Contributed to glibc by MontaVista Software, Inc. (written by Nicolas Pitre)
+ * Adapted for Rockbox by Daniel Ankers
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#include "config.h"
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__               /* little-endian build */
+#define pull            lsr     /* shift bytes toward the lower-address end of a word */
+#define push            lsl     /* shift bytes toward the higher-address end of a word */
+#else                           /* big-endian build: the two directions swap */
+#define pull            lsl
+#define push            lsr
+#endif                          /* __ARMEB__ */
+
+/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
+
+    .section    .icode,"ax",%progbits
+
+    .align      2
+    .global     memcpy
+    .type       memcpy,%function
+/* In: r0 = dest, r1 = src, r2 = n. Copies towards ascending addresses; dest is stacked on entry so it can be handed back in r0. */
+memcpy:
+        stmfd   sp!, {r0, r4, lr}       @ stack dest, r4 and the return address
+
+        subs    r2, r2, #4              @ r2 = n - 4
+        blt 8f                          @ n < 4: only the byte tail is needed
+        ands    ip, r0, #3              @ ip = dest & 3
+        bne 9f                          @ dest is not word aligned
+        ands    ip, r1, #3              @ ip = src & 3
+        bne 10f                         @ dest aligned, src not: shifting copy
+
+1:      subs    r2, r2, #(28)           @ r2 = bytes remaining - 32
+        stmfd   sp!, {r5 - r8}          @ more scratch registers; flags are left untouched
+        blt 5f                          @ fewer than 32 bytes remain
+
+2:
+3:
+4:      ldmia   r1!, {r3, r4, r5, r6, r7, r8, ip, lr}   @ load 32 bytes
+        subs    r2, r2, #32
+        stmia   r0!, {r3, r4, r5, r6, r7, r8, ip, lr}   @ store 32 bytes
+        bge 3b                          @ at least 32 more bytes remain
+
+5:      ands    ip, r2, #28             @ ip = 4 * whole words left (0..28)
+        rsb ip, ip, #32                 @ ip = 32 - ip; flags still come from the ands
+        addne   pc, pc, ip      @ C is always clear here
+        b   7f                          @ no whole words left
+6:      nop                             @ the pc value read by the addne above is this address
+        ldr r3, [r1], #4                @ the jump lands so that exactly (words left) loads run
+        ldr r4, [r1], #4
+        ldr r5, [r1], #4
+        ldr r6, [r1], #4
+        ldr r7, [r1], #4
+        ldr r8, [r1], #4
+        ldr lr, [r1], #4
+
+        add pc, pc, ip                  @ same offset again, this time into the str table
+        nop
+        nop
+        str r3, [r0], #4
+        str r4, [r0], #4
+        str r5, [r0], #4
+        str r6, [r0], #4
+        str r7, [r0], #4
+        str r8, [r0], #4
+        str lr, [r0], #4
+
+7:      ldmfd   sp!, {r5 - r8}          @ restore r5-r8
+
+8:      movs    r2, r2, lsl #31         @ N/Z reflect bit 0 of r2, C receives bit 1
+        ldrneb  r3, [r1], #1            @ bit 0 set: one byte
+        ldrcsb  r4, [r1], #1            @ bit 1 set: two more bytes
+        ldrcsb  ip, [r1]
+        strneb  r3, [r0], #1
+        strcsb  r4, [r0], #1
+        strcsb  ip, [r0]
+
+        ldmpc   regs="r0, r4"           @ macro defined outside this file; presumably pops r0, r4 and pc (TODO confirm)
+
+9:      rsb ip, ip, #4                  @ ip = bytes needed to word-align dest (1..3)
+        cmp ip, #2
+        ldrgtb  r3, [r1], #1            @ only when ip = 3
+        ldrgeb  r4, [r1], #1            @ when ip >= 2
+        ldrb    lr, [r1], #1            @ always
+        strgtb  r3, [r0], #1
+        strgeb  r4, [r0], #1
+        subs    r2, r2, ip              @ account for the alignment bytes
+        strb    lr, [r0], #1
+        blt 8b                          @ fewer than 4 bytes remain: byte tail
+        ands    ip, r1, #3              @ ip = src & 3 now that dest is aligned
+        beq 1b                          @ both aligned: plain word copy
+
+10:     bic r1, r1, #3                  @ round src down to a word boundary
+        cmp ip, #2                      @ ip = src misalignment (1..3)
+        ldr lr, [r1], #4                @ preload the word holding the first source bytes
+        beq 17f                         @ misaligned by 2
+        bgt 18f                         @ misaligned by 3; misaligned by 1 falls through
+
+/* forward_copy_shift: ascending word copy for a misaligned source; each stored word is merged from two adjacent aligned source words, with lr carrying the most recently loaded one. */
+        .macro  forward_copy_shift pull push
+
+        subs    r2, r2, #28             @ r2 = bytes remaining - 32
+        blt 14f                         @ fewer than 32 bytes remain
+
+11:     stmfd   sp!, {r5 - r9}          @ scratch for the 8-word loop
+
+12:
+13:     ldmia   r1!, {r4, r5, r6, r7}   @ next four source words
+        mov r3, lr, pull #\pull         @ leftover bytes of the previously loaded word
+        subs    r2, r2, #32
+        ldmia   r1!, {r8, r9, ip, lr}   @ four more; lr carries over to the next pass
+        orr r3, r3, r4, push #\push     @ merge in the leading bytes of the following word
+        mov r4, r4, pull #\pull
+        orr r4, r4, r5, push #\push
+        mov r5, r5, pull #\pull
+        orr r5, r5, r6, push #\push
+        mov r6, r6, pull #\pull
+        orr r6, r6, r7, push #\push
+        mov r7, r7, pull #\pull
+        orr r7, r7, r8, push #\push
+        mov r8, r8, pull #\pull
+        orr r8, r8, r9, push #\push
+        mov r9, r9, pull #\pull
+        orr r9, r9, ip, push #\push
+        mov ip, ip, pull #\pull
+        orr ip, ip, lr, push #\push
+        stmia   r0!, {r3, r4, r5, r6, r7, r8, r9, ip}   @ store 32 realigned bytes
+        bge 12b                         @ at least 32 more bytes remain
+
+        ldmfd   sp!, {r5 - r9}
+
+14:     ands    ip, r2, #28             @ ip = 4 * whole words left (0..28)
+        beq 16f
+
+15:     mov r3, lr, pull #\pull         @ one realigned word per pass
+        ldr lr, [r1], #4
+        subs    ip, ip, #4
+        orr r3, r3, lr, push #\push
+        str r3, [r0], #4
+        bgt 15b
+
+16:     sub r1, r1, #(\push / 8)        @ undo the round-down and preload: r1 = true next source byte
+        b   8b                          @ finish with the byte tail
+
+        .endm
+
+
+        forward_copy_shift  pull=8  push=24     @ src misaligned by 1
+
+17:     forward_copy_shift  pull=16 push=16     @ src misaligned by 2
+
+18:     forward_copy_shift  pull=24 push=8      @ src misaligned by 3
+
diff --git a/firmware/asm/arm/memmove.S b/firmware/asm/arm/memmove.S
new file mode 100644
index 0000000000..d8cab048be
--- /dev/null
+++ b/firmware/asm/arm/memmove.S
@@ -0,0 +1,190 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006 Free Software Foundation, Inc.
+ * This file was originally part of the GNU C Library
+ * Contributed to glibc by MontaVista Software, Inc. (written by Nicolas Pitre)
+ * Adapted for Rockbox by Daniel Ankers
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#include "config.h"
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__               /* little-endian build */
+#define pull            lsr     /* shift bytes toward the lower-address end of a word */
+#define push            lsl     /* shift bytes toward the higher-address end of a word */
+#else                           /* big-endian build: the two directions swap */
+#define pull            lsl
+#define push            lsr
+#endif                          /* __ARMEB__ */
+
+        .text                           @ replaced by the .section directive below before any code is emitted
+
+/*
+ * Prototype: void *memmove(void *dest, const void *src, size_t n);
+ *
+ * Note:
+ *
+ * If the memory regions don't overlap, we simply branch to memcpy which is
+ * normally a bit faster. Otherwise the copy is done going downwards.
+ */
+
+    .section    .icode,"ax",%progbits
+
+    .align      2
+    .global     memmove
+    .type       memmove,%function
+/* In: r0 = dest, r1 = src, r2 = n. The descending path stacks dest on entry so it can be handed back in r0. */
+memmove:
+
+        subs    ip, r0, r1              @ ip = dest - src
+        cmphi   r2, ip                  @ only when dest > src (unsigned): compare n with that gap
+        bls memcpy                      @ dest <= src, or n <= gap: an ascending copy is safe
+
+        stmfd   sp!, {r0, r4, lr}       @ stack dest, r4 and the return address
+        add r1, r1, r2                  @ r1 = one past the end of src
+        add r0, r0, r2                  @ r0 = one past the end of dest
+        subs    r2, r2, #4              @ r2 = n - 4
+        blt 8f                          @ n < 4: only the byte tail is needed
+        ands    ip, r0, #3              @ ip = (dest end) & 3
+        bne 9f                          @ dest end is not word aligned
+        ands    ip, r1, #3              @ ip = (src end) & 3
+        bne 10f                         @ dest end aligned, src end not: shifting copy
+
+1:      subs    r2, r2, #(28)           @ r2 = bytes remaining - 32
+        stmfd   sp!, {r5 - r8}          @ more scratch registers; flags are left untouched
+        blt 5f                          @ fewer than 32 bytes remain
+
+2:
+3:
+4:      ldmdb   r1!, {r3, r4, r5, r6, r7, r8, ip, lr}   @ load the 32 bytes below r1
+        subs    r2, r2, #32
+        stmdb   r0!, {r3, r4, r5, r6, r7, r8, ip, lr}   @ store them below r0
+        bge 3b                          @ at least 32 more bytes remain
+
+5:      ands    ip, r2, #28             @ ip = 4 * whole words left (0..28)
+        rsb ip, ip, #32                 @ ip = 32 - ip; flags still come from the ands
+        addne   pc, pc, ip      @ C is always clear here
+        b   7f                          @ no whole words left
+6:      nop                             @ the pc value read by the addne above is this address
+        ldr r3, [r1, #-4]!              @ the jump lands so that exactly (words left) loads run
+        ldr r4, [r1, #-4]!
+        ldr r5, [r1, #-4]!
+        ldr r6, [r1, #-4]!
+        ldr r7, [r1, #-4]!
+        ldr r8, [r1, #-4]!
+        ldr lr, [r1, #-4]!
+
+        add pc, pc, ip                  @ same offset again, this time into the str table
+        nop
+        nop
+        str r3, [r0, #-4]!
+        str r4, [r0, #-4]!
+        str r5, [r0, #-4]!
+        str r6, [r0, #-4]!
+        str r7, [r0, #-4]!
+        str r8, [r0, #-4]!
+        str lr, [r0, #-4]!
+
+7:      ldmfd   sp!, {r5 - r8}          @ restore r5-r8
+
+8:      movs    r2, r2, lsl #31         @ N/Z reflect bit 0 of r2, C receives bit 1
+        ldrneb  r3, [r1, #-1]!          @ bit 0 set: one byte
+        ldrcsb  r4, [r1, #-1]!          @ bit 1 set: two more bytes
+        ldrcsb  ip, [r1, #-1]
+        strneb  r3, [r0, #-1]!
+        strcsb  r4, [r0, #-1]!
+        strcsb  ip, [r0, #-1]
+        ldmpc   regs="r0, r4"           @ macro defined outside this file; presumably pops r0, r4 and pc (TODO confirm)
+
+9:      cmp ip, #2                      @ ip = bytes above the last word boundary of dest (1..3)
+        ldrgtb  r3, [r1, #-1]!          @ only when ip = 3
+        ldrgeb  r4, [r1, #-1]!          @ when ip >= 2
+        ldrb    lr, [r1, #-1]!          @ always
+        strgtb  r3, [r0, #-1]!
+        strgeb  r4, [r0, #-1]!
+        subs    r2, r2, ip              @ account for the alignment bytes
+        strb    lr, [r0, #-1]!
+        blt 8b                          @ fewer than 4 bytes remain: byte tail
+        ands    ip, r1, #3              @ ip = (src end) & 3 now that the dest end is aligned
+        beq 1b                          @ both aligned: plain word copy
+
+10:     bic r1, r1, #3                  @ round the src end down to a word boundary
+        cmp ip, #2                      @ ip = src end misalignment (1..3)
+        ldr r3, [r1, #0]                @ preload the word holding the last source bytes
+        beq 17f                         @ misaligned by 2
+        blt 18f                         @ misaligned by 1; misaligned by 3 falls through
+
+/* backward_copy_shift: descending word copy for a misaligned source; each stored word is merged from two adjacent aligned source words, with r3 carrying the most recently loaded one. */
+        .macro  backward_copy_shift push pull
+
+        subs    r2, r2, #28             @ r2 = bytes remaining - 32
+        blt 14f                         @ fewer than 32 bytes remain
+
+11:     stmfd   sp!, {r5 - r9}          @ scratch for the 8-word loop
+
+12:
+13:     ldmdb   r1!, {r7, r8, r9, ip}   @ the four source words below r1
+        mov     lr, r3, push #\push     @ leftover bytes of the previously loaded word
+        subs    r2, r2, #32
+        ldmdb   r1!, {r3, r4, r5, r6}   @ four more; r3 carries over to the next pass
+        orr     lr, lr, ip, pull #\pull @ merge in the trailing bytes of the word below
+        mov     ip, ip, push #\push
+        orr     ip, ip, r9, pull #\pull
+        mov     r9, r9, push #\push
+        orr     r9, r9, r8, pull #\pull
+        mov     r8, r8, push #\push
+        orr     r8, r8, r7, pull #\pull
+        mov     r7, r7, push #\push
+        orr     r7, r7, r6, pull #\pull
+        mov     r6, r6, push #\push
+        orr     r6, r6, r5, pull #\pull
+        mov     r5, r5, push #\push
+        orr     r5, r5, r4, pull #\pull
+        mov     r4, r4, push #\push
+        orr     r4, r4, r3, pull #\pull
+        stmdb   r0!, {r4 - r9, ip, lr}  @ store 32 realigned bytes below r0
+        bge 12b                         @ at least 32 more bytes remain
+
+        ldmfd   sp!, {r5 - r9}
+
+14:     ands    ip, r2, #28             @ ip = 4 * whole words left (0..28)
+        beq 16f
+
+15:     mov     lr, r3, push #\push     @ one realigned word per pass
+        ldr r3, [r1, #-4]!
+        subs    ip, ip, #4
+        orr lr, lr, r3, pull #\pull
+        str lr, [r0, #-4]!
+        bgt 15b
+
+16:     add r1, r1, #(\pull / 8)        @ undo the round-down: r1 = true end of the bytes still to copy
+        b   8b                          @ finish with the byte tail
+
+        .endm
+
+
+        backward_copy_shift push=8  pull=24     @ src end misaligned by 3
+
+17:     backward_copy_shift push=16 pull=16     @ src end misaligned by 2
+
+18:     backward_copy_shift push=24 pull=8      @ src end misaligned by 1
+
+
diff --git a/firmware/asm/arm/memset.S b/firmware/asm/arm/memset.S
new file mode 100644
index 0000000000..682da874ce
--- /dev/null
+++ b/firmware/asm/arm/memset.S
@@ -0,0 +1,98 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006 by Thom Johansen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+#include "config.h"
+
+    .section    .icode,"ax",%progbits
+
+    .align      2
+
+/*  The following code is based on code found in Linux kernel version 2.6.15.3
+ *  linux/arch/arm/lib/memset.S
+ *
+ *  Copyright (C) 1995-2000 Russell King
+ */
+
+/* Entered from memset with r3 = (end pointer) & 3: stores r3 bytes downwards so the end pointer becomes word aligned. */
+1:      cmp     r2, #4                  @ 1 do we have enough
+        blt     5f                      @ 1 bytes to align with?
+        cmp     r3, #2                  @ 1 r3 = number of bytes above the word boundary (1..3)
+        strgtb  r1, [r0, #-1]!          @ 1 only when r3 = 3
+        strgeb  r1, [r0, #-1]!          @ 1 when r3 >= 2
+        strb    r1, [r0, #-1]!          @ 1 always
+        sub     r2, r2, r3              @ 1 r2 = r2 - r3
+        b 2f                            @ r0 is now word aligned
+/* In: r0 = dest, r1 = fill byte, r2 = byte count. The buffer is filled from its end downwards. */
+        .global     memset
+        .type       memset,%function
+memset:
+        add     r0, r0, r2              @ we'll write backwards in memory
+        ands    r3, r0, #3              @ 1 unaligned?
+        bne     1b                      @ 1
+2:
+/*
+ * we know that the pointer in r0 is aligned to a word boundary.
+ */
+        orr     r1, r1, r1, lsl #8      @ copy the low byte into bits 8-15; NOTE(review): assumes bits 8-31 of r1 are clear on entry - confirm
+        orr     r1, r1, r1, lsl #16     @ and the low halfword into bits 16-31
+        mov     r3, r1                  @ second register holding the fill word
+        cmp     r2, #16
+        blt     5f                      @ fewer than 16 bytes: lr is not needed as scratch
+/*
+ * We need an extra register for this loop - save the return address and
+ * use the LR
+ */
+        str     lr, [sp, #-4]!          @ push lr
+        mov     ip, r1
+        mov     lr, r1
+
+3:      subs    r2, r2, #64
+        stmgedb r0!, {r1, r3, ip, lr}   @ 64 bytes at a time.
+        stmgedb r0!, {r1, r3, ip, lr}
+        stmgedb r0!, {r1, r3, ip, lr}
+        stmgedb r0!, {r1, r3, ip, lr}
+        bgt     3b                      @ more bytes remain after this pass
+        ldrpc   cond=eq                 @ Now <64 bytes to go. (ldrpc is defined outside this file; presumably returns via the stacked lr when r2 is 0 - TODO confirm)
+/*
+ * No need to correct the count; we're only testing bits from now on
+ */
+        tst     r2, #32                 @ bits 0-5 of r2 still equal the remaining count
+        stmnedb r0!, {r1, r3, ip, lr}   @ 32 bytes
+        stmnedb r0!, {r1, r3, ip, lr}
+        tst     r2, #16
+        stmnedb r0!, {r1, r3, ip, lr}   @ 16 bytes
+        ldr     lr, [sp], #4            @ pop lr
+
+5:      tst     r2, #8
+        stmnedb r0!, {r1, r3}           @ 8 bytes
+        tst     r2, #4
+        strne   r1, [r0, #-4]!          @ 4 bytes
+/*
+ * When we get here, there are fewer than 4 bytes left to set.  The
+ * pointer may be unaligned as well, so only byte stores are used.
+ */
+6:      tst     r2, #2
+        strneb  r1, [r0, #-1]!          @ 2 bytes
+        strneb  r1, [r0, #-1]!
+        tst     r2, #1
+        strneb  r1, [r0, #-1]!          @ 1 byte
+        bx      lr                      @ return; every store has pre-decremented r0 from dest + n
+.end:
+        .size   memset,.end-memset
diff --git a/firmware/asm/arm/memset16.S b/firmware/asm/arm/memset16.S
new file mode 100644
index 0000000000..5c787b1bed
--- /dev/null
+++ b/firmware/asm/arm/memset16.S
@@ -0,0 +1,82 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006 by Thom Johansen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+#include "config.h"
+
+    .section    .icode,"ax",%progbits
+
+    .align      2
+
+/*  The following code is based on code from the Linux kernel version 2.6.15.3,
+ *  linux/arch/arm/lib/memset.S
+ *
+ *  Copyright (C) 1995-2000 Russell King
+ */
+/* In: r0 = dest, r1 = 16-bit fill value, r2 = count in halfwords. Fills upwards; dest is presumably halfword aligned, since only bit 1 is tested (TODO confirm). */
+        .global     memset16
+        .type       memset16,%function
+memset16:
+        tst     r0, #2                  @ unaligned?
+        cmpne   r2, #0                  @ ...and is there anything to store at all?
+        strneh  r1, [r0], #2            @ store one halfword to align
+        subne   r2, r2, #1              @ one halfword fewer to go
+
+/*
+ * we know that the pointer in r0 is aligned to a word boundary.
+ */
+        orr     r1, r1, r1, lsl #16     @ copy the low halfword into bits 16-31; NOTE(review): assumes bits 16-31 of r1 are clear on entry - confirm
+        mov     r3, r1                  @ second register holding the fill word
+        cmp     r2, #8
+        blt     4f                      @ fewer than 8 halfwords: lr is not needed as scratch
+/*
+ * We need an extra register for this loop - save the return address and
+ * use the LR
+ */
+        str     lr, [sp, #-4]!          @ push lr
+        mov     ip, r1
+        mov     lr, r1
+
+2:      subs    r2, r2, #32             @ 32 halfwords per pass
+        stmgeia r0!, {r1, r3, ip, lr}   @ 64 bytes at a time.
+        stmgeia r0!, {r1, r3, ip, lr}
+        stmgeia r0!, {r1, r3, ip, lr}
+        stmgeia r0!, {r1, r3, ip, lr}
+        bgt     2b                      @ more halfwords remain after this pass
+        ldrpc   cond=eq                 @ Now <64 bytes to go. (ldrpc is defined outside this file; presumably returns via the stacked lr when r2 is 0 - TODO confirm)
+/*
+ * No need to correct the count; we're only testing bits from now on
+ */
+        tst     r2, #16                 @ 16 halfwords = 32 bytes
+        stmneia r0!, {r1, r3, ip, lr}
+        stmneia r0!, {r1, r3, ip, lr}
+        tst     r2, #8                  @ 8 halfwords = 16 bytes
+        stmneia r0!, {r1, r3, ip, lr}
+        ldr     lr, [sp], #4            @ pop lr
+
+4:      tst     r2, #4                  @ 4 halfwords = 8 bytes
+        stmneia r0!, {r1, r3}
+        tst     r2, #2                  @ 2 halfwords = one word
+        strne   r1, [r0], #4
+
+        tst     r2, #1                  @ final odd halfword
+        strneh  r1, [r0], #2
+        bx      lr                      @ return; r0 has been advanced by every store
+.end:
+        .size   memset16,.end-memset16
-- 
cgit v1.2.3