From a035261089403de259e74ce4dd196e2715138ed2 Mon Sep 17 00:00:00 2001 From: Thomas Martitz Date: Sat, 7 Jan 2012 19:56:09 +0100 Subject: Move optimized memcpy and friends and strlen to firmware/asm, using the new automatic-asm-picking infrastructure. --- firmware/asm/m68k/memset.S | 152 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 firmware/asm/m68k/memset.S (limited to 'firmware/asm/m68k/memset.S') diff --git a/firmware/asm/m68k/memset.S b/firmware/asm/m68k/memset.S new file mode 100644 index 0000000000..839b305a05 --- /dev/null +++ b/firmware/asm/m68k/memset.S @@ -0,0 +1,152 @@ +/*************************************************************************** + * __________ __ ___. + * Open \______ \ ____ ____ | | _\_ |__ _______ ___ + * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / + * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < + * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ + * \/ \/ \/ \/ \/ + * $Id$ + * + * Copyright (C) 2004 by Jens Arnold + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ****************************************************************************/ +#include "config.h" + + .section .icode,"ax",@progbits + + .align 2 + .global memset + .type memset,@function + +/* Fills a memory region with specified byte value + * This version is optimized for speed + * + * arguments: + * (4,%sp) - start address + * (8,%sp) - data + * (12,%sp) - length + * + * return value: + * %d0 - start address (like ANSI version) + * + * register usage: + * %d0 - data (spread to all 4 bytes when using long stores) + * %d1 - temporary / data (for burst transfer) + * %d2 - data (for burst transfer) + * %d3 - data (for burst transfer) + * %a0 - start address + * %a1 - current address (runs down from end to start) + * + * For maximum speed this routine uses both long stores and burst mode, + * storing whole lines with movem.l. The routine fills memory from end + * to start in order to ease returning the start address. + */ +memset: + move.l (4,%sp),%a0 /* start address */ + move.l (8,%sp),%d0 /* data */ + move.l (12,%sp),%a1 /* length */ + add.l %a0,%a1 /* %a1 = end address */ + + move.l %a0,%d1 + addq.l #7,%d1 + and.l #0xFFFFFFFC,%d1 /* %d1 = first long bound + 4 */ + cmp.l %d1,%a1 /* at least one aligned longword to fill? */ + blo.b .no_longs /* no, jump directly to byte loop */ + + and.l #0xFF,%d0 /* start: spread data to all 4 bytes */ + move.l %d0,%d1 + lsl.l #8,%d1 + or.l %d1,%d0 /* data now in 2 lower bytes of %d0 */ + move.l %d0,%d1 + swap %d0 + or.l %d1,%d0 /* data now in all 4 bytes of %d0 */ + + move.l %a1,%d1 + and.l #0xFFFFFFFC,%d1 /* %d1 = last long bound */ + cmp.l %d1,%a1 /* any bytes to set? */ + bls.b .end_b1 /* no: skip byte loop */ + + /* leading byte loop: sets 0..3 bytes */ +.loop_b1: + move.b %d0,-(%a1) /* store byte */ + cmp.l %d1,%a1 /* runs %a1 down to last long bound */ + bhi.b .loop_b1 + +.end_b1: + moveq.l #31,%d1 + add.l %a0,%d1 + and.l #0xFFFFFFF0,%d1 /* %d1 = first line bound + 16 */ + cmp.l %d1,%a1 /* at least one full line to fill? */ + blo.b .no_lines /* no, jump to longword loop */ + + mov.l %a1,%d1 + and.l #0xFFFFFFF0,%d1 /* %d1 = last line bound */ + cmp.l %d1,%a1 /* any longwords to set? */ + bls.b .end_l1 /* no: skip longword loop */ + + /* leading longword loop: sets 0..3 longwords */ +.loop_l1: + move.l %d0,-(%a1) /* store longword */ + cmp.l %d1,%a1 /* runs %a1 down to last line bound */ + bhi.b .loop_l1 + +.end_l1: + move.l %d2,-(%sp) /* free some registers */ + move.l %d3,-(%sp) + + move.l %d0,%d1 /* spread data to 4 data registers */ + move.l %d0,%d2 + move.l %d0,%d3 + lea.l (15,%a0),%a0 /* start address += 15, acct. for trl. data */ + + /* main loop: set whole lines utilising burst mode */ +.loop_line: + lea.l (-16,%a1),%a1 /* pre-decrement */ + movem.l %d0-%d3,(%a1) /* store line */ + cmp.l %a0,%a1 /* runs %a1 down to first line bound */ + bhi.b .loop_line + + lea.l (-15,%a0),%a0 /* correct start address */ + move.l (%sp)+,%d3 /* restore registers */ + move.l (%sp)+,%d2 + + move.l %a0,%d1 /* %d1 = start address ... */ + addq.l #3,%d1 /* ... +3, account for possible trailing bytes */ + cmp.l %d1,%a1 /* any longwords left */ + bhi.b .loop_l2 /* yes: jump to longword loop */ + bra.b .no_longs /* no: skip loop */ + +.no_lines: + move.l %a0,%d1 /* %d1 = start address ... */ + addq.l #3,%d1 /* ... +3, account for possible trailing bytes */ + + /* trailing longword loop */ +.loop_l2: + move.l %d0,-(%a1) /* store longword */ + cmp.l %d1,%a1 /* runs %a1 down to first long bound */ + bhi.b .loop_l2 + +.no_longs: + cmp.l %a0,%a1 /* any bytes left? */ + bls.b .end_b2 /* no: skip loop */ + + /* trailing byte loop */ +.loop_b2: + move.b %d0,-(%a1) /* store byte */ + cmp.l %a0,%a1 /* runs %a1 down to start address */ + bhi.b .loop_b2 + +.end_b2: + move.l %a0,%d0 /* return start address */ + rts + +.end: + .size memset,.end-memset -- cgit v1.2.3