1 files changed, 171 insertions, 0 deletions
diff --git a/firmware/common/memcpy.S b/firmware/common/memcpy.S
new file mode 100644
index 0000000000..2fb9f6a5a7
--- /dev/null
+++ b/firmware/common/memcpy.S
@@ -0,0 +1,171 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2004 by Jens Arnold
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+    .section    .icode,"ax",@progbits
+    .align      2
+    .global     _memcpy
+    .type       _memcpy,@function
+/* Copies <length> bytes of data in memory from <source> to <dest>,
+ * working upwards from the lowest address.
+ * This version is optimized for speed
+ *
+ * arguments:
+ *  r4 - destination address
+ *  r5 - source address
+ *  r6 - length
+ *
+ * return value:
+ *  r0 - destination address (like ANSI version)
+ *
+ * register usage (r0-r7 and the T bit all get overwritten):
+ *  r0 - data / temporary
+ *  r1 - bit mask for rounding to long bounds / 2nd data
+ *  r2 - first long bound (only if >= 12 bytes)
+ *  r3 - last long bound (-4) (only if >= 12 bytes)
+ *  r4 - current dest address, kept 8 too low (see entry code)
+ *  r5 - current source address
+ *  r6 - source end address
+ *  r7 - stored dest start address
+ *
+ * The instruction order below is devised in a way to utilize the pipelining
+ * of the SH1 to the max. The routine also tries to utilize fast page mode.
+ */
+_memcpy:
+    mov     r4,r7       /* store for returning */
+    add     #-8,r4      /* adjust for early increments (max. 2 longs) */
+    /* compare <length> itself; <source_end> - 12 wraps for ends below 12 */
+    mov     #12,r0      /* r0 = 12; don't go below 12 here! */
+    cmp/hs  r0,r6       /* >= 12 bytes to copy? */
+    add     r5,r6       /* r6 = source_end (add leaves the T bit alone) */
+    bf      .start_b2   /* no, jump into byte loop */
+    mov     #-4,r1      /* r1 = 0xFFFFFFFC */
+    mov     r5,r2
+    add     #3,r2
+    and     r1,r2       /* r2 = first source long bound */
+    mov     r6,r3
+    add     #-4,r3      /* end offset for copying 2 longs per pass */
+    bra     .start_b1   /* jump into leading byte loop */
+    and     r1,r3       /* (delay slot) r3 = last source long bound - 4 */
+    /* leading byte loop: copies 0..3 bytes */
+    .align  2
+.loop_b1:
+    mov.b   @r5+,r0     /* load byte & increment source addr */
+    add     #1,r4       /* increment dest addr */
+    mov.b   r0,@(7,r4)  /* store byte; +7 undoes the -8 bias and the early +1 */
+.start_b1:
+    cmp/hi  r5,r2       /* runs r5 up to first long bound */
+    bt      .loop_b1
+    /* now r5 is always at a long boundary */
+    /* -> memory reading is done in longs for all dest alignments */
+    /* selector for main copy loop */
+    mov     r4,r0       /* r4 = dest - 8 has the same alignment; tst #imm needs r0 */
+    tst     #3,r0       /* dest now also at long bound? */
+    bt      .loop2_l    /* yes, do long copy */
+    tst     #1,r0       /* dest now at least at word bound? */
+    bt      .start4_w   /* yes, do word copy */
+    /* main loop for byte aligned destination (fast) */
+    /* copies 1 long per pass as byte + word + byte, big endian order */
+    add     #4,r3       /* reset end offset */
+    add     #-1,r4      /* adjust to word alignment for word write+ */
+.loop4_b:
+    mov.l   @r5+,r0     /* load a long & increment source addr */
+    add     #4,r4       /* increment dest addr */
+    mov.b   r0,@(8,r4)  /* store low byte */
+    shlr8   r0          /* get middle 2 bytes */
+    mov.w   r0,@(6,r4)  /* store as word+ */
+    shlr16  r0          /* get upper byte */
+    mov.b   r0,@(5,r4)  /* and store */
+    cmp/hi  r5,r3       /* runs r5 up to last long bound */
+    bt      .loop4_b
+    bra     .start_b2   /* jump to trailing byte loop */
+    add     #1,r4       /* (delay slot) readjust */
+    /* main loop for word aligned destination (faster) */
+    /* copies 2 longs per pass, utilizing fast page mode */
+.start4_w:
+    add     #-2,r4      /* adjust to long alignment for long write+ */
+.loop4_w:
+    mov.l   @r5+,r1     /* load first long & increment source addr */
+    add     #8,r4       /* increment dest addr */
+    mov.l   @r5+,r0     /* load second long & increment source addr */
+    cmp/hi  r5,r3       /* runs r5 up to last or second last long bound */
+    mov.w   r0,@(8,r4)  /* store low word of second long */
+    xtrct   r1,r0       /* extract low word of first long & high word of second long */
+    mov.l   r0,@(4,r4)  /* and store as long+ */
+    swap.w  r1,r0       /* get high word of first long */
+    mov.w   r0,@(2,r4)  /* and store it */
+    bt      .loop4_w    /* T still holds the cmp/hi result from above */
+    add     #2,r4       /* readjust destination */
+    add     #4,r3       /* reset end offset */
+    cmp/hi  r5,r3       /* one long left? */
+    bf      .start_b2   /* no, jump to trailing byte loop */
+    mov.l   @r5+,r0     /* load last long & increment source addr */
+    add     #4,r4       /* increment dest addr */
+    mov.w   r0,@(6,r4)  /* store low word */
+    shlr16  r0          /* get high word */
+    bra     .start_b2   /* jump to trailing byte loop */
+    mov.w   r0,@(4,r4)  /* (delay slot) and store it */
+    /* main loop for long aligned destination (fastest) */
+    /* copies 2 longs per pass, utilizing fast page mode */
+.loop2_l:
+    mov.l   @r5+,r1     /* load first long & increment source addr */
+    add     #8,r4       /* increment dest addr */
+    mov.l   @r5+,r0     /* load second long & increment source addr */
+    cmp/hi  r5,r3       /* runs r5 up to last or second last long bound */
+    mov.l   r1,@r4      /* store first long */
+    mov.l   r0,@(4,r4)  /* store second long; NOT ALIGNED - no speed loss here! */
+    bt      .loop2_l    /* T still holds the cmp/hi result from above */
+    add     #4,r3       /* reset end offset */
+    cmp/hi  r5,r3       /* one long left? */
+    bf      .start_b2   /* no, jump to trailing byte loop */
+    mov.l   @r5+,r0     /* load last long & increment source addr */
+    add     #4,r4       /* increment dest addr */
+    bra     .start_b2   /* jump to trailing byte loop */
+    mov.l   r0,@(4,r4)  /* (delay slot) store last long */
+    /* trailing byte loop: copies 0..3 bytes (or all for < 12 in total) */
+.loop_b2:
+    mov.b   @r5+,r0     /* load byte & increment source addr */
+    add     #1,r4       /* increment dest addr */
+    mov.b   r0,@(7,r4)  /* store byte; +7 undoes the -8 bias and the early +1 */
+.start_b2:
+    cmp/hi  r5,r6       /* runs r5 up to end address */
+    bt      .loop_b2
+    rts
+    mov     r7,r0       /* (delay slot) return dest start address */
+.end:
+    .size   _memcpy,.end-_memcpy

diff --git a/firmware/common/memcpy.S b/firmware/common/memcpy.S new file mode 100644 index 0000000000..2fb9f6a5a7 --- /dev/null +++ b/firmware/common/memcpy.S
@@ -0,0 +1,171 @@
	1	/***************************************************************************
	2	 *             __________               __   ___.
	3	 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
	4	 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
	5	 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
	6	 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
	7	 *                     \/            \/     \/    \/            \/
	8	 * $Id$
	9	 *
	10	 * Copyright (C) 2004 by Jens Arnold
	11	 *
	12	 * All files in this archive are subject to the GNU General Public License.
	13	 * See the file COPYING in the source tree root for full license agreement.
	14	 *
	15	 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
	16	 * KIND, either express or implied.
	17	 *
	18	 ****************************************************************************/
	19
	20	    .section    .icode,"ax",@progbits
	21
	22	    .align      2
	23	    .global     _memcpy
	24	    .type       _memcpy,@function
	25
	26	/* Copies <length> bytes of data in memory from <source> to <dest>,
	27	 * working upwards from the lowest address.
	28	 * This version is optimized for speed
	29	 *
	30	 * arguments:
	31	 *  r4 - destination address
	32	 *  r5 - source address
	33	 *  r6 - length
	34	 *
	35	 * return value:
	36	 *  r0 - destination address (like ANSI version)
	37	 *
	38	 * register usage (r0-r7 and the T bit all get overwritten):
	39	 *  r0 - data / temporary
	40	 *  r1 - bit mask for rounding to long bounds / 2nd data
	41	 *  r2 - first long bound (only if >= 12 bytes)
	42	 *  r3 - last long bound (-4) (only if >= 12 bytes)
	43	 *  r4 - current dest address, kept 8 too low (see entry code)
	44	 *  r5 - current source address
	45	 *  r6 - source end address
	46	 *  r7 - stored dest start address
	47	 *
	48	 * The instruction order below is devised in a way to utilize the pipelining
	49	 * of the SH1 to the max. The routine also tries to utilize fast page mode.
	50	 */
	51	_memcpy:
	52	    mov     r4,r7       /* store for returning */
	53	    add     #-8,r4      /* adjust for early increments (max. 2 longs) */
	54
	55	    /* compare <length> itself; <source_end> - 12 wraps for ends below 12 */
	56	    mov     #12,r0      /* r0 = 12; don't go below 12 here! */
	57	    cmp/hs  r0,r6       /* >= 12 bytes to copy? */
	58	    add     r5,r6       /* r6 = source_end (add leaves the T bit alone) */
	59	    bf      .start_b2   /* no, jump into byte loop */
	60
	61	    mov     #-4,r1      /* r1 = 0xFFFFFFFC */
	62
	63	    mov     r5,r2
	64	    add     #3,r2
	65	    and     r1,r2       /* r2 = first source long bound */
	66	    mov     r6,r3
	67	    add     #-4,r3      /* end offset for copying 2 longs per pass */
	68	    bra     .start_b1   /* jump into leading byte loop */
	69	    and     r1,r3       /* (delay slot) r3 = last source long bound - 4 */
	70
	71	    /* leading byte loop: copies 0..3 bytes */
	72	    .align  2
	73	.loop_b1:
	74	    mov.b   @r5+,r0     /* load byte & increment source addr */
	75	    add     #1,r4       /* increment dest addr */
	76	    mov.b   r0,@(7,r4)  /* store byte; +7 undoes the -8 bias and the early +1 */
	77	.start_b1:
	78	    cmp/hi  r5,r2       /* runs r5 up to first long bound */
	79	    bt      .loop_b1
	80	    /* now r5 is always at a long boundary */
	81	    /* -> memory reading is done in longs for all dest alignments */
	82
	83	    /* selector for main copy loop */
	84	    mov     r4,r0       /* r4 = dest - 8 has the same alignment; tst #imm needs r0 */
	85	    tst     #3,r0       /* dest now also at long bound? */
	86	    bt      .loop2_l    /* yes, do long copy */
	87	    tst     #1,r0       /* dest now at least at word bound? */
	88	    bt      .start4_w   /* yes, do word copy */
	89
	90	    /* main loop for byte aligned destination (fast) */
	91	    /* copies 1 long per pass as byte + word + byte, big endian order */
	92	    add     #4,r3       /* reset end offset */
	93	    add     #-1,r4      /* adjust to word alignment for word write+ */
	94
	95	.loop4_b:
	96	    mov.l   @r5+,r0     /* load a long & increment source addr */
	97	    add     #4,r4       /* increment dest addr */
	98	    mov.b   r0,@(8,r4)  /* store low byte */
	99	    shlr8   r0          /* get middle 2 bytes */
	100	    mov.w   r0,@(6,r4)  /* store as word+ */
	101	    shlr16  r0          /* get upper byte */
	102	    mov.b   r0,@(5,r4)  /* and store */
	103	    cmp/hi  r5,r3       /* runs r5 up to last long bound */
	104	    bt      .loop4_b
	105
	106	    bra     .start_b2   /* jump to trailing byte loop */
	107	    add     #1,r4       /* (delay slot) readjust */
	108
	109	    /* main loop for word aligned destination (faster) */
	110	    /* copies 2 longs per pass, utilizing fast page mode */
	111	.start4_w:
	112	    add     #-2,r4      /* adjust to long alignment for long write+ */
	113
	114	.loop4_w:
	115	    mov.l   @r5+,r1     /* load first long & increment source addr */
	116	    add     #8,r4       /* increment dest addr */
	117	    mov.l   @r5+,r0     /* load second long & increment source addr */
	118	    cmp/hi  r5,r3       /* runs r5 up to last or second last long bound */
	119	    mov.w   r0,@(8,r4)  /* store low word of second long */
	120	    xtrct   r1,r0       /* extract low word of first long & high word of second long */
	121	    mov.l   r0,@(4,r4)  /* and store as long+ */
	122	    swap.w  r1,r0       /* get high word of first long */
	123	    mov.w   r0,@(2,r4)  /* and store it */
	124	    bt      .loop4_w    /* T still holds the cmp/hi result from above */
	125
	126	    add     #2,r4       /* readjust destination */
	127	    add     #4,r3       /* reset end offset */
	128	    cmp/hi  r5,r3       /* one long left? */
	129	    bf      .start_b2   /* no, jump to trailing byte loop */
	130
	131	    mov.l   @r5+,r0     /* load last long & increment source addr */
	132	    add     #4,r4       /* increment dest addr */
	133	    mov.w   r0,@(6,r4)  /* store low word */
	134	    shlr16  r0          /* get high word */
	135	    bra     .start_b2   /* jump to trailing byte loop */
	136	    mov.w   r0,@(4,r4)  /* (delay slot) and store it */
	137
	138	    /* main loop for long aligned destination (fastest) */
	139	    /* copies 2 longs per pass, utilizing fast page mode */
	140	.loop2_l:
	141	    mov.l   @r5+,r1     /* load first long & increment source addr */
	142	    add     #8,r4       /* increment dest addr */
	143	    mov.l   @r5+,r0     /* load second long & increment source addr */
	144	    cmp/hi  r5,r3       /* runs r5 up to last or second last long bound */
	145	    mov.l   r1,@r4      /* store first long */
	146	    mov.l   r0,@(4,r4)  /* store second long; NOT ALIGNED - no speed loss here! */
	147	    bt      .loop2_l    /* T still holds the cmp/hi result from above */
	148
	149	    add     #4,r3       /* reset end offset */
	150	    cmp/hi  r5,r3       /* one long left? */
	151	    bf      .start_b2   /* no, jump to trailing byte loop */
	152
	153	    mov.l   @r5+,r0     /* load last long & increment source addr */
	154	    add     #4,r4       /* increment dest addr */
	155	    bra     .start_b2   /* jump to trailing byte loop */
	156	    mov.l   r0,@(4,r4)  /* (delay slot) store last long */
	157
	158	    /* trailing byte loop: copies 0..3 bytes (or all for < 12 in total) */
	159	.loop_b2:
	160	    mov.b   @r5+,r0     /* load byte & increment source addr */
	161	    add     #1,r4       /* increment dest addr */
	162	    mov.b   r0,@(7,r4)  /* store byte; +7 undoes the -8 bias and the early +1 */
	163	.start_b2:
	164	    cmp/hi  r5,r6       /* runs r5 up to end address */
	165	    bt      .loop_b2
	166
	167	    rts
	168	    mov     r7,r0       /* (delay slot) return dest start address */
	169	.end:
	170	    .size   _memcpy,.end-_memcpy
	171