summary refs log tree commit diff
path: root/firmware/common/memcpy.S
diff options
context:
space:
mode:
Diffstat (limited to 'firmware/common/memcpy.S')
-rw-r--r-- firmware/common/memcpy.S 171
1 files changed, 171 insertions, 0 deletions
diff --git a/firmware/common/memcpy.S b/firmware/common/memcpy.S
new file mode 100644
index 0000000000..2fb9f6a5a7
--- /dev/null
+++ b/firmware/common/memcpy.S
@@ -0,0 +1,171 @@
1/***************************************************************************
2 *             __________               __   ___.
3 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
4 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
5 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
6 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
7 *                     \/            \/     \/    \/            \/
8 * $Id$
9 *
10 * Copyright (C) 2004 by Jens Arnold
11 *
12 * All files in this archive are subject to the GNU General Public License.
13 * See the file COPYING in the source tree root for full license agreement.
14 *
15 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
16 * KIND, either express or implied.
17 *
18 ****************************************************************************/
19
20 .section .icode,"ax",@progbits /* emit the code into a section named .icode, flags "ax" = allocatable + executable, type progbits */
/* NOTE(review): .icode is presumably mapped to fast on-chip RAM by the linker script - confirm, the script is not visible here */
21
22 .align 2 /* NOTE(review): presumably a power-of-two request, i.e. 4 byte alignment, under GNU as for SH - confirm */
23 .global _memcpy /* export the entry point under the name _memcpy */
24 .type _memcpy,@function /* tag the symbol as a function in the object file symbol table */
25
26/* Copies <length> bytes of data in memory from <source> to <dest>
27 * This version is optimized for speed
28 *
29 * arguments:
30 * r4 - destination address
31 * r5 - source address
32 * r6 - length
33 *
34 * return value:
35 * r0 - destination address (like ANSI version)
36 *
37 * register usage:
38 * r0 - data / temporary
39 * r1 - bit mask for rounding to long bounds / 2nd data
40 * r2 - first long bound (only if >= 12 bytes)
41 * r3 - last long bound (-4) (only if >= 12 bytes)
42 * r4 - current dest address, biased by -8 (the byte / word aligned main loops add a further -1 / -2 while they run)
43 * r5 - current source address
44 * r6 - source end address
45 * r7 - stored dest start address
46 *
47 * The instruction order below is devised in a way to utilize the pipelining
48 * of the SH1 to the max. The routine also tries to utilize fast page mode.
49 */
50
51_memcpy:
/* NOTE(review): the decimal number at the start of every line (51 above) looks
 * like a listing line number carried over from the diff view this text was
 * taken from, not part of the assembly source - confirm and strip before
 * assembling.
 *
 * Reading guide:
 * - r4 is kept 8 below the next destination byte. Each loop increments r4
 *   first and then stores with a positive displacement, so a byte store
 *   uses displacement 7 and the long aligned loop can store at 0 and 4.
 *   The byte and word aligned main loops bias r4 by a further -1 / -2 and
 *   undo that when they exit.
 * - The instruction written directly after bra and rts is in the branch
 *   delay slot and is executed before the branch takes effect.
 * - The byte and word placement in the main loops assumes big-endian memory
 *   order: the low byte of a loaded long is stored at the highest address.
 */
52 add r5,r6 /* r6 = source + length = source_end */
53 mov r4,r7 /* keep the untouched dest in r7, it becomes the return value */
54 add #-8,r4 /* bias dest by -8 to adjust for early increments (max. 2 longs) */
55
56 mov r6,r0
57 add #-12,r0 /* r0 = r6 - 12; don't go below 12 here! The 2-long main loops load 2 longs before their first end check, and up to 3 bytes may already be used by the leading byte loop */
58 cmp/hs r5,r0 /* unsigned: source_end - 12 >= source, i.e. >= 12 bytes to copy? */
59 bf .start_b2 /* no, copy everything in the trailing byte loop */
60
61 mov #-4,r1 /* r1 = 0xFFFFFFFC, mask that clears the low 2 address bits */
62
63 mov r5,r2
64 add #3,r2
65 and r1,r2 /* r2 = source rounded up to a multiple of 4 = first source long bound */
66 mov r6,r3
67 add #-4,r3 /* end offset for copying 2 longs per pass */
68 bra .start_b1 /* jump into leading byte loop (enters at its condition check) */
69 and r1,r3 /* delay slot: r3 = last source long bound - 4 */
70
71 /* leading byte loop: copies 0..3 bytes */
72 .align 2
73.loop_b1:
74 mov.b @r5+,r0 /* load byte & increment source addr */
75 add #1,r4 /* increment dest addr */
76 mov.b r0,@(7,r4) /* store byte: r4 + 7 is the dest byte, given the -8 bias and the increment just done */
77.start_b1:
78 cmp/hi r5,r2 /* unsigned r2 > r5: source is still below the first long bound */
79 bt .loop_b1 /* yes, copy one more byte */
80 /* now r5 is always at a long boundary */
81 /* -> memory reading is done in longs for all dest alignments */
82
83 /* selector for main copy loop */
/* r4 differs from the real dest address by 8, so its low 2 bits equal those of dest */
84 mov r4,r0
85 tst #3,r0 /* dest now also at long bound? (T set when both low bits are 0) */
86 bt .loop2_l /* yes, do long copy */
87 tst #1,r0 /* dest now at least at word bound? (T set when bit 0 is 0) */
88 bt .start4_w /* yes, do word copy */
89
90 /* main loop for byte aligned destination (fast) */
91 /* copies 1 long per pass */
/* per long: 1 byte store, 1 word store at the even address in between, 1 byte store */
92 add #4,r3 /* reset end offset: r3 = last source long bound, as this loop copies 1 long per pass */
93 add #-1,r4 /* adjust to word alignment for word write+ (dest is odd here, so r4 becomes even) */
94
95.loop4_b:
96 mov.l @r5+,r0 /* load a long & increment source addr */
97 add #4,r4 /* increment dest addr */
98 mov.b r0,@(8,r4) /* store low byte at dest offset 3 */
99 shlr8 r0 /* get middle 2 bytes into the low word */
100 mov.w r0,@(6,r4) /* store as word+ at dest offset 1 (even address) */
101 shlr16 r0 /* get upper byte */
102 mov.b r0,@(5,r4) /* and store at dest offset 0 */
103 cmp/hi r5,r3 /* runs r5 up to last long bound */
104 bt .loop4_b /* loop while r3 > r5, i.e. at least 1 whole long remains */
105
106 bra .start_b2 /* jump to trailing byte loop */
107 add #1,r4 /* delay slot: readjust, undoing the -1 applied before the loop */
108
109 /* main loop for word aligned destination (faster) */
110 /* copies 2 longs per pass, utilizing fast page mode */
/* per 2 longs: 1 word store, 1 long store at the long aligned middle, 1 word store */
111.start4_w:
112 add #-2,r4 /* adjust to long alignment for long write+ (dest is 2 mod 4 here) */
113
114.loop4_w:
115 mov.l @r5+,r1 /* load first long & increment source addr */
116 add #8,r4 /* increment dest addr */
117 mov.l @r5+,r0 /* load second long & increment source addr */
118 cmp/hi r5,r3 /* runs r5 up to last or second last long bound. The result in T is only consumed by the bt at the end of the pass */
119 mov.w r0,@(8,r4) /* store low word of second long at dest offset 6 */
120 xtrct r1,r0 /* extract low word of first long & high word of second long */
121 mov.l r0,@(4,r4) /* and store as long+ at dest offset 2 (long aligned) */
122 swap.w r1,r0 /* get high word of first long into the low word of r0 */
123 mov.w r0,@(2,r4) /* and store it at dest offset 0 */
124 bt .loop4_w /* loop while r3 > r5 held at the cmp/hi above, i.e. 2 more longs remain. Relies on the instructions in between leaving T unchanged */
125
126 add #2,r4 /* readjust destination, undoing the -2 from .start4_w */
127 add #4,r3 /* reset end offset: r3 = last source long bound */
128 cmp/hi r5,r3 /* one long left? */
129 bf .start_b2 /* no, jump to trailing byte loop */
130
131 mov.l @r5+,r0 /* load last long & increment source addr */
132 add #4,r4 /* increment dest addr */
133 mov.w r0,@(6,r4) /* store low word at dest offset 2 */
134 shlr16 r0 /* get high word */
135 bra .start_b2 /* jump to trailing byte loop */
136 mov.w r0,@(4,r4) /* delay slot: and store it at dest offset 0 */
137
138 /* main loop for long aligned destination (fastest) */
139 /* copies 2 longs per pass, utilizing fast page mode */
140.loop2_l:
141 mov.l @r5+,r1 /* load first long & increment source addr */
142 add #8,r4 /* increment dest addr, r4 now equals the dest address of the first long */
143 mov.l @r5+,r0 /* load second long & increment source addr */
144 cmp/hi r5,r3 /* runs r5 up to last or second last long bound. The result in T is consumed by the bt below */
145 mov.l r1,@r4 /* store first long */
146 mov.l r0,@(4,r4) /* store second long; NOT ALIGNED - no speed loss here! */
/* NOTE(review): the data address r4 + 4 is long aligned on this path, so
 * NOT ALIGNED presumably refers to the address of the store instruction
 * itself (instruction fetch timing) - confirm against the SH1 manual. */
147 bt .loop2_l /* loop while r3 > r5 held at the cmp/hi above, i.e. 2 more longs remain */
148
149 add #4,r3 /* reset end offset: r3 = last source long bound */
150 cmp/hi r5,r3 /* one long left? */
151 bf .start_b2 /* no, jump to trailing byte loop */
152
153 mov.l @r5+,r0 /* load last long & increment source addr */
154 add #4,r4 /* increment dest addr */
155 bra .start_b2 /* jump to trailing byte loop */
156 mov.l r0,@(4,r4) /* delay slot: store last long */
157
158 /* trailing byte loop: copies 0..3 bytes (or all for < 12 in total) */
159.loop_b2:
160 mov.b @r5+,r0 /* load byte & increment source addr */
161 add #1,r4 /* increment dest addr */
162 mov.b r0,@(7,r4) /* store byte, same displacement scheme as the leading byte loop */
163.start_b2:
164 cmp/hi r5,r6 /* unsigned r6 > r5: source has not reached the end address yet */
165 bt .loop_b2 /* yes, copy one more byte */
166
167 rts
168 mov r7,r0 /* delay slot: return dest start address */
169.end:
170 .size _memcpy,.end-_memcpy
171