diff options

-rw-r--r--              firmware/SOURCES                                                                   |  22
-rwxr-xr-x              firmware/target/arm/memset-arm.S                                                   |  96
-rwxr-xr-x[-rw-r--r--]  firmware/target/coldfire/memcpy-coldfire.S (renamed from firmware/common/memcpy_a.S)   | 198
-rwxr-xr-x              firmware/target/coldfire/memmove-coldfire.S (renamed from firmware/common/memmove_a.S) | 201
-rwxr-xr-x[-rw-r--r--]  firmware/target/coldfire/memset-coldfire.S (renamed from firmware/common/memset_a.S)   | 167
-rwxr-xr-x              firmware/target/coldfire/memset16-coldfire.S (renamed from firmware/common/memset16_a.S) | 2
-rwxr-xr-x              firmware/target/sh/memcpy-sh.S                                                     | 217
-rwxr-xr-x              firmware/target/sh/memmove-sh.S                                                    | 220
-rwxr-xr-x              firmware/target/sh/memset-sh.S                                                     | 107
-rwxr-xr-x              firmware/target/sh/strlen-sh.S (renamed from firmware/common/strlen_a.S)           |   0
10 files changed, 652 insertions, 578 deletions
diff --git a/firmware/SOURCES b/firmware/SOURCES
index 7d93edaca1..2979e33ccc 100644
--- a/firmware/SOURCES
+++ b/firmware/SOURCES
@@ -30,7 +30,7 @@ common/strchr.c
 common/strcmp.c
 common/strcpy.c
 #if (CONFIG_CPU == SH7034) && !defined(SIMULATOR)
-common/strlen_a.S
+target/sh/strlen-sh.S
 #else
 common/strlen.c
 #endif
@@ -39,22 +39,24 @@ common/strncpy.c
 common/strrchr.c
 common/strtok.c
 common/timefuncs.c
-#if (CONFIG_CPU == SH7034) || defined(CPU_COLDFIRE)
-common/memcpy_a.S
-common/memmove_a.S
-common/memset_a.S
+
+#ifdef CPU_COLDFIRE
+target/coldfire/memcpy-coldfire.S
+target/coldfire/memmove-coldfire.S
+target/coldfire/memset-coldfire.S
+target/coldfire/memset16-coldfire.S
+#elif (CONFIG_CPU == SH7034)
+target/sh/memcpy-sh.S
+target/sh/memmove-sh.S
+target/sh/memset-sh.S
 #elif defined(CPU_ARM)
 common/memcpy.c
 common/memmove.c
-common/memset_a.S
+target/arm/memset-arm.S
 #else
 common/memcpy.c
 common/memmove.c
 common/memset.c
-#endif
-#ifdef CPU_COLDFIRE
-common/memset16_a.S
-#else
 common/memset16.c
 #endif
 #ifdef HAVE_LCD_CHARCELLS
diff --git a/firmware/target/arm/memset-arm.S b/firmware/target/arm/memset-arm.S
new file mode 100755
index 0000000000..b3faafcb37
--- /dev/null
+++ b/firmware/target/arm/memset-arm.S
@@ -0,0 +1,96 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2004 by Jens Arnold
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+#include "config.h"
+
+    .section    .icode,"ax",%progbits
+
+    .align      2
+
+/* The following code is based on code found in Linux kernel version 2.6.15.3
+ * linux/arch/arm/lib/memset.S
+ *
+ * Copyright (C) 1995-2000 Russell King
+ */
+
+/* This code will align a pointer for memset, if needed */
+1:  cmp     r2, #4          @ 1 do we have enough
+    blt     5f              @ 1 bytes to align with?
+    cmp     r3, #2          @ 1
+    strgtb  r1, [r0, #-1]!  @ 1
+    strgeb  r1, [r0, #-1]!  @ 1
+    strb    r1, [r0, #-1]!  @ 1
+    sub     r2, r2, r3      @ 1 r2 = r2 - r3
+    b       2f
+
+    .global memset
+    .type   memset,%function
+memset:
+    add     r0, r0, r2      @ we'll write backwards in memory
+    ands    r3, r0, #3      @ 1 unaligned?
+    bne     1b              @ 1
+2:
+/*
+ * we know that the pointer in r0 is aligned to a word boundary.
+ */
+    orr     r1, r1, r1, lsl #8
+    orr     r1, r1, r1, lsl #16
+    mov     r3, r1
+    cmp     r2, #16
+    blt     5f
+/*
+ * We need an extra register for this loop - save the return address and
+ * use the LR
+ */
+    str     lr, [sp, #-4]!
+    mov     ip, r1
+    mov     lr, r1
+
+3:  subs    r2, r2, #64
+    stmgedb r0!, {r1, r3, ip, lr}   @ 64 bytes at a time.
+    stmgedb r0!, {r1, r3, ip, lr}
+    stmgedb r0!, {r1, r3, ip, lr}
+    stmgedb r0!, {r1, r3, ip, lr}
+    bgt     3b
+    ldmeqfd sp!, {pc}               @ Now <64 bytes to go.
+/*
+ * No need to correct the count; we're only testing bits from now on
+ */
+    tst     r2, #32
+    stmnedb r0!, {r1, r3, ip, lr}
+    stmnedb r0!, {r1, r3, ip, lr}
+    tst     r2, #16
+    stmnedb r0!, {r1, r3, ip, lr}
+    ldr     lr, [sp], #4
+
+5:  tst     r2, #8
+    stmnedb r0!, {r1, r3}
+    tst     r2, #4
+    strne   r1, [r0, #-4]!
+/*
+ * When we get here, we've got less than 4 bytes to zero. We
+ * may have an unaligned pointer as well.
+ */
+6:  tst     r2, #2
+    strneb  r1, [r0, #-1]!
+    strneb  r1, [r0, #-1]!
+    tst     r2, #1
+    strneb  r1, [r0, #-1]!
+    mov     pc, lr
+end:
+    .size   memset,.end-memset
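The new ARM memset fills from the end of the region toward the start: it adds the length to the pointer, peels off unaligned bytes, replicates the fill byte across all four byte lanes of a register, and then issues multi-register stmdb stores (four registers per store, four stores per pass, 64 bytes per loop iteration). As a reading aid, here is a minimal C model of that strategy; the function name is hypothetical and this is a sketch of the idea, not the shipped routine:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical C model of the backwards-filling memset above. */
void *memset_model(void *dst, int c, size_t n)
{
    unsigned char *p = (unsigned char *)dst + n;  /* start at the end */
    uint32_t pat = (uint8_t)c;
    pat |= pat << 8;                    /* spread the byte to all */
    pat |= pat << 16;                   /* four lanes of a word   */

    while (n > 0 && ((uintptr_t)p & 3)) {  /* align downwards, byte-wise */
        *--p = (uint8_t)c;
        n--;
    }
    while (n >= 4) {                    /* the asm does 16..64 bytes/pass */
        p -= 4;
        *(uint32_t *)p = pat;
        n -= 4;
    }
    while (n > 0) {                     /* 0..3 leading bytes remain */
        *--p = (uint8_t)c;
        n--;
    }
    return dst;
}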
diff --git a/firmware/common/memcpy_a.S b/firmware/target/coldfire/memcpy-coldfire.S
index 9f6c813be3..523e1f5ed9 100644..100755
--- a/firmware/common/memcpy_a.S
+++ b/firmware/target/coldfire/memcpy-coldfire.S
@@ -20,203 +20,6 @@
 
 .section .icode,"ax",@progbits
 
-#if CONFIG_CPU == SH7034
-    .align      2
-    .global     _memcpy
-    .global     ___memcpy_fwd_entry
-    .type       _memcpy,@function
-
-/* Copies <length> bytes of data in memory from <source> to <dest>
- * This version is optimized for speed
- *
- * arguments:
- *  r4 - destination address
- *  r5 - source address
- *  r6 - length
- *
- * return value:
- *  r0 - destination address (like ANSI version)
- *
- * register usage:
- *  r0 - data / scratch
- *  r1 - 2nd data / scratch
- *  r2 - scratch
- *  r3 - first long bound / adjusted end address (only if >= 11 bytes)
- *  r4 - current dest address
- *  r5 - current source address
- *  r6 - source end address
- *  r7 - stored dest start address
- *
- * The instruction order is devised in a way to utilize the pipelining
- * of the SH1 to the max. The routine also tries to utilize fast page mode.
- */
-
-_memcpy:
-    mov     r4,r7       /* store dest for returning */
-___memcpy_fwd_entry:
-    add     #-8,r4      /* offset for early increment (max. 2 longs) */
-    mov     #11,r0
-    cmp/hs  r0,r6       /* at least 11 bytes to copy? (ensures 2 aligned longs) */
-    add     r5,r6       /* r6 = source_end */
-    bf      .start_b2   /* no: jump directly to byte loop */
-
-    mov     #3,r0
-    neg     r5,r3
-    and     r0,r3       /* r3 = (4 - align_offset) % 4 */
-    tst     r3,r3       /* already aligned? */
-    bt      .end_b1     /* yes: skip leading byte loop */
-
-    add     r5,r3       /* r3 = first source long bound */
-
-    /* leading byte loop: copies 0..3 bytes */
-.loop_b1:
-    mov.b   @r5+,r0     /* load byte & increment source addr */
-    add     #1,r4       /* increment dest addr */
-    mov.b   r0,@(7,r4)  /* store byte */
-    cmp/hi  r5,r3       /* runs r5 up to first long bound */
-    bt      .loop_b1
-    /* now r5 is always at a long boundary */
-    /* -> memory reading is done in longs for all dest alignments */
-
-    /* selector for main copy loop */
-.end_b1:
-    mov     #3,r1
-    and     r4,r1       /* r1 = dest alignment offset */
-    mova    .jmptab,r0
-    mov.b   @(r0,r1),r1 /* select appropriate main loop */
-    add     r0,r1
-    mov     r6,r3       /* move end address to r3 */
-    jmp     @r1         /* and jump to it */
-    add     #-7,r3      /* adjust end addr for main loops doing 2 longs/pass */
-
-    /** main loops, copying 2 longs per pass to profit from fast page mode **/
-
-    /* long aligned destination (fastest) */
-    .align  2
-.loop_do0:
-    mov.l   @r5+,r1     /* load first long & increment source addr */
-    add     #16,r4      /* increment dest addr & account for decrementing stores */
-    mov.l   @r5+,r0     /* load second long & increment source addr */
-    cmp/hi  r5,r3       /* runs r5 up to last or second last long bound */
-    mov.l   r0,@-r4     /* store second long */
-    mov.l   r1,@-r4     /* store first long; NOT ALIGNED - no speed loss here! */
-    bt      .loop_do0
-
-    add     #4,r3       /* readjust end address */
-    cmp/hi  r5,r3       /* one long left? */
-    bf      .start_b2   /* no, jump to trailing byte loop */
-
-    mov.l   @r5+,r0     /* load last long & increment source addr */
-    add     #4,r4       /* increment dest addr */
-    bra     .start_b2   /* jump to trailing byte loop */
-    mov.l   r0,@(4,r4)  /* store last long */
-
-    /* word aligned destination (long + 2) */
-    .align  2
-.loop_do2:
-    mov.l   @r5+,r1     /* load first long & increment source addr */
-    add     #16,r4      /* increment dest addr */
-    mov.l   @r5+,r0     /* load second long & increment source addr */
-    cmp/hi  r5,r3       /* runs r5 up to last or second last long bound */
-    mov.w   r0,@-r4     /* store low word of second long */
-    xtrct   r1,r0       /* extract low word of first long & high word of second long */
-    mov.l   r0,@-r4     /* and store as long */
-    swap.w  r1,r0       /* get high word of first long */
-    mov.w   r0,@-r4     /* and store it */
-    bt      .loop_do2
-
-    add     #4,r3       /* readjust end address */
-    cmp/hi  r5,r3       /* one long left? */
-    bf      .start_b2   /* no, jump to trailing byte loop */
-
-    mov.l   @r5+,r0     /* load last long & increment source addr */
-    add     #4,r4       /* increment dest addr */
-    mov.w   r0,@(6,r4)  /* store low word */
-    shlr16  r0          /* get high word */
-    bra     .start_b2   /* jump to trailing byte loop */
-    mov.w   r0,@(4,r4)  /* and store it */
-
-    /* jumptable for loop selector */
-    .align  2
-.jmptab:
-    .byte   .loop_do0 - .jmptab /* placed in the middle because the SH1 */
-    .byte   .loop_do1 - .jmptab /* loads bytes sign-extended. Otherwise */
-    .byte   .loop_do2 - .jmptab /* the last loop would be out of reach */
-    .byte   .loop_do3 - .jmptab /* of the offset range. */
-
-    /* byte aligned destination (long + 1) */
-    .align  2
-.loop_do1:
-    mov.l   @r5+,r1     /* load first long & increment source addr */
-    add     #16,r4      /* increment dest addr */
-    mov.l   @r5+,r0     /* load second long & increment source addr */
-    cmp/hi  r5,r3       /* runs r5 up to last or second last long bound */
-    mov.b   r0,@-r4     /* store low byte of second long */
-    shlr8   r0          /* get upper 3 bytes */
-    mov     r1,r2       /* copy first long */
-    shll16  r2          /* move low byte of first long all the way up, .. */
-    shll8   r2
-    or      r2,r0       /* ..combine with the 3 bytes of second long.. */
-    mov.l   r0,@-r4     /* ..and store as long */
-    shlr8   r1          /* get middle 2 bytes */
-    mov.w   r1,@-r4     /* store as word */
-    shlr16  r1          /* get upper byte */
-    mov.b   r1,@-r4     /* and store */
-    bt      .loop_do1
-
-    add     #4,r3       /* readjust end address */
-.last_do13:
-    cmp/hi  r5,r3       /* one long left? */
-    bf      .start_b2   /* no, jump to trailing byte loop */
-
-    mov.l   @r5+,r0     /* load last long & increment source addr */
-    add     #12,r4      /* increment dest addr */
-    mov.b   r0,@-r4     /* store low byte */
-    shlr8   r0          /* get middle 2 bytes */
-    mov.w   r0,@-r4     /* store as word */
-    shlr16  r0          /* get upper byte */
-    mov.b   r0,@-r4     /* and store */
-    bra     .start_b2   /* jump to trailing byte loop */
-    add     #-4,r4      /* readjust destination */
-
-    /* byte aligned destination (long + 3) */
-    .align  2
-.loop_do3:
-    mov.l   @r5+,r1     /* load first long & increment source addr */
-    add     #16,r4      /* increment dest addr */
-    mov.l   @r5+,r0     /* load second long & increment source addr */
-    mov     r1,r2       /* copy first long */
-    mov.b   r0,@-r4     /* store low byte of second long */
-    shlr8   r0          /* get middle 2 bytes */
-    mov.w   r0,@-r4     /* store as word */
-    shlr16  r0          /* get upper byte */
-    shll8   r2          /* move lower 3 bytes of first long one up.. */
-    or      r2,r0       /* ..combine with the 1 byte of second long.. */
-    mov.l   r0,@-r4     /* ..and store as long */
-    shlr16  r1          /* get upper byte of first long.. */
-    shlr8   r1
-    cmp/hi  r5,r3       /* runs r5 up to last or second last long bound */
-    mov.b   r1,@-r4     /* ..and store */
-    bt      .loop_do3
-
-    bra     .last_do13  /* handle last longword: reuse routine for (long + 1) */
-    add     #4,r3       /* readjust end address */
-
-    /* trailing byte loop: copies 0..3 bytes (or all for < 11 in total) */
-    .align  2
-.loop_b2:
-    mov.b   @r5+,r0     /* load byte & increment source addr */
-    add     #1,r4       /* increment dest addr */
-    mov.b   r0,@(7,r4)  /* store byte */
-.start_b2:
-    cmp/hi  r5,r6       /* runs r5 up to end address */
-    bt      .loop_b2
-
-    rts
-    mov     r7,r0       /* return dest start address */
-.end:
-    .size   _memcpy,.end-_memcpy
-#elif defined(CPU_COLDFIRE)
 #define FULLSPEED /* use burst writing for word aligned destinations */
 .align  2
 .global memcpy
@@ -875,4 +678,3 @@ __memcpy_fwd_entry:
 
 .end:
     .size   memcpy,.end-memcpy
-#endif
diff --git a/firmware/common/memmove_a.S b/firmware/target/coldfire/memmove-coldfire.S
index d7421333df..bdd2e2e206 100755
--- a/firmware/common/memmove_a.S
+++ b/firmware/target/coldfire/memmove-coldfire.S
@@ -20,206 +20,6 @@
 
 .section .icode,"ax",@progbits
 
-#if CONFIG_CPU == SH7034
-    .align      2
-    .global     _memmove
-    .type       _memmove,@function
-
-/* Moves <length> bytes of data in memory from <source> to <dest>
- * Regions may overlap.
- * This version is optimized for speed, and needs the corresponding memcpy
- * implementation for the forward copy branch.
- *
- * arguments:
- *  r4 - destination address
- *  r5 - source address
- *  r6 - length
- *
- * return value:
- *  r0 - destination address (like ANSI version)
- *
- * register usage:
- *  r0 - data / scratch
- *  r1 - 2nd data / scratch
- *  r2 - scratch
- *  r3 - last long bound / adjusted start address (only if >= 11 bytes)
- *  r4 - current dest address
- *  r5 - source start address
- *  r6 - current source address
- *
- * The instruction order is devised in a way to utilize the pipelining
- * of the SH1 to the max. The routine also tries to utilize fast page mode.
- */
-
-_memmove:
-    cmp/hi  r4,r5       /* source > destination */
-    bf      .backward   /* no: backward copy */
-    mov.l   .memcpy_fwd,r0
-    jmp     @r0
-    mov     r4,r7       /* store dest for returning */
-
-    .align  2
-.memcpy_fwd:
-    .long   ___memcpy_fwd_entry
-
-.backward:
-    add     r6,r4       /* r4 = destination end */
-    mov     #11,r0
-    cmp/hs  r0,r6       /* at least 11 bytes to copy? (ensures 2 aligned longs) */
-    add     #-8,r5      /* adjust for late decrement (max. 2 longs) */
-    add     r5,r6       /* r6 = source end - 8 */
-    bf      .start_b2r  /* no: jump directly to byte loop */
-
-    mov     #-4,r3      /* r3 = 0xfffffffc */
-    and     r6,r3       /* r3 = last source long bound */
-    cmp/hi  r3,r6       /* already aligned? */
-    bf      .end_b1r    /* yes: skip leading byte loop */
-
-.loop_b1r:
-    mov.b   @(7,r6),r0  /* load byte */
-    add     #-1,r6      /* decrement source addr */
-    mov.b   r0,@-r4     /* store byte */
-    cmp/hi  r3,r6       /* runs r6 down to last long bound */
-    bt      .loop_b1r
-
-.end_b1r:
-    mov     #3,r1
-    and     r4,r1       /* r1 = dest alignment offset */
-    mova    .jmptab_r,r0
-    mov.b   @(r0,r1),r1 /* select appropriate main loop.. */
-    add     r0,r1
-    mov     r5,r3       /* copy start adress to r3 */
-    jmp     @r1         /* ..and jump to it */
-    add     #7,r3       /* adjust end addr for main loops doing 2 longs/pass */
-
-    /** main loops, copying 2 longs per pass to profit from fast page mode **/
-
-    /* long aligned destination (fastest) */
-    .align  2
-.loop_do0r:
-    mov.l   @r6,r1      /* load first long */
-    add     #-8,r6      /* decrement source addr */
-    mov.l   @(12,r6),r0 /* load second long */
-    cmp/hi  r3,r6       /* runs r6 down to first or second long bound */
-    mov.l   r0,@-r4     /* store second long */
-    mov.l   r1,@-r4     /* store first long; NOT ALIGNED - no speed loss here! */
-    bt      .loop_do0r
-
-    add     #-4,r3      /* readjust end address */
-    cmp/hi  r3,r6       /* first long left? */
-    bf      .start_b2r  /* no, jump to trailing byte loop */
-
-    mov.l   @(4,r6),r0  /* load first long */
-    add     #-4,r6      /* decrement source addr */
-    bra     .start_b2r  /* jump to trailing byte loop */
-    mov.l   r0,@-r4     /* store first long */
-
-    /* word aligned destination (long + 2) */
-    .align  2
-.loop_do2r:
-    mov.l   @r6,r1      /* load first long */
-    add     #-8,r6      /* decrement source addr */
-    mov.l   @(12,r6),r0 /* load second long */
-    cmp/hi  r3,r6       /* runs r6 down to first or second long bound */
-    mov.w   r0,@-r4     /* store low word of second long */
-    xtrct   r1,r0       /* extract low word of first long & high word of second long */
-    mov.l   r0,@-r4     /* and store as long */
-    shlr16  r1          /* get high word of first long */
-    mov.w   r1,@-r4     /* and store it */
-    bt      .loop_do2r
-
-    add     #-4,r3      /* readjust end address */
-    cmp/hi  r3,r6       /* first long left? */
-    bf      .start_b2r  /* no, jump to trailing byte loop */
-
-    mov.l   @(4,r6),r0  /* load first long & decrement source addr */
-    add     #-4,r6      /* decrement source addr */
-    mov.w   r0,@-r4     /* store low word */
-    shlr16  r0          /* get high word */
-    bra     .start_b2r  /* jump to trailing byte loop */
-    mov.w   r0,@-r4     /* and store it */
-
-    /* jumptable for loop selector */
-    .align  2
-.jmptab_r:
-    .byte   .loop_do0r - .jmptab_r  /* placed in the middle because the SH1 */
-    .byte   .loop_do1r - .jmptab_r  /* loads bytes sign-extended. Otherwise */
-    .byte   .loop_do2r - .jmptab_r  /* the last loop would be out of reach */
-    .byte   .loop_do3r - .jmptab_r  /* of the offset range. */
-
-    /* byte aligned destination (long + 1) */
-    .align  2
-.loop_do1r:
-    mov.l   @r6,r1      /* load first long */
-    add     #-8,r6      /* decrement source addr */
-    mov.l   @(12,r6),r0 /* load second long */
-    cmp/hi  r3,r6       /* runs r6 down to first or second long bound */
-    mov.b   r0,@-r4     /* store low byte of second long */
-    shlr8   r0          /* get upper 3 bytes */
-    mov     r1,r2       /* copy first long */
-    shll16  r2          /* move low byte of first long all the way up, .. */
-    shll8   r2
-    or      r2,r0       /* ..combine with the 3 bytes of second long.. */
-    mov.l   r0,@-r4     /* ..and store as long */
-    shlr8   r1          /* get middle 2 bytes */
-    mov.w   r1,@-r4     /* store as word */
-    shlr16  r1          /* get upper byte */
-    mov.b   r1,@-r4     /* and store */
-    bt      .loop_do1r
-
-    add     #-4,r3      /* readjust end address */
-.last_do13r:
-    cmp/hi  r3,r6       /* first long left? */
-    bf      .start_b2r  /* no, jump to trailing byte loop */
-
-    nop                 /* alignment */
-    mov.l   @(4,r6),r0  /* load first long */
-    add     #-4,r6      /* decrement source addr */
-    mov.b   r0,@-r4     /* store low byte */
-    shlr8   r0          /* get middle 2 bytes */
-    mov.w   r0,@-r4     /* store as word */
-    shlr16  r0          /* get upper byte */
-    bra     .start_b2r  /* jump to trailing byte loop */
-    mov.b   r0,@-r4     /* and store */
-
-    /* byte aligned destination (long + 3) */
-    .align  2
-.loop_do3r:
-    mov.l   @r6,r1      /* load first long */
-    add     #-8,r6      /* decrement source addr */
-    mov.l   @(12,r6),r0 /* load second long */
-    mov     r1,r2       /* copy first long */
-    mov.b   r0,@-r4     /* store low byte of second long */
-    shlr8   r0          /* get middle 2 bytes */
-    mov.w   r0,@-r4     /* store as word */
-    shlr16  r0          /* get upper byte */
-    shll8   r2          /* move lower 3 bytes of first long one up.. */
-    or      r2,r0       /* ..combine with the 1 byte of second long.. */
-    mov.l   r0,@-r4     /* ..and store as long */
-    shlr16  r1          /* get upper byte of first long */
-    shlr8   r1
-    cmp/hi  r3,r6       /* runs r6 down to first or second long bound */
-    mov.b   r1,@-r4     /* ..and store */
-    bt      .loop_do3r
-
-    bra     .last_do13r /* handle first longword: reuse routine for (long + 1) */
-    add     #-4,r3      /* readjust end address */
-
-    /* trailing byte loop: copies 0..3 bytes (or all for < 11 in total) */
-    .align  2
-.loop_b2r:
-    mov.b   @(7,r6),r0  /* load byte */
-    add     #-1,r6      /* decrement source addr */
-    mov.b   r0,@-r4     /* store byte */
-.start_b2r:
-    cmp/hi  r5,r6       /* runs r6 down to start address */
-    bt      .loop_b2r
-
-    rts
-    mov     r4,r0       /* return dest start address */
-.end:
-    .size   _memmove,.end-_memmove
-#elif defined(CPU_COLDFIRE)
 #define FULLSPEED /* use burst writing for word aligned destinations */
 .align  2
 .global memmove
@@ -866,4 +666,3 @@ memmove:
 
 .end:
     .size   memmove,.end-memmove
-#endif
diff --git a/firmware/common/memset_a.S b/firmware/target/coldfire/memset-coldfire.S
index 6dbdab9595..7c9fe88463 100644..100755
--- a/firmware/common/memset_a.S
+++ b/firmware/target/coldfire/memset-coldfire.S
@@ -18,99 +18,9 @@
 ****************************************************************************/
 #include "config.h"
 
-#ifdef CPU_ARM
-    .section    .icode,"ax",%progbits
-#else
 .section .icode,"ax",@progbits
-#endif
 
 .align 2
-#if CONFIG_CPU == SH7034
-    .global     _memset
-    .type       _memset,@function
-
-/* Fills a memory region with specified byte value
- * This version is optimized for speed
- *
- * arguments:
- *  r4 - start address
- *  r5 - data
- *  r6 - length
- *
- * return value:
- *  r0 - start address (like ANSI version)
- *
- * register usage:
- *  r0 - temporary
- *  r1 - start address +11 for main loop
- *  r4 - start address
- *  r5 - data (spread to all 4 bytes when using long stores)
- *  r6 - current address (runs down from end to start)
- *
- * The instruction order below is devised in a way to utilize the pipelining
- * of the SH1 to the max. The routine fills memory from end to start in
- * order to utilize the auto-decrementing store instructions.
- */
-
-_memset:
-    neg     r4,r0
-    and     #3,r0       /* r0 = (4 - align_offset) % 4 */
-    add     #4,r0
-    cmp/hs  r0,r6       /* at least one aligned longword to fill? */
-    add     r4,r6       /* r6 = end_address */
-    bf      .no_longs   /* no, jump directly to byte loop */
-
-    extu.b  r5,r5       /* start: spread data to all 4 bytes */
-    swap.b  r5,r0
-    or      r0,r5       /* data now in 2 lower bytes of r5 */
-    swap.w  r5,r0
-    or      r0,r5       /* data now in all 4 bytes of r5 */
-
-    mov     r6,r0
-    tst     #3,r0       /* r0 already long aligned? */
-    bt      .end_b1     /* yes: skip loop */
-
-    /* leading byte loop: sets 0..3 bytes */
-.loop_b1:
-    mov.b   r5,@-r0     /* store byte */
-    tst     #3,r0       /* r0 long aligned? */
-    bf      .loop_b1    /* runs r0 down until long aligned */
-
-    mov     r0,r6       /* r6 = last long bound */
-    nop                 /* keep alignment */
-
-.end_b1:
-    mov     r4,r1       /* r1 = start_address... */
-    add     #11,r1      /* ... + 11, combined for rounding and offset */
-    xor     r1,r0
-    tst     #4,r0       /* bit 2 tells whether an even or odd number of */
-    bf      .loop_odd   /* longwords to set */
-
-    /* main loop: set 2 longs per pass */
-.loop_2l:
-    mov.l   r5,@-r6     /* store first long */
-.loop_odd:
-    cmp/hi  r1,r6       /* runs r6 down to first long bound */
-    mov.l   r5,@-r6     /* store second long */
-    bt      .loop_2l
-
-.no_longs:
-    cmp/hi  r4,r6       /* any bytes left? */
-    bf      .end_b2     /* no: skip loop */
-
-    /* trailing byte loop */
-.loop_b2:
-    mov.b   r5,@-r6     /* store byte */
-    cmp/hi  r4,r6       /* runs r6 down to the start address */
-    bt      .loop_b2
-
-.end_b2:
-    rts
-    mov     r4,r0       /* return start address */
-
-.end:
-    .size   _memset,.end-_memset
-#elif defined(CPU_COLDFIRE)
 .global memset
 .type   memset,@function
 
@@ -238,80 +148,3 @@ memset:
 
 .end:
     .size   memset,.end-memset
-
-#elif defined(CPU_ARM)
-
-/* The following code is based on code found in Linux kernel version 2.6.15.3
- * linux/arch/arm/lib/memset.S
- *
- * Copyright (C) 1995-2000 Russell King
- */
-
-/* This code will align a pointer for memset, if needed */
-1:  cmp     r2, #4          @ 1 do we have enough
-    blt     5f              @ 1 bytes to align with?
-    cmp     r3, #2          @ 1
-    strgtb  r1, [r0, #-1]!  @ 1
-    strgeb  r1, [r0, #-1]!  @ 1
-    strb    r1, [r0, #-1]!  @ 1
-    sub     r2, r2, r3      @ 1 r2 = r2 - r3
-    b       2f
-
-    .global memset
-    .type   memset,%function
-memset:
-    add     r0, r0, r2      @ we'll write backwards in memory
-    ands    r3, r0, #3      @ 1 unaligned?
-    bne     1b              @ 1
-2:
-/*
- * we know that the pointer in r0 is aligned to a word boundary.
- */
-    orr     r1, r1, r1, lsl #8
-    orr     r1, r1, r1, lsl #16
-    mov     r3, r1
-    cmp     r2, #16
-    blt     5f
-/*
- * We need an extra register for this loop - save the return address and
- * use the LR
- */
-    str     lr, [sp, #-4]!
-    mov     ip, r1
-    mov     lr, r1
-
-3:  subs    r2, r2, #64
-    stmgedb r0!, {r1, r3, ip, lr}   @ 64 bytes at a time.
-    stmgedb r0!, {r1, r3, ip, lr}
-    stmgedb r0!, {r1, r3, ip, lr}
-    stmgedb r0!, {r1, r3, ip, lr}
-    bgt     3b
-    ldmeqfd sp!, {pc}               @ Now <64 bytes to go.
-/*
- * No need to correct the count; we're only testing bits from now on
- */
-    tst     r2, #32
-    stmnedb r0!, {r1, r3, ip, lr}
-    stmnedb r0!, {r1, r3, ip, lr}
-    tst     r2, #16
-    stmnedb r0!, {r1, r3, ip, lr}
-    ldr     lr, [sp], #4
-
-5:  tst     r2, #8
-    stmnedb r0!, {r1, r3}
-    tst     r2, #4
-    strne   r1, [r0, #-4]!
-/*
- * When we get here, we've got less than 4 bytes to zero. We
- * may have an unaligned pointer as well.
- */
-6:  tst     r2, #2
-    strneb  r1, [r0, #-1]!
-    strneb  r1, [r0, #-1]!
-    tst     r2, #1
-    strneb  r1, [r0, #-1]!
-    mov     pc, lr
-end:
-    .size   memset,.end-memset
-#endif
-
diff --git a/firmware/common/memset16_a.S b/firmware/target/coldfire/memset16-coldfire.S
index 9ab1bdcb5b..d9f72f683f 100755
--- a/firmware/common/memset16_a.S
+++ b/firmware/target/coldfire/memset16-coldfire.S
@@ -20,7 +20,6 @@
 
 .section .icode,"ax",@progbits
 
-#ifdef CPU_COLDFIRE
 .global memset16
 .type   memset16,@function
 
@@ -143,4 +142,3 @@ memset16:
 
 .end:
     .size   memset16,.end-memset16
-#endif
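Unlike the standard routines, memset16 is a Rockbox-specific helper: it fills a region with a 16-bit value, which is what LCD framebuffer code wants. A plain C sketch of the assumed semantics follows; the signature matches the Rockbox prototype as I understand it, so treat it as an assumption. The Coldfire assembly gains its speed by fusing halfwords into 32-bit and burst line stores.

#include <stddef.h>
#include <stdint.h>

/* Assumed semantics of memset16: write 'len' halfwords of 'val'. */
void *memset16_model(void *dst, int val, size_t len)
{
    uint16_t *p = (uint16_t *)dst;
    while (len--)
        *p++ = (uint16_t)val;
    return dst;
}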
diff --git a/firmware/target/sh/memcpy-sh.S b/firmware/target/sh/memcpy-sh.S
new file mode 100755
index 0000000000..0b5e086be9
--- /dev/null
+++ b/firmware/target/sh/memcpy-sh.S
@@ -0,0 +1,217 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2004-2005 by Jens Arnold
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+#include "config.h"
+
+    .section    .icode,"ax",@progbits
+
+    .align      2
+    .global     _memcpy
+    .global     ___memcpy_fwd_entry
+    .type       _memcpy,@function
+
+/* Copies <length> bytes of data in memory from <source> to <dest>
+ * This version is optimized for speed
+ *
+ * arguments:
+ *  r4 - destination address
+ *  r5 - source address
+ *  r6 - length
+ *
+ * return value:
+ *  r0 - destination address (like ANSI version)
+ *
+ * register usage:
+ *  r0 - data / scratch
+ *  r1 - 2nd data / scratch
+ *  r2 - scratch
+ *  r3 - first long bound / adjusted end address (only if >= 11 bytes)
+ *  r4 - current dest address
+ *  r5 - current source address
+ *  r6 - source end address
+ *  r7 - stored dest start address
+ *
+ * The instruction order is devised in a way to utilize the pipelining
+ * of the SH1 to the max. The routine also tries to utilize fast page mode.
+ */
+
+_memcpy:
+    mov     r4,r7       /* store dest for returning */
+___memcpy_fwd_entry:
+    add     #-8,r4      /* offset for early increment (max. 2 longs) */
+    mov     #11,r0
+    cmp/hs  r0,r6       /* at least 11 bytes to copy? (ensures 2 aligned longs) */
+    add     r5,r6       /* r6 = source_end */
+    bf      .start_b2   /* no: jump directly to byte loop */
+
+    mov     #3,r0
+    neg     r5,r3
+    and     r0,r3       /* r3 = (4 - align_offset) % 4 */
+    tst     r3,r3       /* already aligned? */
+    bt      .end_b1     /* yes: skip leading byte loop */
+
+    add     r5,r3       /* r3 = first source long bound */
+
+    /* leading byte loop: copies 0..3 bytes */
+.loop_b1:
+    mov.b   @r5+,r0     /* load byte & increment source addr */
+    add     #1,r4       /* increment dest addr */
+    mov.b   r0,@(7,r4)  /* store byte */
+    cmp/hi  r5,r3       /* runs r5 up to first long bound */
+    bt      .loop_b1
+    /* now r5 is always at a long boundary */
+    /* -> memory reading is done in longs for all dest alignments */
+
+    /* selector for main copy loop */
+.end_b1:
+    mov     #3,r1
+    and     r4,r1       /* r1 = dest alignment offset */
+    mova    .jmptab,r0
+    mov.b   @(r0,r1),r1 /* select appropriate main loop */
+    add     r0,r1
+    mov     r6,r3       /* move end address to r3 */
+    jmp     @r1         /* and jump to it */
+    add     #-7,r3      /* adjust end addr for main loops doing 2 longs/pass */
+
+    /** main loops, copying 2 longs per pass to profit from fast page mode **/
+
+    /* long aligned destination (fastest) */
+    .align  2
+.loop_do0:
+    mov.l   @r5+,r1     /* load first long & increment source addr */
+    add     #16,r4      /* increment dest addr & account for decrementing stores */
+    mov.l   @r5+,r0     /* load second long & increment source addr */
+    cmp/hi  r5,r3       /* runs r5 up to last or second last long bound */
+    mov.l   r0,@-r4     /* store second long */
+    mov.l   r1,@-r4     /* store first long; NOT ALIGNED - no speed loss here! */
+    bt      .loop_do0
+
+    add     #4,r3       /* readjust end address */
+    cmp/hi  r5,r3       /* one long left? */
+    bf      .start_b2   /* no, jump to trailing byte loop */
+
+    mov.l   @r5+,r0     /* load last long & increment source addr */
+    add     #4,r4       /* increment dest addr */
+    bra     .start_b2   /* jump to trailing byte loop */
+    mov.l   r0,@(4,r4)  /* store last long */
+
+    /* word aligned destination (long + 2) */
+    .align  2
+.loop_do2:
+    mov.l   @r5+,r1     /* load first long & increment source addr */
+    add     #16,r4      /* increment dest addr */
+    mov.l   @r5+,r0     /* load second long & increment source addr */
+    cmp/hi  r5,r3       /* runs r5 up to last or second last long bound */
+    mov.w   r0,@-r4     /* store low word of second long */
+    xtrct   r1,r0       /* extract low word of first long & high word of second long */
+    mov.l   r0,@-r4     /* and store as long */
+    swap.w  r1,r0       /* get high word of first long */
+    mov.w   r0,@-r4     /* and store it */
+    bt      .loop_do2
+
+    add     #4,r3       /* readjust end address */
+    cmp/hi  r5,r3       /* one long left? */
+    bf      .start_b2   /* no, jump to trailing byte loop */
+
+    mov.l   @r5+,r0     /* load last long & increment source addr */
+    add     #4,r4       /* increment dest addr */
+    mov.w   r0,@(6,r4)  /* store low word */
+    shlr16  r0          /* get high word */
+    bra     .start_b2   /* jump to trailing byte loop */
+    mov.w   r0,@(4,r4)  /* and store it */
+
+    /* jumptable for loop selector */
+    .align  2
+.jmptab:
+    .byte   .loop_do0 - .jmptab /* placed in the middle because the SH1 */
+    .byte   .loop_do1 - .jmptab /* loads bytes sign-extended. Otherwise */
+    .byte   .loop_do2 - .jmptab /* the last loop would be out of reach */
+    .byte   .loop_do3 - .jmptab /* of the offset range. */
+
+    /* byte aligned destination (long + 1) */
+    .align  2
+.loop_do1:
+    mov.l   @r5+,r1     /* load first long & increment source addr */
+    add     #16,r4      /* increment dest addr */
+    mov.l   @r5+,r0     /* load second long & increment source addr */
+    cmp/hi  r5,r3       /* runs r5 up to last or second last long bound */
+    mov.b   r0,@-r4     /* store low byte of second long */
+    shlr8   r0          /* get upper 3 bytes */
+    mov     r1,r2       /* copy first long */
+    shll16  r2          /* move low byte of first long all the way up, .. */
+    shll8   r2
+    or      r2,r0       /* ..combine with the 3 bytes of second long.. */
+    mov.l   r0,@-r4     /* ..and store as long */
+    shlr8   r1          /* get middle 2 bytes */
+    mov.w   r1,@-r4     /* store as word */
+    shlr16  r1          /* get upper byte */
+    mov.b   r1,@-r4     /* and store */
+    bt      .loop_do1
+
+    add     #4,r3       /* readjust end address */
+.last_do13:
+    cmp/hi  r5,r3       /* one long left? */
+    bf      .start_b2   /* no, jump to trailing byte loop */
+
+    mov.l   @r5+,r0     /* load last long & increment source addr */
+    add     #12,r4      /* increment dest addr */
+    mov.b   r0,@-r4     /* store low byte */
+    shlr8   r0          /* get middle 2 bytes */
+    mov.w   r0,@-r4     /* store as word */
+    shlr16  r0          /* get upper byte */
+    mov.b   r0,@-r4     /* and store */
+    bra     .start_b2   /* jump to trailing byte loop */
+    add     #-4,r4      /* readjust destination */
+
+    /* byte aligned destination (long + 3) */
+    .align  2
+.loop_do3:
+    mov.l   @r5+,r1     /* load first long & increment source addr */
+    add     #16,r4      /* increment dest addr */
+    mov.l   @r5+,r0     /* load second long & increment source addr */
+    mov     r1,r2       /* copy first long */
+    mov.b   r0,@-r4     /* store low byte of second long */
+    shlr8   r0          /* get middle 2 bytes */
+    mov.w   r0,@-r4     /* store as word */
+    shlr16  r0          /* get upper byte */
+    shll8   r2          /* move lower 3 bytes of first long one up.. */
+    or      r2,r0       /* ..combine with the 1 byte of second long.. */
+    mov.l   r0,@-r4     /* ..and store as long */
+    shlr16  r1          /* get upper byte of first long.. */
+    shlr8   r1
+    cmp/hi  r5,r3       /* runs r5 up to last or second last long bound */
+    mov.b   r1,@-r4     /* ..and store */
+    bt      .loop_do3
+
+    bra     .last_do13  /* handle last longword: reuse routine for (long + 1) */
+    add     #4,r3       /* readjust end address */
+
+    /* trailing byte loop: copies 0..3 bytes (or all for < 11 in total) */
+    .align  2
+.loop_b2:
+    mov.b   @r5+,r0     /* load byte & increment source addr */
+    add     #1,r4       /* increment dest addr */
+    mov.b   r0,@(7,r4)  /* store byte */
+.start_b2:
+    cmp/hi  r5,r6       /* runs r5 up to end address */
+    bt      .loop_b2
+
+    rts
+    mov     r7,r0       /* return dest start address */
+.end:
+    .size   _memcpy,.end-_memcpy
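The SH1 memcpy first byte-copies until the source is long-aligned, then dispatches through the .jmptab byte table on the destination's residual alignment (0..3) into one of four unrolled loops; the misaligned cases reassemble each output long from two input longs with shift/or (and xtrct for the word-aligned case). Here is a hedged C sketch of that align-then-dispatch shape, with only the aligned case written out and the shift-merging cases left to the byte loop; names are hypothetical:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical sketch of the dispatch structure in _memcpy above. */
void *memcpy_model(void *dst, const void *src, size_t n)
{
    unsigned char *d = dst;
    const unsigned char *s = src;

    while (n > 0 && ((uintptr_t)s & 3)) {  /* leading bytes: align source */
        *d++ = *s++;
        n--;
    }
    switch ((uintptr_t)d & 3) {  /* the asm indexes .jmptab with this */
    case 0:                      /* dest long-aligned: whole-long copies */
        for (; n >= 4; n -= 4) {
            *(uint32_t *)d = *(const uint32_t *)s;
            d += 4;
            s += 4;
        }
        break;
    default:                     /* cases 1..3: the asm shift-merges two
                                    longs per store; omitted here, the
                                    byte loop below still copies them */
        break;
    }
    while (n--)                  /* trailing (or all remaining) bytes */
        *d++ = *s++;
    return dst;
}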
diff --git a/firmware/target/sh/memmove-sh.S b/firmware/target/sh/memmove-sh.S
new file mode 100755
index 0000000000..9ae9ae5fa2
--- /dev/null
+++ b/firmware/target/sh/memmove-sh.S
@@ -0,0 +1,220 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006 by Jens Arnold
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+#include "config.h"
+
+    .section    .icode,"ax",@progbits
+
+    .align      2
+    .global     _memmove
+    .type       _memmove,@function
+
+/* Moves <length> bytes of data in memory from <source> to <dest>
+ * Regions may overlap.
+ * This version is optimized for speed, and needs the corresponding memcpy
+ * implementation for the forward copy branch.
+ *
+ * arguments:
+ *  r4 - destination address
+ *  r5 - source address
+ *  r6 - length
+ *
+ * return value:
+ *  r0 - destination address (like ANSI version)
+ *
+ * register usage:
+ *  r0 - data / scratch
+ *  r1 - 2nd data / scratch
+ *  r2 - scratch
+ *  r3 - last long bound / adjusted start address (only if >= 11 bytes)
+ *  r4 - current dest address
+ *  r5 - source start address
+ *  r6 - current source address
+ *
+ * The instruction order is devised in a way to utilize the pipelining
+ * of the SH1 to the max. The routine also tries to utilize fast page mode.
+ */
+
+_memmove:
+    cmp/hi  r4,r5       /* source > destination */
+    bf      .backward   /* no: backward copy */
+    mov.l   .memcpy_fwd,r0
+    jmp     @r0
+    mov     r4,r7       /* store dest for returning */
+
+    .align  2
+.memcpy_fwd:
+    .long   ___memcpy_fwd_entry
+
+.backward:
+    add     r6,r4       /* r4 = destination end */
+    mov     #11,r0
+    cmp/hs  r0,r6       /* at least 11 bytes to copy? (ensures 2 aligned longs) */
+    add     #-8,r5      /* adjust for late decrement (max. 2 longs) */
+    add     r5,r6       /* r6 = source end - 8 */
+    bf      .start_b2r  /* no: jump directly to byte loop */
+
+    mov     #-4,r3      /* r3 = 0xfffffffc */
+    and     r6,r3       /* r3 = last source long bound */
+    cmp/hi  r3,r6       /* already aligned? */
+    bf      .end_b1r    /* yes: skip leading byte loop */
+
+.loop_b1r:
+    mov.b   @(7,r6),r0  /* load byte */
+    add     #-1,r6      /* decrement source addr */
+    mov.b   r0,@-r4     /* store byte */
+    cmp/hi  r3,r6       /* runs r6 down to last long bound */
+    bt      .loop_b1r
+
+.end_b1r:
+    mov     #3,r1
+    and     r4,r1       /* r1 = dest alignment offset */
+    mova    .jmptab_r,r0
+    mov.b   @(r0,r1),r1 /* select appropriate main loop.. */
+    add     r0,r1
+    mov     r5,r3       /* copy start adress to r3 */
+    jmp     @r1         /* ..and jump to it */
+    add     #7,r3       /* adjust end addr for main loops doing 2 longs/pass */
+
+    /** main loops, copying 2 longs per pass to profit from fast page mode **/
+
+    /* long aligned destination (fastest) */
+    .align  2
+.loop_do0r:
+    mov.l   @r6,r1      /* load first long */
+    add     #-8,r6      /* decrement source addr */
+    mov.l   @(12,r6),r0 /* load second long */
+    cmp/hi  r3,r6       /* runs r6 down to first or second long bound */
+    mov.l   r0,@-r4     /* store second long */
+    mov.l   r1,@-r4     /* store first long; NOT ALIGNED - no speed loss here! */
+    bt      .loop_do0r
+
+    add     #-4,r3      /* readjust end address */
+    cmp/hi  r3,r6       /* first long left? */
+    bf      .start_b2r  /* no, jump to trailing byte loop */
+
+    mov.l   @(4,r6),r0  /* load first long */
+    add     #-4,r6      /* decrement source addr */
+    bra     .start_b2r  /* jump to trailing byte loop */
+    mov.l   r0,@-r4     /* store first long */
+
+    /* word aligned destination (long + 2) */
+    .align  2
+.loop_do2r:
+    mov.l   @r6,r1      /* load first long */
+    add     #-8,r6      /* decrement source addr */
+    mov.l   @(12,r6),r0 /* load second long */
+    cmp/hi  r3,r6       /* runs r6 down to first or second long bound */
+    mov.w   r0,@-r4     /* store low word of second long */
+    xtrct   r1,r0       /* extract low word of first long & high word of second long */
+    mov.l   r0,@-r4     /* and store as long */
+    shlr16  r1          /* get high word of first long */
+    mov.w   r1,@-r4     /* and store it */
+    bt      .loop_do2r
+
+    add     #-4,r3      /* readjust end address */
+    cmp/hi  r3,r6       /* first long left? */
+    bf      .start_b2r  /* no, jump to trailing byte loop */
+
+    mov.l   @(4,r6),r0  /* load first long & decrement source addr */
+    add     #-4,r6      /* decrement source addr */
+    mov.w   r0,@-r4     /* store low word */
+    shlr16  r0          /* get high word */
+    bra     .start_b2r  /* jump to trailing byte loop */
+    mov.w   r0,@-r4     /* and store it */
+
+    /* jumptable for loop selector */
+    .align  2
+.jmptab_r:
+    .byte   .loop_do0r - .jmptab_r  /* placed in the middle because the SH1 */
+    .byte   .loop_do1r - .jmptab_r  /* loads bytes sign-extended. Otherwise */
+    .byte   .loop_do2r - .jmptab_r  /* the last loop would be out of reach */
+    .byte   .loop_do3r - .jmptab_r  /* of the offset range. */
+
+    /* byte aligned destination (long + 1) */
+    .align  2
+.loop_do1r:
+    mov.l   @r6,r1      /* load first long */
+    add     #-8,r6      /* decrement source addr */
+    mov.l   @(12,r6),r0 /* load second long */
+    cmp/hi  r3,r6       /* runs r6 down to first or second long bound */
+    mov.b   r0,@-r4     /* store low byte of second long */
+    shlr8   r0          /* get upper 3 bytes */
+    mov     r1,r2       /* copy first long */
+    shll16  r2          /* move low byte of first long all the way up, .. */
+    shll8   r2
+    or      r2,r0       /* ..combine with the 3 bytes of second long.. */
+    mov.l   r0,@-r4     /* ..and store as long */
+    shlr8   r1          /* get middle 2 bytes */
+    mov.w   r1,@-r4     /* store as word */
+    shlr16  r1          /* get upper byte */
+    mov.b   r1,@-r4     /* and store */
+    bt      .loop_do1r
+
+    add     #-4,r3      /* readjust end address */
+.last_do13r:
+    cmp/hi  r3,r6       /* first long left? */
+    bf      .start_b2r  /* no, jump to trailing byte loop */
+
+    nop                 /* alignment */
+    mov.l   @(4,r6),r0  /* load first long */
+    add     #-4,r6      /* decrement source addr */
+    mov.b   r0,@-r4     /* store low byte */
+    shlr8   r0          /* get middle 2 bytes */
+    mov.w   r0,@-r4     /* store as word */
+    shlr16  r0          /* get upper byte */
+    bra     .start_b2r  /* jump to trailing byte loop */
+    mov.b   r0,@-r4     /* and store */
+
+    /* byte aligned destination (long + 3) */
+    .align  2
+.loop_do3r:
+    mov.l   @r6,r1      /* load first long */
+    add     #-8,r6      /* decrement source addr */
+    mov.l   @(12,r6),r0 /* load second long */
+    mov     r1,r2       /* copy first long */
+    mov.b   r0,@-r4     /* store low byte of second long */
+    shlr8   r0          /* get middle 2 bytes */
+    mov.w   r0,@-r4     /* store as word */
+    shlr16  r0          /* get upper byte */
+    shll8   r2          /* move lower 3 bytes of first long one up.. */
+    or      r2,r0       /* ..combine with the 1 byte of second long.. */
+    mov.l   r0,@-r4     /* ..and store as long */
+    shlr16  r1          /* get upper byte of first long */
+    shlr8   r1
+    cmp/hi  r3,r6       /* runs r6 down to first or second long bound */
+    mov.b   r1,@-r4     /* ..and store */
+    bt      .loop_do3r
+
+    bra     .last_do13r /* handle first longword: reuse routine for (long + 1) */
+    add     #-4,r3      /* readjust end address */
+
+    /* trailing byte loop: copies 0..3 bytes (or all for < 11 in total) */
+    .align  2
+.loop_b2r:
+    mov.b   @(7,r6),r0  /* load byte */
+    add     #-1,r6      /* decrement source addr */
+    mov.b   r0,@-r4     /* store byte */
+.start_b2r:
+    cmp/hi  r5,r6       /* runs r6 down to start address */
+    bt      .loop_b2r
+
+    rts
+    mov     r4,r0       /* return dest start address */
+.end:
+    .size   _memmove,.end-_memmove
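_memmove only needs new code for the overlap-safe backward direction; when the source lies above the destination it jumps through ___memcpy_fwd_entry and reuses the forward copier. The decision reduces to the following hypothetical C model (names are mine, not the shipped code):

#include <stddef.h>

/* Hypothetical model of the direction choice in _memmove above. */
void *memmove_model(void *dst, const void *src, size_t n)
{
    unsigned char *d = dst;
    const unsigned char *s = src;

    if (s > d) {          /* cmp/hi r4,r5: forward copy is safe */
        while (n--)
            *d++ = *s++;
    } else if (s < d) {   /* overlap risk: copy from the end down */
        d += n;
        s += n;
        while (n--)
            *--d = *--s;
    }
    return dst;
}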
diff --git a/firmware/target/sh/memset-sh.S b/firmware/target/sh/memset-sh.S
new file mode 100755
index 0000000000..9b96b93f27
--- /dev/null
+++ b/firmware/target/sh/memset-sh.S
@@ -0,0 +1,107 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2004 by Jens Arnold
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+#include "config.h"
+
+    .section    .icode,"ax",@progbits
+
+    .align      2
+    .global     _memset
+    .type       _memset,@function
+
+/* Fills a memory region with specified byte value
+ * This version is optimized for speed
+ *
+ * arguments:
+ *  r4 - start address
+ *  r5 - data
+ *  r6 - length
+ *
+ * return value:
+ *  r0 - start address (like ANSI version)
+ *
+ * register usage:
+ *  r0 - temporary
+ *  r1 - start address +11 for main loop
+ *  r4 - start address
+ *  r5 - data (spread to all 4 bytes when using long stores)
+ *  r6 - current address (runs down from end to start)
+ *
+ * The instruction order below is devised in a way to utilize the pipelining
+ * of the SH1 to the max. The routine fills memory from end to start in
+ * order to utilize the auto-decrementing store instructions.
+ */
+
+_memset:
+    neg     r4,r0
+    and     #3,r0       /* r0 = (4 - align_offset) % 4 */
+    add     #4,r0
+    cmp/hs  r0,r6       /* at least one aligned longword to fill? */
+    add     r4,r6       /* r6 = end_address */
+    bf      .no_longs   /* no, jump directly to byte loop */
+
+    extu.b  r5,r5       /* start: spread data to all 4 bytes */
+    swap.b  r5,r0
+    or      r0,r5       /* data now in 2 lower bytes of r5 */
+    swap.w  r5,r0
+    or      r0,r5       /* data now in all 4 bytes of r5 */
+
+    mov     r6,r0
+    tst     #3,r0       /* r0 already long aligned? */
+    bt      .end_b1     /* yes: skip loop */
+
+    /* leading byte loop: sets 0..3 bytes */
+.loop_b1:
+    mov.b   r5,@-r0     /* store byte */
+    tst     #3,r0       /* r0 long aligned? */
+    bf      .loop_b1    /* runs r0 down until long aligned */
+
+    mov     r0,r6       /* r6 = last long bound */
+    nop                 /* keep alignment */
+
+.end_b1:
+    mov     r4,r1       /* r1 = start_address... */
+    add     #11,r1      /* ... + 11, combined for rounding and offset */
+    xor     r1,r0
+    tst     #4,r0       /* bit 2 tells whether an even or odd number of */
+    bf      .loop_odd   /* longwords to set */
+
+    /* main loop: set 2 longs per pass */
+.loop_2l:
+    mov.l   r5,@-r6     /* store first long */
+.loop_odd:
+    cmp/hi  r1,r6       /* runs r6 down to first long bound */
+    mov.l   r5,@-r6     /* store second long */
+    bt      .loop_2l
+
+.no_longs:
+    cmp/hi  r4,r6       /* any bytes left? */
+    bf      .end_b2     /* no: skip loop */
+
+    /* trailing byte loop */
+.loop_b2:
+    mov.b   r5,@-r6     /* store byte */
+    cmp/hi  r4,r6       /* runs r6 down to the start address */
+    bt      .loop_b2
+
+.end_b2:
+    rts
+    mov     r4,r0       /* return start address */
+
+.end:
+    .size   _memset,.end-_memset
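Before its long-store loops, _memset spreads the fill byte to all four bytes of r5 with two swap/or pairs (swap.b, then swap.w); the .end_b1 trick then XORs the current address with start+11 so that bit 2 indicates whether an odd longword must be stored before entering the two-longs-per-pass loop. The spreading step in C, as an illustration (the function name is mine):

#include <stdint.h>

/* Illustration of the swap.b/swap.w spreading in _memset above:
 * replicate a byte into all four bytes of a 32-bit value. */
static uint32_t spread_byte(uint8_t c)
{
    uint32_t r = c;    /* 000000cc */
    r |= r << 8;       /* 0000cccc  (swap.b r5,r0; or r0,r5) */
    r |= r << 16;      /* cccccccc  (swap.w r5,r0; or r0,r5) */
    return r;
}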
diff --git a/firmware/common/strlen_a.S b/firmware/target/sh/strlen-sh.S
index 34837605ac..34837605ac 100755
--- a/firmware/common/strlen_a.S
+++ b/firmware/target/sh/strlen-sh.S