author    Jörg Hohensohn <hohensoh@rockbox.org>    2004-03-18 22:06:36 +0000
committer Jörg Hohensohn <hohensoh@rockbox.org>    2004-03-18 22:06:36 +0000
commit    b61cf76aba768513ab2a1fa9e7cc80f59dbce02f (patch)
tree      db81683f1743eab4a04f28b4b604aa1bdb297a4b /firmware
parent    c40c069a67a41a85eb2525561de65b11d240d85e (diff)
patch #917153: faster memset()/memcpy()
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@4406 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'firmware')
-rw-r--r--  firmware/Makefile        |   7
-rw-r--r--  firmware/common/memcpy.S | 171
-rw-r--r--  firmware/common/memcpy.c | 117
-rw-r--r--  firmware/common/memset.S | 108
-rw-r--r--  firmware/common/memset.c | 109
5 files changed, 283 insertions, 229 deletions
diff --git a/firmware/Makefile b/firmware/Makefile
index 93ee38ac78..38bcd4cc86 100644
--- a/firmware/Makefile
+++ b/firmware/Makefile
@@ -25,16 +25,17 @@ endif
 ifdef DEBUG
 CFLAGS += -g -DDEBUG
 else
 CFLAGS += -fomit-frame-pointer -fschedule-insns
 endif
 
 SRC := $(wildcard drivers/*.c common/*.c *.c)
+SRC_S := $(wildcard drivers/*.S common/*.S *.S)
 
-OBJS := $(SRC:%.c=$(OBJDIR)/%.o) $(OBJDIR)/crt0.o $(OBJDIR)/bitswap.o $(OBJDIR)/descramble.o
+OBJS := $(SRC:%.c=$(OBJDIR)/%.o) $(SRC_S:%.S=$(OBJDIR)/%.o)
 DEPS:=.deps
 DEPDIRS:=$(DEPS) $(DEPS)/drivers $(DEPS)/common $(DEPS)/malloc
 
 DIRS = $(subst $(DEPS),".",$(DEPDIRS))
 
 OUTPUT = $(OBJDIR)/librockbox.a
 
diff --git a/firmware/common/memcpy.S b/firmware/common/memcpy.S
new file mode 100644
index 0000000000..2fb9f6a5a7
--- /dev/null
+++ b/firmware/common/memcpy.S
@@ -0,0 +1,171 @@
/***************************************************************************
 *             __________               __   ___.
 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 *                     \/            \/     \/    \/            \/
 * $Id$
 *
 * Copyright (C) 2004 by Jens Arnold
 *
 * All files in this archive are subject to the GNU General Public License.
 * See the file COPYING in the source tree root for full license agreement.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 ****************************************************************************/

    .section    .icode,"ax",@progbits

    .align      2
    .global     _memcpy
    .type       _memcpy,@function

/* Copies <length> bytes of data in memory from <source> to <dest>
 * This version is optimized for speed
 *
 * arguments:
 *  r4 - destination address
 *  r5 - source address
 *  r6 - length
 *
 * return value:
 *  r0 - destination address (like ANSI version)
 *
 * register usage:
 *  r0 - data / temporary
 *  r1 - bit mask for rounding to long bounds / 2nd data
 *  r2 - first long bound (only if >= 12 bytes)
 *  r3 - last long bound (-4) (only if >= 12 bytes)
 *  r4 - current dest address
 *  r5 - current source address
 *  r6 - source end address
 *  r7 - stored dest start address
 *
 * The instruction order below is devised in a way to utilize the pipelining
 * of the SH1 to the max. The routine also tries to utilize fast page mode.
 */

_memcpy:
    add     r5,r6       /* r6 = source_end */
    mov     r4,r7       /* store for returning */
    add     #-8,r4      /* adjust for early increments (max. 2 longs) */

    mov     r6,r0
    add     #-12,r0     /* r0 = r6 - 12; don't go below 12 here! */
    cmp/hs  r5,r0       /* >= 12 bytes to copy? */
    bf      .start_b2   /* no, jump into byte loop */

    mov     #-4,r1      /* r1 = 0xFFFFFFFC */

    mov     r5,r2
    add     #3,r2
    and     r1,r2       /* r2 = first source long bound */
    mov     r6,r3
    add     #-4,r3      /* end offset for copying 2 longs per pass */
    bra     .start_b1   /* jump into leading byte loop */
    and     r1,r3       /* r3 = last source long bound - 4 */

    /* leading byte loop: copies 0..3 bytes */
    .align  2
.loop_b1:
    mov.b   @r5+,r0     /* load byte & increment source addr */
    add     #1,r4       /* increment dest addr */
    mov.b   r0,@(7,r4)  /* store byte */
.start_b1:
    cmp/hi  r5,r2       /* runs r5 up to first long bound */
    bt      .loop_b1
    /* now r5 is always at a long boundary */
    /* -> memory reading is done in longs for all dest alignments */

    /* selector for main copy loop */
    mov     r4,r0
    tst     #3,r0       /* dest now also at long bound? */
    bt      .loop2_l    /* yes, do long copy */
    tst     #1,r0       /* dest now at least at word bound? */
    bt      .start4_w   /* yes, do word copy */

    /* main loop for byte aligned destination (fast) */
    /* copies 1 long per pass */
    add     #4,r3       /* reset end offset */
    add     #-1,r4      /* adjust to word alignment for word write+ */

.loop4_b:
    mov.l   @r5+,r0     /* load a long & increment source addr */
    add     #4,r4       /* increment dest addr */
    mov.b   r0,@(8,r4)  /* store low byte */
    shlr8   r0          /* get middle 2 bytes */
    mov.w   r0,@(6,r4)  /* store as word+ */
    shlr16  r0          /* get upper byte */
    mov.b   r0,@(5,r4)  /* and store */
    cmp/hi  r5,r3       /* runs r5 up to last long bound */
    bt      .loop4_b

    bra     .start_b2   /* jump to trailing byte loop */
    add     #1,r4       /* readjust */

    /* main loop for word aligned destination (faster) */
    /* copies 2 longs per pass, utilizing fast page mode */
.start4_w:
    add     #-2,r4      /* adjust to long alignment for long write+ */

.loop4_w:
    mov.l   @r5+,r1     /* load first long & increment source addr */
    add     #8,r4       /* increment dest addr */
    mov.l   @r5+,r0     /* load second long & increment source addr */
    cmp/hi  r5,r3       /* runs r5 up to last or second last long bound */
    mov.w   r0,@(8,r4)  /* store low word of second long */
    xtrct   r1,r0       /* extract low word of first long & high word of second long */
    mov.l   r0,@(4,r4)  /* and store as long+ */
    swap.w  r1,r0       /* get high word of first long */
    mov.w   r0,@(2,r4)  /* and store it */
    bt      .loop4_w

    add     #2,r4       /* readjust destination */
    add     #4,r3       /* reset end offset */
    cmp/hi  r5,r3       /* one long left? */
    bf      .start_b2   /* no, jump to trailing byte loop */

    mov.l   @r5+,r0     /* load last long & increment source addr */
    add     #4,r4       /* increment dest addr */
    mov.w   r0,@(6,r4)  /* store low word */
    shlr16  r0          /* get high word */
    bra     .start_b2   /* jump to trailing byte loop */
    mov.w   r0,@(4,r4)  /* and store it */

    /* main loop for long aligned destination (fastest) */
    /* copies 2 longs per pass, utilizing fast page mode */
.loop2_l:
    mov.l   @r5+,r1     /* load first long & increment source addr */
    add     #8,r4       /* increment dest addr */
    mov.l   @r5+,r0     /* load second long & increment source addr */
    cmp/hi  r5,r3       /* runs r5 up to last or second last long bound */
    mov.l   r1,@r4      /* store first long */
    mov.l   r0,@(4,r4)  /* store second long; NOT ALIGNED - no speed loss here! */
    bt      .loop2_l

    add     #4,r3       /* reset end offset */
    cmp/hi  r5,r3       /* one long left? */
    bf      .start_b2   /* no, jump to trailing byte loop */

    mov.l   @r5+,r0     /* load last long & increment source addr */
    add     #4,r4       /* increment dest addr */
    bra     .start_b2   /* jump to trailing byte loop */
    mov.l   r0,@(4,r4)  /* store last long */

    /* trailing byte loop: copies 0..3 bytes (or all for < 12 in total) */
.loop_b2:
    mov.b   @r5+,r0     /* load byte & increment source addr */
    add     #1,r4       /* increment dest addr */
    mov.b   r0,@(7,r4)  /* store byte */
.start_b2:
    cmp/hi  r5,r6       /* runs r5 up to end address */
    bt      .loop_b2

    rts
    mov     r7,r0       /* return dest start address */
.end:
    .size   _memcpy,.end-_memcpy

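For reference, the strategy of the routine above — run the source up to a long boundary, then pick the copy loop by the destination's alignment, with leading and trailing byte loops around it — can be modelled in portable C roughly as follows. This is a sketch, not the shipped code: the helper name is made up, and where the C composes and splits a long with shifts, the SH1 version does a single mov.l load followed by byte/word stores in the CPU's big-endian order.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Rough C model of the copy strategy used by the SH1 memcpy above.
 * Illustrative only; not part of the patch. */
void *memcpy_sketch(void *dst0, const void *src0, size_t n)
{
    unsigned char *d = dst0;
    const unsigned char *s = src0;

    if (n >= 12) {                       /* small copies go straight to the byte loop */
        while ((uintptr_t)s & 3) {       /* leading bytes: long-align the source */
            *d++ = *s++;
            n--;
        }

        if (((uintptr_t)d & 3) == 0) {
            /* destination long-aligned as well: copy whole longs */
            while (n >= 4) {
                uint32_t w;
                memcpy(&w, s, 4);        /* one aligned 32-bit load */
                memcpy(d, &w, 4);        /* one aligned 32-bit store */
                s += 4; d += 4; n -= 4;
            }
        } else {
            /* destination misaligned: still read long-sized chunks from the
             * aligned source, then store them in pieces (the assembly does
             * this with shlr8/shlr16/xtrct and byte/word stores) */
            while (n >= 4) {
                uint32_t w = ((uint32_t)s[0] << 24) | ((uint32_t)s[1] << 16)
                           | ((uint32_t)s[2] << 8)  |  (uint32_t)s[3];
                d[0] = (unsigned char)(w >> 24);
                d[1] = (unsigned char)(w >> 16);
                d[2] = (unsigned char)(w >> 8);
                d[3] = (unsigned char)w;
                s += 4; d += 4; n -= 4;
            }
        }
    }

    while (n--)                          /* trailing bytes (0..3, or all if n < 12) */
        *d++ = *s++;

    return dst0;                         /* ANSI contract: return the destination */
}
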
diff --git a/firmware/common/memcpy.c b/firmware/common/memcpy.c
deleted file mode 100644
index 49678920fa..0000000000
--- a/firmware/common/memcpy.c
+++ /dev/null
@@ -1,117 +0,0 @@
/*
FUNCTION
        <<memcpy>>---copy memory regions

ANSI_SYNOPSIS
        #include <string.h>
        void* memcpy(void *<[out]>, const void *<[in]>, size_t <[n]>);

TRAD_SYNOPSIS
        void *memcpy(<[out]>, <[in]>, <[n]>
        void *<[out]>;
        void *<[in]>;
        size_t <[n]>;

DESCRIPTION
        This function copies <[n]> bytes from the memory region
        pointed to by <[in]> to the memory region pointed to by
        <[out]>.

        If the regions overlap, the behavior is undefined.

RETURNS
        <<memcpy>> returns a pointer to the first byte of the <[out]>
        region.

PORTABILITY
<<memcpy>> is ANSI C.

<<memcpy>> requires no supporting OS subroutines.

QUICKREF
        memcpy ansi pure
 */

#include <_ansi.h>
#include <stddef.h>
#include <limits.h>

/* Nonzero if either X or Y is not aligned on a "long" boundary. */
#define UNALIGNED(X, Y) \
  (((long)X & (sizeof (long) - 1)) | ((long)Y & (sizeof (long) - 1)))

/* How many bytes are copied each iteration of the 4X unrolled loop. */
#define BIGBLOCKSIZE (sizeof (long) << 2)

/* How many bytes are copied each iteration of the word copy loop. */
#define LITTLEBLOCKSIZE (sizeof (long))

/* Threshhold for punting to the byte copier. */
#define TOO_SMALL(LEN) ((LEN) < BIGBLOCKSIZE)

_PTR
_DEFUN (memcpy, (dst0, src0, len0),
        _PTR dst0 _AND
        _CONST _PTR src0 _AND
        size_t len0) __attribute__ ((section (".icode")));

_PTR
_DEFUN (memcpy, (dst0, src0, len0),
        _PTR dst0 _AND
        _CONST _PTR src0 _AND
        size_t len0)
{
#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
  char *dst = (char *) dst0;
  char *src = (char *) src0;

  _PTR save = dst0;

  while (len0--)
    {
      *dst++ = *src++;
    }

  return save;
#else
  char *dst = dst0;
  _CONST char *src = src0;
  long *aligned_dst;
  _CONST long *aligned_src;
  unsigned int len = len0;

  /* If the size is small, or either SRC or DST is unaligned,
     then punt into the byte copy loop.  This should be rare.  */
  if (!TOO_SMALL(len) && !UNALIGNED (src, dst))
    {
      aligned_dst = (long*)dst;
      aligned_src = (long*)src;

      /* Copy 4X long words at a time if possible.  */
      while (len >= BIGBLOCKSIZE)
        {
          *aligned_dst++ = *aligned_src++;
          *aligned_dst++ = *aligned_src++;
          *aligned_dst++ = *aligned_src++;
          *aligned_dst++ = *aligned_src++;
          len -= BIGBLOCKSIZE;
        }

      /* Copy one long word at a time if possible.  */
      while (len >= LITTLEBLOCKSIZE)
        {
          *aligned_dst++ = *aligned_src++;
          len -= LITTLEBLOCKSIZE;
        }

      /* Pick up any residual with a byte copier.  */
      dst = (char*)aligned_dst;
      src = (char*)aligned_src;
    }

  while (len--)
    *dst++ = *src++;

  return dst0;
#endif /* not PREFER_SIZE_OVER_SPEED */
}
diff --git a/firmware/common/memset.S b/firmware/common/memset.S
new file mode 100644
index 0000000000..038915c475
--- /dev/null
+++ b/firmware/common/memset.S
@@ -0,0 +1,108 @@
/***************************************************************************
 *             __________               __   ___.
 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 *                     \/            \/     \/    \/            \/
 * $Id$
 *
 * Copyright (C) 2004 by Jens Arnold
 *
 * All files in this archive are subject to the GNU General Public License.
 * See the file COPYING in the source tree root for full license agreement.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 ****************************************************************************/

    .section    .icode,"ax",@progbits

    .align      2
    .global     _memset
    .type       _memset,@function

/* Fills a memory region with specified byte value
 * This version is optimized for speed
 *
 * arguments:
 *  r4 - start address
 *  r5 - data
 *  r6 - length
 *
 * return value:
 *  r0 - start address (like ANSI version)
 *
 * register usage:
 *  r0 - temporary
 *  r1 - bit mask for rounding to long bounds
 *  r2 - last / first long bound (only if >= 12 bytes)
 *  r4 - start address
 *  r5 - data (spread to all 4 bytes if >= 12 bytes)
 *  r6 - current address (runs down from end to start)
 *
 * The instruction order below is devised in a way to utilize the pipelining
 * of the SH1 to the max. The routine fills memory from end to start in
 * order to utilize the auto-decrementing store instructions.
 */

_memset:
    add     r4,r6       /* r6 = end_address */

    mov     r6,r0
    add     #-12,r0     /* r0 = r6 - 12; don't go below 12 here! */
    cmp/hs  r4,r0       /* >= 12 bytes to fill? */
    bf      .start_b2   /* no, jump directly to byte loop */

    extu.b  r5,r5       /* start: spread data to all 4 bytes */
    swap.b  r5,r0
    or      r0,r5       /* data now in 2 lower bytes of r5 */
    swap.w  r5,r0
    or      r0,r5       /* data now in all 4 bytes of r5 */

    mov     #-4,r1      /* r1 = 0xFFFFFFFC */

    mov     r6,r2
    bra     .start_b1
    and     r1,r2       /* r2 = last long bound */

    /* leading byte loop: sets 0..3 bytes */
.loop_b1:
    mov.b   r5,@-r6     /* store byte */
.start_b1:
    cmp/hi  r2,r6       /* runs r6 down to last long bound */
    bt      .loop_b1

    mov     r4,r2
    add     #11,r2      /* combined for rounding and offset */
    and     r1,r2       /* r2 = first long bound + 8 */

    /* main loop: set 2 longs per pass */
.loop2_l:
    mov.l   r5,@-r6     /* store first long */
    cmp/hi  r2,r6       /* runs r6 down to first or second long bound */
    mov.l   r5,@-r6     /* store second long */
    bt      .loop2_l

    add     #-8,r2      /* correct offset */
    cmp/hi  r2,r6       /* 1 long left? */
    bf      .start_b2   /* no, jump to trailing byte loop */

    bra     .start_b2   /* jump to trailing byte loop */
    mov.l   r5,@-r6     /* store last long */

    /* trailing byte loop */
    .align  2
.loop_b2:
    mov.b   r5,@-r6     /* store byte */
.start_b2:
    cmp/hi  r4,r6       /* runs r6 down to the start address */
    bt      .loop_b2

    rts
    mov     r4,r0       /* return start address */

.end:
    .size   _memset,.end-_memset

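The corresponding idea in the memset above is to widen the fill byte once — spread it to all four bytes of a register — and then cover the long-aligned middle of the region with 32-bit stores, leaving plain byte stores for the ragged ends. A rough portable C sketch of that shape follows; the assembly additionally works from the end of the region backwards so it can use the SH1's auto-decrementing @-Rn stores, which this sketch omits, and the helper name is made up:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Rough C model of the fill strategy used by the SH1 memset above.
 * Illustrative only; not part of the patch. */
void *memset_sketch(void *dst, int c, size_t n)
{
    unsigned char *p = dst;
    uint32_t pattern = (unsigned char)c;

    /* spread the byte to all four byte lanes: 0x000000ab -> 0xabababab */
    pattern |= pattern << 8;
    pattern |= pattern << 16;

    if (n >= 12) {                        /* small fills use the byte loop only */
        while ((uintptr_t)p & 3) {        /* leading bytes up to a long bound */
            *p++ = (unsigned char)c;
            n--;
        }
        while (n >= 4) {                  /* aligned middle: one long store per 4 bytes */
            memcpy(p, &pattern, 4);       /* stands in for an aligned 32-bit store */
            p += 4;
            n -= 4;
        }
    }

    while (n--)                           /* trailing bytes */
        *p++ = (unsigned char)c;

    return dst;                           /* ANSI contract: return the start address */
}
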
diff --git a/firmware/common/memset.c b/firmware/common/memset.c
deleted file mode 100644
index c370191cda..0000000000
--- a/firmware/common/memset.c
+++ /dev/null
@@ -1,109 +0,0 @@
/*
FUNCTION
        <<memset>>---set an area of memory

INDEX
        memset

ANSI_SYNOPSIS
        #include <string.h>
        void *memset(const void *<[dst]>, int <[c]>, size_t <[length]>);

TRAD_SYNOPSIS
        #include <string.h>
        void *memset(<[dst]>, <[c]>, <[length]>)
        void *<[dst]>;
        int <[c]>;
        size_t <[length]>;

DESCRIPTION
        This function converts the argument <[c]> into an unsigned
        char and fills the first <[length]> characters of the array
        pointed to by <[dst]> to the value.

RETURNS
        <<memset>> returns the value of <[m]>.

PORTABILITY
<<memset>> is ANSI C.

        <<memset>> requires no supporting OS subroutines.

QUICKREF
        memset ansi pure
*/

#include <string.h>

#define LBLOCKSIZE (sizeof(long))
#define UNALIGNED(X)   ((long)X & (LBLOCKSIZE - 1))
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)

_PTR
_DEFUN (memset, (m, c, n),
        _PTR m _AND
        int c _AND
        size_t n)
{
#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
  char *s = (char *) m;

  while (n-- != 0)
    {
      *s++ = (char) c;
    }

  return m;
#else
  char *s = (char *) m;
  unsigned int i;
  unsigned long buffer;
  unsigned long *aligned_addr;

  if (!TOO_SMALL (n) && !UNALIGNED (m))
    {
      /* If we get this far, we know that n is large and m is word-aligned. */

      aligned_addr = (unsigned long*)m;

      /* Store C into each char sized location in BUFFER so that
         we can set large blocks quickly.  */
      c &= 0xff;
      if (LBLOCKSIZE == 4)
        {
          buffer = (c << 8) | c;
          buffer |= (buffer << 16);
        }
      else
        {
          buffer = 0;
          for (i = 0; i < LBLOCKSIZE; i++)
            buffer = (buffer << 8) | c;
        }

      while (n >= LBLOCKSIZE*4)
        {
          *aligned_addr++ = buffer;
          *aligned_addr++ = buffer;
          *aligned_addr++ = buffer;
          *aligned_addr++ = buffer;
          n -= 4*LBLOCKSIZE;
        }

      while (n >= LBLOCKSIZE)
        {
          *aligned_addr++ = buffer;
          n -= LBLOCKSIZE;
        }
      /* Pick up the remainder with a bytewise loop.  */
      s = (char*)aligned_addr;
    }

  while (n--)
    {
      *s++ = (char)c;
    }

  return m;
#endif /* not PREFER_SIZE_OVER_SPEED */
}
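Both replacement routines keep the ANSI return-value contract (their headers note that r0 carries the destination or start address back), so they drop in for the deleted C versions without touching callers. When swapping implementations like this, a harness that sweeps source/destination alignments and lengths — including the sub-12-byte case and the leading/trailing byte paths — catches most off-by-one mistakes. A host-side sketch along those lines follows; it exercises the C library's memcpy as a stand-in for the routine under test, and all names in it are hypothetical:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Byte-wise reference copy: trivially correct, used as the oracle. */
static void ref_copy(unsigned char *d, const unsigned char *s, size_t n)
{
    while (n--)
        *d++ = *s++;
}

typedef void *(*copy_fn)(void *, const void *, size_t);

/* Compare a memcpy-like routine against the reference for every small
 * source/destination alignment and a range of lengths.  Guard bytes
 * around the destination verify that nothing outside the region is
 * written.  On target, point this at the new assembly routine. */
static int check_copy(copy_fn fn)
{
    unsigned char src[64], want[64], got[64];
    int failures = 0;

    for (size_t i = 0; i < sizeof src; i++)
        src[i] = (unsigned char)(i * 37 + 1);

    for (size_t sa = 0; sa < 4; sa++)          /* source alignment */
        for (size_t da = 0; da < 4; da++)      /* destination alignment */
            for (size_t len = 0; len <= 32; len++) {
                memset(want, 0xEE, sizeof want);
                memset(got, 0xEE, sizeof got);
                ref_copy(want + da, src + sa, len);

                void *ret = fn(got + da, src + sa, len);

                if (ret != got + da || memcmp(want, got, sizeof got) != 0) {
                    printf("FAIL: sa=%zu da=%zu len=%zu\n", sa, da, len);
                    failures++;
                }
            }
    return failures;
}

int main(void)
{
    int failures = check_copy(memcpy);
    printf("%s (%d failures)\n", failures ? "FAILED" : "OK", failures);
    return failures ? EXIT_FAILURE : EXIT_SUCCESS;
}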