summaryrefslogtreecommitdiff
path: root/firmware/bitswap.S
diff options
context:
space:
mode:
authorJörg Hohensohn <hohensoh@rockbox.org>2004-03-18 22:44:05 +0000
committerJörg Hohensohn <hohensoh@rockbox.org>2004-03-18 22:44:05 +0000
commite64256d499f2fa45a724269cb1a8043a7434ba8d (patch)
tree3215fafc37e3dec517d498cd8b6fd0aa5f118713 /firmware/bitswap.S
parentb61cf76aba768513ab2a1fa9e7cc80f59dbce02f (diff)
downloadrockbox-e64256d499f2fa45a724269cb1a8043a7434ba8d.tar.gz
rockbox-e64256d499f2fa45a724269cb1a8043a7434ba8d.zip
patch #919088: 17% faster bitswap(), by Jens "SH" Arnold
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@4407 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'firmware/bitswap.S')
-rw-r--r--firmware/bitswap.S160
1 files changed, 89 insertions, 71 deletions
diff --git a/firmware/bitswap.S b/firmware/bitswap.S
index 990ecb4d00..25d7a99ad2 100644
--- a/firmware/bitswap.S
+++ b/firmware/bitswap.S
@@ -7,7 +7,7 @@
7 * \/ \/ \/ \/ \/ 7 * \/ \/ \/ \/ \/
8 * $Id$ 8 * $Id$
9 * 9 *
10 * Copyright (C) 2002 by Magnus Holmgren 10 * Copyright (C) 2004 by Jens Arnold
11 * 11 *
12 * All files in this archive are subject to the GNU General Public License. 12 * All files in this archive are subject to the GNU General Public License.
13 * See the file COPYING in the source tree root for full license agreement. 13 * See the file COPYING in the source tree root for full license agreement.
@@ -22,67 +22,87 @@
22 .global _bitswap 22 .global _bitswap
23 .type _bitswap,@function 23 .type _bitswap,@function
24 24
25/* Registers used: 25/* Flips the bits of all bytes in a memory area (required for mp3 data on
26 * the Archos). This version is optimized for speed and size.
26 * 27 *
27 * r0 Temporary (required by some instructions) 28 * arguments:
28 * r1 Low byte 29 * r4 - start address
29 * r2 High byte / final result 30 * r5 - length
30 * r4 &Data 31 *
31 * r5 Length 32 * return value: void
32 * r7 Flip table 33 *
33 */ 34 * register usage:
34 35 * r0 - temporary
35/* The instruction order below is a bit strange, because: 36 * r1 - first word (as loaded) / low byte (after swap)
36 * 1) Keeping load/stores on longword boundaries means the instruction fetch 37 * r2 - high byte (after swap) / combined result
37 * won't compete with the memory access (because instructions are fetched 38 * r4 - data address - 4
38 * in pairs). 39 * r5 - end address - 4
39 * 2) Using the result of a fetch in the next instruction causes a stall 40 * r7 - flip table (addressing with signed offset)
40 * (except in certain circumstances). 41 *
41 * See the SH-1 programming manual for details. 42 * The instruction order below is devised in a way to utilize the pipelining
43 * of the SH1 to the max.
42 */ 44 */
43 45
44_bitswap: 46_bitswap:
45 mov.l .fliptable,r7 47 mova _fliptable,r0
46 add #-2,r4 /* ptr is used shifted by 2 */ 48 mov r0,r7
47 add r4,r5 /* r5 = end_address - 2 */ 49 add #-4,r4 /* address is shifted by 4 */
48 add #-1,r5 /* r5 = &last_byte - 2 */ 50 add r4,r5 /* r5 = end_address - 4 */
51 cmp/hi r4,r5 /* at least something to do? */
52 bf .exit /* no, get out of here! */
53
54 add #-3,r5 /* end offset for flipping 4 bytes per pass */
49 mov r4,r0 55 mov r4,r0
50 tst #1,r0 /* even address? */ 56 tst #1,r0 /* even address? */
51 bt .init /* yes */ 57 bt .start2_w /* yes, jump into main loop */
52 58
53 add #1,r4 /* r4 now even */ 59 /* no, flip first byte */
54 mov.b @(1,r4),r0 /* no, swap first byte */ 60 mov.b @(4,r4),r0 /* load byte, sign extension! */
55 extu.b r0,r0 61 add #1,r4 /* early increment */
56 mov.b @(r0,r7),r0 62 mov.b @(r0,r7),r0 /* fliptable offset is signed */
57 mov.b r0,@(1,r4) 63 bra .start2_w /* jump into main loop */
64 mov.b r0,@(3,r4) /* store byte */
58 65
59.init: 66 /* main loop: flips 2 words per pass */
60 cmp/hi r4,r5 /* at least 2 bytes to swap? */ 67 .align 2
61 bf .last /* no, skip main loop */ 68.loop2_w:
62 69 mov.w @(6,r4),r0 /* load second word */
63.loop: 70 add #4,r4 /* early increment */
64 mov.w @(2,r4),r0 /* data to flip */ 71 swap.b r0,r2 /* get high byte (2nd word) */
65 add #2,r4 /* early increment */ 72 exts.b r0,r0 /* prepare low byte (2nd word) */
66 swap.b r0,r2 /* get high byte */ 73 mov.b @(r0,r7),r1 /* swap low byte (2nd word) */
67 extu.b r0,r0 /* prepare low byte */ 74 exts.b r2,r0 /* prepare high byte (2nd word) */
68 mov.b @(r0,r7),r1 /* swap low byte */ 75 mov.b @(r0,r7),r2 /* swap high byte (2nd word) */
69 extu.b r2,r0 /* prepare high byte */ 76 extu.b r1,r0 /* zero extend low byte (2nd word) */
70 mov.b @(r0,r7),r2 /* swap high byte */ 77 mov.w @r4,r1 /* load first word */
71 extu.b r1,r1 /* zero extend low byte */ 78 shll8 r2 /* shift high byte (2nd word), low byte zeroed */
72 shll8 r2 /* shift high byte, low byte zeroed */ 79 or r2,r0 /* put low byte (2nd word) in result */
73 or r1,r2 /* put low byte in result */ 80 swap.b r1,r2 /* get high byte (1st word) */
74 mov.w r2,@r4 /* store result, ptr already incr'd */ 81 mov.w r0,@(2,r4) /* store result (2nd word) */
75 cmp/hi r4,r5 /* while &last_byte > data */ 82 exts.b r1,r0 /* prepare low byte (1st word) */
76 bt .loop 83 mov.b @(r0,r7),r1 /* swap low byte (1st word) */
84 exts.b r2,r0 /* prepare high byte (1st word) */
85 mov.b @(r0,r7),r2 /* swap high byte (1st word) */
86 extu.b r1,r0 /* zero extend low byte (1st word) */
87 shll8 r2 /* shift high byte (1st word), low byte zeroed */
88 or r2,r0 /* put low byte (1st word) in result */
89 mov.w r0,@r4 /* store result (1st word) */
90.start2_w:
91 cmp/hi r4,r5 /* runs r4 up to last long bound */
92 bt .loop2_w
77 93
78.last: 94 bra .start_b2 /* jump into trailing byte loop */
79 cmp/eq r4,r5 /* if behind (&last_byte - 2), exit */ 95 add #3,r5 /* reset end offset */
80 bf .exit
81 96
82 mov.b @(2,r4),r0 /* swap last byte */ 97 /* trailing byte loop: flips 0..3 bytes */
83 extu.b r0,r0 98.loop_b2:
84 mov.b @(r0,r7),r0 99 mov.b @(4,r4),r0 /* load byte, sign extension! */
85 mov.b r0,@(2,r4) 100 add #1,r4 /* early increment */
101 mov.b @(r0,r7),r0 /* fliptable offset is signed */
102 mov.b r0,@(3,r4) /* store byte */
103.start_b2:
104 cmp/hi r4,r5 /* runs r4 up to end address */
105 bt .loop_b2
86 106
87.exit: 107.exit:
88 rts 108 rts
@@ -90,9 +110,22 @@ _bitswap:
90 110
91 .align 2 111 .align 2
92 112
93.fliptable: 113 .byte 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1
94 .long _fliptable 114 .byte 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1
95 115 .byte 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9
116 .byte 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9
117 .byte 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5
118 .byte 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5
119 .byte 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed
120 .byte 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd
121 .byte 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3
122 .byte 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3
123 .byte 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb
124 .byte 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb
125 .byte 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7
126 .byte 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7
127 .byte 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef
128 .byte 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff
96_fliptable: 129_fliptable:
97 .byte 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0 130 .byte 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0
98 .byte 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0 131 .byte 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0
@@ -110,22 +143,7 @@ _fliptable:
110 .byte 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6 143 .byte 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6
111 .byte 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee 144 .byte 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee
112 .byte 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe 145 .byte 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe
113 .byte 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1 146
114 .byte 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1
115 .byte 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9
116 .byte 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9
117 .byte 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5
118 .byte 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5
119 .byte 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed
120 .byte 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd
121 .byte 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3
122 .byte 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3
123 .byte 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb
124 .byte 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb
125 .byte 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7
126 .byte 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7
127 .byte 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef
128 .byte 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff
129 147
130.end: 148.end:
131 .size _bitswap,.end-_bitswap 149 .size _bitswap,.end-_bitswap