diff options
author | Jörg Hohensohn <hohensoh@rockbox.org> | 2004-03-18 22:44:05 +0000 |
---|---|---|
committer | Jörg Hohensohn <hohensoh@rockbox.org> | 2004-03-18 22:44:05 +0000 |
commit | e64256d499f2fa45a724269cb1a8043a7434ba8d (patch) | |
tree | 3215fafc37e3dec517d498cd8b6fd0aa5f118713 | |
parent | b61cf76aba768513ab2a1fa9e7cc80f59dbce02f (diff) | |
download | rockbox-e64256d499f2fa45a724269cb1a8043a7434ba8d.tar.gz rockbox-e64256d499f2fa45a724269cb1a8043a7434ba8d.zip |
patch #919088: 17% faster bitswap(), by Jens "SH" Arnold
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@4407 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r-- | firmware/bitswap.S | 160 |
1 files changed, 89 insertions, 71 deletions
diff --git a/firmware/bitswap.S b/firmware/bitswap.S index 990ecb4d00..25d7a99ad2 100644 --- a/firmware/bitswap.S +++ b/firmware/bitswap.S | |||
@@ -7,7 +7,7 @@ | |||
7 | * \/ \/ \/ \/ \/ | 7 | * \/ \/ \/ \/ \/ |
8 | * $Id$ | 8 | * $Id$ |
9 | * | 9 | * |
10 | * Copyright (C) 2002 by Magnus Holmgren | 10 | * Copyright (C) 2004 by Jens Arnold |
11 | * | 11 | * |
12 | * All files in this archive are subject to the GNU General Public License. | 12 | * All files in this archive are subject to the GNU General Public License. |
13 | * See the file COPYING in the source tree root for full license agreement. | 13 | * See the file COPYING in the source tree root for full license agreement. |
@@ -22,67 +22,87 @@ | |||
22 | .global _bitswap | 22 | .global _bitswap |
23 | .type _bitswap,@function | 23 | .type _bitswap,@function |
24 | 24 | ||
25 | /* Registers used: | 25 | /* Flips the bits of all bytes in a memory area (required for mp3 data on |
26 | * the Archos). This version is optimized for speed and size. | ||
26 | * | 27 | * |
27 | * r0 Temporary (required by some instructions) | 28 | * arguments: |
28 | * r1 Low byte | 29 | * r4 - start address |
29 | * r2 High byte / final result | 30 | * r5 - length |
30 | * r4 &Data | 31 | * |
31 | * r5 Length | 32 | * return value: void |
32 | * r7 Flip table | 33 | * |
33 | */ | 34 | * register usage: |
34 | 35 | * r0 - temporary | |
35 | /* The instruction order below is a bit strange, because: | 36 | * r1 - bit mask for rounding to long bound / low byte (after swap) |
36 | * 1) Keeping load/stores on longword boundaries means the instruction fetch | 37 | * r2 - high byte (after swap) / combined result |
37 | * won't compete with the memory access (because instructions are fetched | 38 | * r4 - data address - 4 |
38 | * in pairs). | 39 | * r5 - end address - 4 |
39 | * 2) Using the result of a fetch in the next instruction causes a stall | 40 | * r7 - flip table (addressing with signed offset) |
40 | * (except in certain circumstances). | 41 | * |
41 | * See the SH-1 programming manual for details. | 42 | * The instruction order below is devised in a way to utilize the pipelining |
43 | * of the SH1 to the max. | ||
42 | */ | 44 | */ |
43 | 45 | ||
44 | _bitswap: | 46 | _bitswap: |
45 | mov.l .fliptable,r7 | 47 | mova _fliptable,r0 |
46 | add #-2,r4 /* ptr is used shifted by 2 */ | 48 | mov r0,r7 |
47 | add r4,r5 /* r5 = end_address - 2 */ | 49 | add #-4,r4 /* address is shifted by 4 */ |
48 | add #-1,r5 /* r5 = &last_byte - 2 */ | 50 | add r4,r5 /* r5 = end_address - 4 */ |
51 | cmp/hi r4,r5 /* at least something to do? */ | ||
52 | bf .exit /* no, get out of here! */ | ||
53 | |||
54 | add #-3,r5 /* end offset for flipping 4 bytes per pass */ | ||
49 | mov r4,r0 | 55 | mov r4,r0 |
50 | tst #1,r0 /* even address? */ | 56 | tst #1,r0 /* even address? */ |
51 | bt .init /* yes */ | 57 | bt .start2_w /* yes, jump into main loop */ |
52 | 58 | ||
53 | add #1,r4 /* r4 now even */ | 59 | /* no, flip first byte */ |
54 | mov.b @(1,r4),r0 /* no, swap first byte */ | 60 | mov.b @(4,r4),r0 /* load byte, sign extension! */ |
55 | extu.b r0,r0 | 61 | add #1,r4 /* early increment */ |
56 | mov.b @(r0,r7),r0 | 62 | mov.b @(r0,r7),r0 /* fliptable offset is signed */ |
57 | mov.b r0,@(1,r4) | 63 | bra .start2_w /* jump into main loop */ |
64 | mov.b r0,@(3,r4) /* store byte */ | ||
58 | 65 | ||
59 | .init: | 66 | /* main loop: flips 2 words per pass */ |
60 | cmp/hi r4,r5 /* at least 2 bytes to swap? */ | 67 | .align 2 |
61 | bf .last /* no, skip main loop */ | 68 | .loop2_w: |
62 | 69 | mov.w @(6,r4),r0 /* load second word */ | |
63 | .loop: | 70 | add #4,r4 /* early increment */ |
64 | mov.w @(2,r4),r0 /* data to flip */ | 71 | swap.b r0,r2 /* get high byte (2nd word) */ |
65 | add #2,r4 /* early increment */ | 72 | exts.b r0,r0 /* prepare low byte (2nd word) */ |
66 | swap.b r0,r2 /* get high byte */ | 73 | mov.b @(r0,r7),r1 /* swap low byte (2nd word) */ |
67 | extu.b r0,r0 /* prepare low byte */ | 74 | exts.b r2,r0 /* prepare high byte (2nd word) */ |
68 | mov.b @(r0,r7),r1 /* swap low byte */ | 75 | mov.b @(r0,r7),r2 /* swap high byte (2nd word) */ |
69 | extu.b r2,r0 /* prepare high byte */ | 76 | extu.b r1,r0 /* zero extend low byte (2nd word) */ |
70 | mov.b @(r0,r7),r2 /* swap high byte */ | 77 | mov.w @r4,r1 /* load first word */ |
71 | extu.b r1,r1 /* zero extend low byte */ | 78 | shll8 r2 /* shift high byte (2nd word), low byte zeroed */ |
72 | shll8 r2 /* shift high byte, low byte zeroed */ | 79 | or r2,r0 /* put low byte (2nd word) in result */ |
73 | or r1,r2 /* put low byte in result */ | 80 | swap.b r1,r2 /* get high byte (1st word) */ |
74 | mov.w r2,@r4 /* store result, ptr already incr'd */ | 81 | mov.w r0,@(2,r4) /* store result (2nd word) */ |
75 | cmp/hi r4,r5 /* while &last_byte > data */ | 82 | exts.b r1,r0 /* prepare low byte (1st word) */ |
76 | bt .loop | 83 | mov.b @(r0,r7),r1 /* swap low byte (1st word) */ |
84 | exts.b r2,r0 /* prepare high byte (1st word) */ | ||
85 | mov.b @(r0,r7),r2 /* swap high byte (1st word) */ | ||
86 | extu.b r1,r0 /* zero extend low byte (1st word) */ | ||
87 | shll8 r2 /* shift high byte (1st word), low byte zeroed */ | ||
88 | or r2,r0 /* put low byte (1st word) in result */ | ||
89 | mov.w r0,@r4 /* store result (1st word) */ | ||
90 | .start2_w: | ||
91 | cmp/hi r4,r5 /* runs r4 up to last long bound */ | ||
92 | bt .loop2_w | ||
77 | 93 | ||
78 | .last: | 94 | bra .start_b2 /* jump into trailing byte loop */ |
79 | cmp/eq r4,r5 /* if behind (&last_byte - 2), exit */ | 95 | add #3,r5 /* reset end offset */ |
80 | bf .exit | ||
81 | 96 | ||
82 | mov.b @(2,r4),r0 /* swap last byte */ | 97 | /* trailing byte loop: flips 0..3 bytes */ |
83 | extu.b r0,r0 | 98 | .loop_b2: |
82 | mov.b @(2,r4),r0 /* swap last byte */ | 99 | mov.b @(4,r4),r0 /* load byte, sign extension! */ |
85 | mov.b r0,@(2,r4) | 100 | add #1,r4 /* early increment */ |
101 | mov.b @(r0,r7),r0 /* fliptable offset is signed */ | ||
102 | mov.b r0,@(3,r4) /* store byte */ | ||
103 | .start_b2: | ||
104 | cmp/hi r4,r5 /* runs r4 up to end address */ | ||
105 | bt .loop_b2 | ||
86 | 106 | ||
87 | .exit: | 107 | .exit: |
88 | rts | 108 | rts |
@@ -90,9 +110,22 @@ _bitswap: | |||
90 | 110 | ||
91 | .align 2 | 111 | .align 2 |
92 | 112 | ||
93 | .fliptable: | 113 | .byte 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1 |
94 | .long _fliptable | 114 | .byte 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1 |
95 | 115 | .byte 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9 | |
116 | .byte 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9 | ||
117 | .byte 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5 | ||
118 | .byte 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5 | ||
119 | .byte 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed | ||
120 | .byte 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd | ||
121 | .byte 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3 | ||
122 | .byte 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3 | ||
123 | .byte 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb | ||
124 | .byte 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb | ||
125 | .byte 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7 | ||
126 | .byte 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7 | ||
127 | .byte 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef | ||
128 | .byte 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff | ||
96 | _fliptable: | 129 | _fliptable: |
97 | .byte 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0 | 130 | .byte 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0 |
98 | .byte 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0 | 131 | .byte 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0 |
@@ -110,22 +143,7 @@ _fliptable: | |||
110 | .byte 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6 | 143 | .byte 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6 |
111 | .byte 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee | 144 | .byte 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee |
112 | .byte 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe | 145 | .byte 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe |
113 | .byte 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1 | 146 | |
114 | .byte 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1 | ||
115 | .byte 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9 | ||
116 | .byte 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9 | ||
117 | .byte 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5 | ||
118 | .byte 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5 | ||
119 | .byte 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed | ||
120 | .byte 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd | ||
121 | .byte 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3 | ||
122 | .byte 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3 | ||
123 | .byte 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb | ||
124 | .byte 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb | ||
125 | .byte 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7 | ||
126 | .byte 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7 | ||
127 | .byte 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef | ||
128 | .byte 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff | ||
129 | 147 | ||
130 | .end: | 148 | .end: |
131 | .size _bitswap,.end-_bitswap | 149 | .size _bitswap,.end-_bitswap |