summary refs log tree commit diff
diff options
context:
space:
mode:
author Jörg Hohensohn <hohensoh@rockbox.org> 2004-03-03 07:18:26 +0000
committer Jörg Hohensohn <hohensoh@rockbox.org> 2004-03-03 07:18:26 +0000
commit 239a91c28cce4a120af21f7ea598217f54e17d0c (patch)
tree e89ef8a4beb9b6c7c214b9ee004fc98af3432f62
parent 860586d992a1a434b3d40e594a755e5fb450f394 (diff)
download rockbox-239a91c28cce4a120af21f7ea598217f54e17d0c.tar.gz
rockbox-239a91c28cce4a120af21f7ea598217f54e17d0c.zip
14% faster bitswap, thanks Jens
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@4337 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r-- firmware/bitswap.S 81
1 files changed, 41 insertions, 40 deletions
diff --git a/firmware/bitswap.S b/firmware/bitswap.S
index da628a3b7f..990ecb4d00 100644
--- a/firmware/bitswap.S
+++ b/firmware/bitswap.S
@@ -18,7 +18,7 @@
18 ****************************************************************************/ 18 ****************************************************************************/
19 19
20 .section .icode,"ax",@progbits 20 .section .icode,"ax",@progbits
21 .align 4 21 .align 2
22 .global _bitswap 22 .global _bitswap
23 .type _bitswap,@function 23 .type _bitswap,@function
24 24
@@ -26,68 +26,69 @@
26 * 26 *
27 * r0 Temporary (required by some instructions) 27 * r0 Temporary (required by some instructions)
28 * r1 Low byte 28 * r1 Low byte
29 * r2 High byte 29 * r2 High byte / final result
30 * r3 Result after flip 30 * r4 &Data
31 * r4 Data
32 * r5 Length 31 * r5 Length
33 * r6 1
34 * r7 Flip table 32 * r7 Flip table
35 */ 33 */
36 34
35/* The instruction order below is a bit strange, because:
36 * 1) Keeping load/stores on longword boundaries means the instruction fetch
37 * won't compete with the memory access (because instructions are fetched
38 * in pairs).
39 * 2) Using the result of a fetch in the next instruction causes a stall
40 * (except in certain circumstances).
41 * See the SH-1 programming manual for details.
42 */
43
37_bitswap: 44_bitswap:
38 mov.l .fliptable,r7 45 mov.l .fliptable,r7
39 mov #1,r6 46 add #-2,r4 /* ptr is used shifted by 2 */
47 add r4,r5 /* r5 = end_address - 2 */
48 add #-1,r5 /* r5 = &last_byte - 2 */
40 mov r4,r0 49 mov r4,r0
41 tst #1,r0 /* odd address? */ 50 tst #1,r0 /* even address? */
42 bt .init /* no, address is even */ 51 bt .init /* yes */
43 52
44 mov.b @r4,r0 /* swap first byte */ 53 add #1,r4 /* r4 now even */
54 mov.b @(1,r4),r0 /* no, swap first byte */
45 extu.b r0,r0 55 extu.b r0,r0
46 mov.b @(r0,r7),r0 56 mov.b @(r0,r7),r0
47 mov.b r0,@r4 57 mov.b r0,@(1,r4)
48 add #1,r4
49 add #-1,r5
50 bra .init
51 58
52 /* The instruction order below is a bit strange, because: 59.init:
53 * 1) Keeping load/stores on longword boundaries means the instruction 60 cmp/hi r4,r5 /* at least 2 bytes to swap? */
54 * fetch won't compete with the memory access (because instructions 61 bf .last /* no, skip main loop */
55 * are fetched in pairs).
56 * 2) Using the result of a fetch in the next instruction causes a
57 * stall (except in certain circumstances).
58 * See the SH-1 programming manual for details.
59 */
60 62
61.loop: 63.loop:
62 mov.w @r4,r1 /* data to flip */ 64 mov.w @(2,r4),r0 /* data to flip */
63 add #-2,r5 65 add #2,r4 /* early increment */
64 swap.b r1,r2 /* get high byte */ 66 swap.b r0,r2 /* get high byte */
67 extu.b r0,r0 /* prepare low byte */
68 mov.b @(r0,r7),r1 /* swap low byte */
65 extu.b r2,r0 /* prepare high byte */ 69 extu.b r2,r0 /* prepare high byte */
66 mov.b @(r0,r7),r2 /* swap high byte */ 70 mov.b @(r0,r7),r2 /* swap high byte */
67 extu.b r1,r0 /* prepare low byte */ 71 extu.b r1,r1 /* zero extend low byte */
68 mov.b @(r0,r7),r1 /* swap low byte */ 72 shll8 r2 /* shift high byte, low byte zeroed */
69 extu.b r2,r2 /* zero extend high byte */ 73 or r1,r2 /* put low byte in result */
70 swap.b r2,r3 /* put high byte in result */ 74 mov.w r2,@r4 /* store result, ptr already incr'd */
71 extu.b r1,r0 /* zero extend low byte */ 75 cmp/hi r4,r5 /* while &last_byte > data */
72 or r0,r3 /* put low byte in result */ 76 bt .loop
73 mov.w r3,@r4 /* store result */
74 add #2,r4
75.init:
76 cmp/gt r6,r5 /* while [bytes remaining] > 1 */
77 bt .loop /* (at least 2 bytes left) */
78 77
79 cmp/eq r6,r5 78.last:
80 bf .exit /* if not 1 byte left, exit */ 79 cmp/eq r4,r5 /* if behind (&last_byte - 2), exit */
80 bf .exit
81 81
82 mov.b @r4,r0 /* swap last byte */ 82 mov.b @(2,r4),r0 /* swap last byte */
83 extu.b r0,r0 83 extu.b r0,r0
84 mov.b @(r0,r7),r0 84 mov.b @(r0,r7),r0
85 mov.b r0,@r4 85 mov.b r0,@(2,r4)
86
86.exit: 87.exit:
87 rts 88 rts
88 nop 89 nop
89 90
90 .align 4 91 .align 2
91 92
92.fliptable: 93.fliptable:
93 .long _fliptable 94 .long _fliptable