From 9c52b24b008b522b665c0fd9c57a7d82395afcec Mon Sep 17 00:00:00 2001 From: Jörg Hohensohn Date: Thu, 1 Apr 2004 05:46:31 +0000 Subject: patch #922836 by Jens: way faster disk writing, in assembler. The code is in, but yet disabled by #define PREFER_C_WRITING unless it's proven safe for all disks. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@4460 a1c6a512-1295-4272-9138-f99709370657 --- firmware/drivers/ata.c | 139 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 126 insertions(+), 13 deletions(-) (limited to 'firmware/drivers') diff --git a/firmware/drivers/ata.c b/firmware/drivers/ata.c index d12dfbec31..b991387074 100644 --- a/firmware/drivers/ata.c +++ b/firmware/drivers/ata.c @@ -32,6 +32,8 @@ /* use plain C code in copy_read_sectors(), instead of tweaked assembler */ #define PREFER_C /* mystery: assembler caused problems with some disks */ +/* use plain C code in copy_write_sectors(), instead of tweaked assembler */ +#define PREFER_C_WRITING /* we don't know yet about this one */ #define SECTOR_SIZE 512 #define ATA_DATA (*((volatile unsigned short*)0x06104100)) @@ -208,11 +210,9 @@ static void copy_read_sectors(unsigned char* buf, int wordcount) "tst #1,r0 \n" /* 16-bit aligned ? */ "bt .aligned \n" /* yes, do word copy */ - ".align 2 \n" /* not 16-bit aligned */ "mov #-1,r3 \n" /* prepare a bit mask for high byte */ - "extu.b r3,r3 \n" - "swap.b r3,r3 \n" /* r3 = 0x0000FF00 */ + "shll8 r3 \n" /* r3 = 0xFFFFFF00 */ "mov.w @%2,r2 \n" /* read first word (1st round) */ "add #-12,%1 \n" /* adjust end address for offsets */ @@ -220,6 +220,7 @@ static void copy_read_sectors(unsigned char* buf, int wordcount) "bra .start4_b \n" /* jump into loop after next instr. */ "add #-5,%0 \n" /* adjust for dest. 
offsets; now even */ + ".align 2 \n" ".loop4_b: \n" /* main loop: copy 4 words in a row */ "mov.w @%2,r2 \n" /* read first word (2+ round) */ "and r3,r1 \n" /* get high byte of fourth word (2+ round) */ @@ -250,10 +251,8 @@ static void copy_read_sectors(unsigned char* buf, int wordcount) /* avg. 6.5 cycles per word - 100% faster */ "swap.b r1,r0 \n" /* get high byte of last word */ - "mov.b r0,@(4,%0) \n" /* and store it */ - "bra .exit \n" - "nop \n" + "mov.b r0,@(4,%0) \n" /* and store it */ ".align 2 \n" /* 16-bit aligned, loop(read and store word) */ @@ -287,7 +286,7 @@ static void copy_read_sectors(unsigned char* buf, int wordcount) "swap.b r1,r0 \n" /* swap fourth word (last round) */ "mov.w r0,@(4,%0) \n" /* and store it */ - ".exit: \n" + ".exit: \n" : /* outputs */ : /* inputs */ /* %0 */ "r"(buf), @@ -447,6 +446,124 @@ int ata_read_sectors(unsigned long start, return ret; } +/* the tight loop of ata_write_sectors(), to avoid the whole in IRAM */ +static void copy_write_sectors(unsigned char* buf, + int wordcount) + __attribute__ ((section (".icode"))); + +static void copy_write_sectors(unsigned char* buf, int wordcount) +{ +#ifdef PREFER_C_WRITING + + if ( (unsigned int)buf & 1) + { /* not 16-bit aligned, copy byte by byte */ + unsigned short tmp = 0; + unsigned char* bufend = buf + wordcount*2; + do + { /* loop compiles to 8 assembler instructions */ + /* takes 12 clock cycles because of 2 pipeline stalls */ + tmp = (unsigned short) *buf++; + tmp |= (unsigned short) *buf++ << 8; /* I assume big endian */ + ATA_DATA = tmp; /* and don't use the SWAB16 macro */ + } while (buf < bufend); /* tail loop is faster */ + } + else + { /* 16-bit aligned, can do faster copy */ + unsigned short* wbuf = (unsigned short*)buf; + unsigned short* wbufend = wbuf + wordcount; + do + { /* loop compiles to 5 assembler instructions */ + /* takes 9 clock cycles because of 2 pipeline stalls */ + ATA_DATA = SWAB16(*wbuf); + } while (++wbuf < wbufend); /* tail loop is faster */ + } 
+#else + /* optimized assembler version */ + /* this assumes wordcount to be a multiple of 2: both loops below write two words per round */ + +/* writing is not unrolled as much as reading, for several reasons: + * - a similar instruction sequence is faster for writing than for reading + * because the auto-incrementing load instructions can be used + * - writing profits from warp mode + * Both of these add up to have writing faster than the more unrolled reading. + */ + asm ( + "add %1,%1 \n" /* wordcount -> bytecount; NOTE(review): %1 (here) and %0 (post-increment loads below) are overwritten although the operand list declares them as inputs; confirm the compiler in use tolerates this */ + "add %0,%1 \n" /* bytecount -> bufend */ + "mov %0,r0 \n" /* copy buf address to r0 for the alignment test */ + "tst #1,r0 \n" /* 16-bit aligned ? */ + "bt .w_aligned \n" /* yes, do word copy */ + + /* not 16-bit aligned */ + "mov #-1,r6 \n" /* prepare a bit mask for high byte */ + "shll8 r6 \n" /* r6 = 0xFFFFFF00 */ + + "mov.b @%0+,r2 \n" /* load first byte; in the first round it stands in for the low byte of the old second word */ + "add #-4,%1 \n" /* adjust end address for early check */ + "mov.w @%0+,r3 \n" /* load (initial) first word */ + "bra .w_start2_b \n" /* jump into loop after next instr. */ + "extu.b r2,r0 \n" /* zero-extend the first byte into r0 */ + + ".align 2 \n" + ".w_loop2_b: \n" /* main loop: copy 2 words in a row */ + "mov.w @%0+,r3 \n" /* load first word (2+ round) */ + "extu.b r2,r0 \n" /* put away low byte of second word (2+ round) */ + "and r6,r2 \n" /* get high byte of second word (2+ round) */ + "or r1,r2 \n" /* combine with low byte of old first word */ + "mov.w r2,@%2 \n" /* write that */ + ".w_start2_b: \n" + "cmp/hi %0,%1 \n" /* check for end */ + "mov.w @%0+,r2 \n" /* load second word */ + "extu.b r3,r1 \n" /* put away low byte of first word */ + "and r6,r3 \n" /* get high byte of first word */ + "or r0,r3 \n" /* combine with low byte of old second word (r0 was set by extu.b r2,r0) */ + "mov.w r3,@%2 \n" /* write that */ + "bt .w_loop2_b \n" /* loop while the cmp/hi above found bufend still ahead */ + /* 12 instructions for 2 copies, takes 14 clock cycles */ + /* avg. 
7 cycles per word - 71% faster */ + + /* the loop "overreads" 1 byte past the buffer end, however, the last */ + /* byte is not written to disk */ + "and r6,r2 \n" /* get high byte of last word */ + "or r1,r2 \n" /* combine with low byte of old first word */ + "bra .w_exit \n" + "mov.w r2,@%2 \n" /* write last word */ + + /* 16-bit aligned, loop(load and write word) */ + ".w_aligned: \n" + "mov.w @%0+,r2 \n" /* load first word (1st round) */ + "bra .w_start2_w \n" /* jump into loop after next instr. */ + "add #-4,%1 \n" /* adjust end address for early check */ + + ".align 2 \n" + ".w_loop2_w: \n" /* main loop: copy 2 words in a row */ + "mov.w @%0+,r2 \n" /* load first word (2+ round) */ + "swap.b r1,r0 \n" /* swap second word (2+ round) */ + "mov.w r0,@%2 \n" /* write second word (2+ round) */ + ".w_start2_w: \n" + "cmp/hi %0,%1 \n" /* check for end */ + "mov.w @%0+,r1 \n" /* load second word */ + "swap.b r2,r0 \n" /* swap first word */ + "mov.w r0,@%2 \n" /* write first word */ + "bt .w_loop2_w \n" + /* 8 instructions for 2 copies, takes 10 clock cycles */ + /* avg. 5 cycles per word - 80% faster */ + + "swap.b r1,r0 \n" /* swap second word (last round) */ + "mov.w r0,@%2 \n" /* and write it */ + + ".w_exit: \n" + : /* outputs */ + : /* inputs */ + /* %0 */ "r"(buf), + /* %1 */ "r"(wordcount), + /* %2 */ "r"(&ATA_DATA) + : /*trashed */ + "r0","r1","r2","r3","r6" + ); +#endif +} + int ata_write_sectors(unsigned long start, int count, void* buf) @@ -502,7 +619,7 @@ int ata_write_sectors(unsigned long start, ATA_COMMAND = CMD_WRITE_SECTORS; for (i=0; i