diff options
author | Jörg Hohensohn <hohensoh@rockbox.org> | 2004-04-01 05:46:31 +0000 |
---|---|---|
committer | Jörg Hohensohn <hohensoh@rockbox.org> | 2004-04-01 05:46:31 +0000 |
commit | 9c52b24b008b522b665c0fd9c57a7d82395afcec (patch) | |
tree | d28fa04acdfef57ebbbd27fabe761e0d09e9a987 /firmware/drivers | |
parent | cbd992b440d9eb79303c0b2c463306a96f27418c (diff) | |
download | rockbox-9c52b24b008b522b665c0fd9c57a7d82395afcec.tar.gz rockbox-9c52b24b008b522b665c0fd9c57a7d82395afcec.zip |
patch #922836 by Jens: much faster disk writing, in assembler. The code is in, but still disabled by #define PREFER_C_WRITING until it's proven safe for all disks.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@4460 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'firmware/drivers')
-rw-r--r-- | firmware/drivers/ata.c | 139 |
1 files changed, 126 insertions, 13 deletions
diff --git a/firmware/drivers/ata.c b/firmware/drivers/ata.c index d12dfbec31..b991387074 100644 --- a/firmware/drivers/ata.c +++ b/firmware/drivers/ata.c | |||
@@ -32,6 +32,8 @@ | |||
32 | 32 | ||
33 | /* use plain C code in copy_read_sectors(), instead of tweaked assembler */ | 33 | /* use plain C code in copy_read_sectors(), instead of tweaked assembler */ |
34 | #define PREFER_C /* mystery: assembler caused problems with some disks */ | 34 | #define PREFER_C /* mystery: assembler caused problems with some disks */ |
35 | /* use plain C code in copy_write_sectors(), instead of tweaked assembler */ | ||
36 | #define PREFER_C_WRITING /* we don't know yet about this one */ | ||
35 | 37 | ||
36 | #define SECTOR_SIZE 512 | 38 | #define SECTOR_SIZE 512 |
37 | #define ATA_DATA (*((volatile unsigned short*)0x06104100)) | 39 | #define ATA_DATA (*((volatile unsigned short*)0x06104100)) |
@@ -208,11 +210,9 @@ static void copy_read_sectors(unsigned char* buf, int wordcount) | |||
208 | "tst #1,r0 \n" /* 16-bit aligned ? */ | 210 | "tst #1,r0 \n" /* 16-bit aligned ? */ |
209 | "bt .aligned \n" /* yes, do word copy */ | 211 | "bt .aligned \n" /* yes, do word copy */ |
210 | 212 | ||
211 | ".align 2 \n" | ||
212 | /* not 16-bit aligned */ | 213 | /* not 16-bit aligned */ |
213 | "mov #-1,r3 \n" /* prepare a bit mask for high byte */ | 214 | "mov #-1,r3 \n" /* prepare a bit mask for high byte */ |
214 | "extu.b r3,r3 \n" | 215 | "shll8 r3 \n" /* r3 = 0xFFFFFF00 */ |
215 | "swap.b r3,r3 \n" /* r3 = 0x0000FF00 */ | ||
216 | 216 | ||
217 | "mov.w @%2,r2 \n" /* read first word (1st round) */ | 217 | "mov.w @%2,r2 \n" /* read first word (1st round) */ |
218 | "add #-12,%1 \n" /* adjust end address for offsets */ | 218 | "add #-12,%1 \n" /* adjust end address for offsets */ |
@@ -220,6 +220,7 @@ static void copy_read_sectors(unsigned char* buf, int wordcount) | |||
220 | "bra .start4_b \n" /* jump into loop after next instr. */ | 220 | "bra .start4_b \n" /* jump into loop after next instr. */ |
221 | "add #-5,%0 \n" /* adjust for dest. offsets; now even */ | 221 | "add #-5,%0 \n" /* adjust for dest. offsets; now even */ |
222 | 222 | ||
223 | ".align 2 \n" | ||
223 | ".loop4_b: \n" /* main loop: copy 4 words in a row */ | 224 | ".loop4_b: \n" /* main loop: copy 4 words in a row */ |
224 | "mov.w @%2,r2 \n" /* read first word (2+ round) */ | 225 | "mov.w @%2,r2 \n" /* read first word (2+ round) */ |
225 | "and r3,r1 \n" /* get high byte of fourth word (2+ round) */ | 226 | "and r3,r1 \n" /* get high byte of fourth word (2+ round) */ |
@@ -250,10 +251,8 @@ static void copy_read_sectors(unsigned char* buf, int wordcount) | |||
250 | /* avg. 6.5 cycles per word - 100% faster */ | 251 | /* avg. 6.5 cycles per word - 100% faster */ |
251 | 252 | ||
252 | "swap.b r1,r0 \n" /* get high byte of last word */ | 253 | "swap.b r1,r0 \n" /* get high byte of last word */ |
253 | "mov.b r0,@(4,%0) \n" /* and store it */ | ||
254 | |||
255 | "bra .exit \n" | 254 | "bra .exit \n" |
256 | "nop \n" | 255 | "mov.b r0,@(4,%0) \n" /* and store it */ |
257 | 256 | ||
258 | ".align 2 \n" | 257 | ".align 2 \n" |
259 | /* 16-bit aligned, loop(read and store word) */ | 258 | /* 16-bit aligned, loop(read and store word) */ |
@@ -287,7 +286,7 @@ static void copy_read_sectors(unsigned char* buf, int wordcount) | |||
287 | "swap.b r1,r0 \n" /* swap fourth word (last round) */ | 286 | "swap.b r1,r0 \n" /* swap fourth word (last round) */ |
288 | "mov.w r0,@(4,%0) \n" /* and store it */ | 287 | "mov.w r0,@(4,%0) \n" /* and store it */ |
289 | 288 | ||
290 | ".exit: \n" | 289 | ".exit: \n" |
291 | : /* outputs */ | 290 | : /* outputs */ |
292 | : /* inputs */ | 291 | : /* inputs */ |
293 | /* %0 */ "r"(buf), | 292 | /* %0 */ "r"(buf), |
@@ -447,6 +446,124 @@ int ata_read_sectors(unsigned long start, | |||
447 | return ret; | 446 | return ret; |
448 | } | 447 | } |
449 | 448 | ||
449 | /* Tight inner loop of ata_write_sectors(): writes wordcount 16-bit words from buf to ATA_DATA. It is a separate function so that only this loop, not the whole of ata_write_sectors(), is placed in the .icode section (IRAM). buf may be odd-aligned. Both C loops are do/while, so at least one word is always written (wordcount must be > 0). */ | |
450 | static void copy_write_sectors(unsigned char* buf, | |
451 | int wordcount) | |
452 | __attribute__ ((section (".icode"))); | |
453 |
454 | static void copy_write_sectors(unsigned char* buf, int wordcount) | |
455 | { | |
456 | #ifdef PREFER_C_WRITING | |
457 |
458 | if ( (unsigned int)buf & 1) | |
459 | { /* not 16-bit aligned, assemble each word from two single bytes */ | |
460 | unsigned short tmp = 0; | |
461 | unsigned char* bufend = buf + wordcount*2; | |
462 | do | |
463 | { /* loop compiles to 8 assembler instructions */ | |
464 | /* takes 12 clock cycles because of 2 pipeline stalls */ | |
465 | tmp = (unsigned short) *buf++; /* byte at the lower address -> low byte */ | |
466 | tmp |= (unsigned short) *buf++ << 8; /* next byte -> high byte; I assume big endian */ | |
467 | ATA_DATA = tmp; /* and don't use the SWAB16 macro */ | |
468 | } while (buf < bufend); /* tail loop is faster; body runs at least once */ | |
469 | } | |
470 | else | |
471 | { /* 16-bit aligned, can do faster copy */ | |
472 | unsigned short* wbuf = (unsigned short*)buf; | |
473 | unsigned short* wbufend = wbuf + wordcount; | |
474 | do | |
475 | { /* loop compiles to 5 assembler instructions */ | |
476 | /* takes 9 clock cycles because of 2 pipeline stalls */ | |
477 | ATA_DATA = SWAB16(*wbuf); /* SWAB16 is defined elsewhere; presumably swaps the two bytes - confirm */ | |
478 | } while (++wbuf < wbufend); /* tail loop is faster; body runs at least once */ | |
479 | } | |
480 | #else | |
481 | /* optimized assembler version */ | |
482 | /* this assumes wordcount to be a multiple of 2 (each loop round writes 2 words) */ | |
483 |
484 | /* writing is not unrolled as much as reading, for several reasons: | |
485 | * - a similar instruction sequence is faster for writing than for reading | |
486 | * because the auto-incrementing load instructions can be used | |
487 | * - writing profits from warp mode | |
488 | * Both of these add up to have writing faster than the more unrolled reading. | |
489 | */ | |
490 | asm ( | |
491 | "add %1,%1 \n" /* wordcount -> bytecount */ | |
492 | "add %0,%1 \n" /* bytecount -> bufend */ | |
493 | "mov %0,r0 \n" /* copy buf to r0 for the alignment test */ | |
494 | "tst #1,r0 \n" /* 16-bit aligned ? */ | |
495 | "bt .w_aligned \n" /* yes, do word copy */ | |
496 |
497 | /* not 16-bit aligned */ | |
498 | "mov #-1,r6 \n" /* prepare a bit mask for high byte */ | |
499 | "shll8 r6 \n" /* r6 = 0xFFFFFF00 */ | |
500 |
501 | "mov.b @%0+,r2 \n" /* load first byte (acts as the initial "old second word" low byte) */ | |
502 | "add #-4,%1 \n" /* adjust end address for early check */ | |
503 | "mov.w @%0+,r3 \n" /* load (initial) first word */ | |
504 | "bra .w_start2_b \n" /* jump into loop after next instr. */ | |
505 | "extu.b r2,r0 \n" /* extend unsigned: r0 = first byte */ | |
506 |
507 | ".align 2 \n" | |
508 | ".w_loop2_b: \n" /* main loop: copy 2 words in a row */ | |
509 | "mov.w @%0+,r3 \n" /* load first word (2+ round) */ | |
510 | "extu.b r2,r0 \n" /* put away low byte of second word (2+ round) */ | |
511 | "and r6,r2 \n" /* get high byte of second word (2+ round) */ | |
512 | "or r1,r2 \n" /* combine with low byte of old first word */ | |
513 | "mov.w r2,@%2 \n" /* write that */ | |
514 | ".w_start2_b: \n" | |
515 | "cmp/hi %0,%1 \n" /* check for end; result is used by the bt below */ | |
516 | "mov.w @%0+,r2 \n" /* load second word */ | |
517 | "extu.b r3,r1 \n" /* put away low byte of first word */ | |
518 | "and r6,r3 \n" /* get high byte of first word */ | |
519 | "or r0,r3 \n" /* combine with low byte of old second word (1st round: the initial byte) */ | |
520 | "mov.w r3,@%2 \n" /* write that */ | |
521 | "bt .w_loop2_b \n" | |
522 | /* 12 instructions for 2 copies, takes 14 clock cycles */ | |
523 | /* avg. 7 cycles per word - 71% faster */ | |
524 |
525 | /* the loop "overreads" 1 byte past the buffer end, however, the last */ | |
526 | /* byte is not written to disk */ | |
527 | "and r6,r2 \n" /* get high byte of last word */ | |
528 | "or r1,r2 \n" /* combine with low byte of old first word */ | |
529 | "bra .w_exit \n" /* done; next instr. still executes */ | |
530 | "mov.w r2,@%2 \n" /* write last word */ | |
531 |
532 | /* 16-bit aligned, loop(load and write word) */ | |
533 | ".w_aligned: \n" | |
534 | "mov.w @%0+,r2 \n" /* load first word (1st round) */ | |
535 | "bra .w_start2_w \n" /* jump into loop after next instr. */ | |
536 | "add #-4,%1 \n" /* adjust end address for early check */ | |
537 |
538 | ".align 2 \n" | |
539 | ".w_loop2_w: \n" /* main loop: copy 2 words in a row */ | |
540 | "mov.w @%0+,r2 \n" /* load first word (2+ round) */ | |
541 | "swap.b r1,r0 \n" /* swap second word (2+ round) */ | |
542 | "mov.w r0,@%2 \n" /* write second word (2+ round) */ | |
543 | ".w_start2_w: \n" | |
544 | "cmp/hi %0,%1 \n" /* check for end; result is used by the bt below */ | |
545 | "mov.w @%0+,r1 \n" /* load second word */ | |
546 | "swap.b r2,r0 \n" /* swap first word */ | |
547 | "mov.w r0,@%2 \n" /* write first word */ | |
548 | "bt .w_loop2_w \n" | |
549 | /* 8 instructions for 2 copies, takes 10 clock cycles */ | |
550 | /* avg. 5 cycles per word - 80% faster */ | |
551 |
552 | "swap.b r1,r0 \n" /* swap second word (last round) */ | |
553 | "mov.w r0,@%2 \n" /* and write it */ | |
554 |
555 | ".w_exit: \n" | |
556 | : /* outputs */ | |
557 | : /* inputs */ /* NOTE(review): %0 and %1 are modified by the asm (post-increment loads, add) although listed as input-only; GCC expects modified operands to be declared as in/out ("+r") - confirm before enabling this path */ | |
558 | /* %0 */ "r"(buf), | |
559 | /* %1 */ "r"(wordcount), | |
560 | /* %2 */ "r"(&ATA_DATA) | |
561 | : /* trashed (clobbered) registers */ | |
562 | "r0","r1","r2","r3","r6" | |
563 | ); | |
564 | #endif | |
565 | } | |
566 | |||
450 | int ata_write_sectors(unsigned long start, | 567 | int ata_write_sectors(unsigned long start, |
451 | int count, | 568 | int count, |
452 | void* buf) | 569 | void* buf) |
@@ -502,7 +619,7 @@ int ata_write_sectors(unsigned long start, | |||
502 | ATA_COMMAND = CMD_WRITE_SECTORS; | 619 | ATA_COMMAND = CMD_WRITE_SECTORS; |
503 | 620 | ||
504 | for (i=0; i<count; i++) { | 621 | for (i=0; i<count; i++) { |
505 | int j; | 622 | |
506 | if (!wait_for_start_of_transfer()) { | 623 | if (!wait_for_start_of_transfer()) { |
507 | ret = -3; | 624 | ret = -3; |
508 | break; | 625 | break; |
@@ -515,11 +632,7 @@ int ata_write_sectors(unsigned long start, | |||
515 | poweroff = false; | 632 | poweroff = false; |
516 | } | 633 | } |
517 | 634 | ||
518 | for (j=0; j<SECTOR_SIZE/2; j++) { | 635 | copy_write_sectors(buf, SECTOR_SIZE/2); |
519 | ATA_DATA = (unsigned short) | ||
520 | (((unsigned char *)buf)[j*2+1] << 8) | | ||
521 | ((unsigned char *)buf)[j*2]; | ||
522 | } | ||
523 | 636 | ||
524 | #ifdef USE_INTERRUPT | 637 | #ifdef USE_INTERRUPT |
525 | /* reading the status register clears the interrupt */ | 638 | /* reading the status register clears the interrupt */ |