summaryrefslogtreecommitdiff
path: root/firmware/drivers/ata.c
diff options
context:
space:
mode:
authorJörg Hohensohn <hohensoh@rockbox.org>2004-04-01 05:46:31 +0000
committerJörg Hohensohn <hohensoh@rockbox.org>2004-04-01 05:46:31 +0000
commit9c52b24b008b522b665c0fd9c57a7d82395afcec (patch)
treed28fa04acdfef57ebbbd27fabe761e0d09e9a987 /firmware/drivers/ata.c
parentcbd992b440d9eb79303c0b2c463306a96f27418c (diff)
downloadrockbox-9c52b24b008b522b665c0fd9c57a7d82395afcec.tar.gz
rockbox-9c52b24b008b522b665c0fd9c57a7d82395afcec.zip
patch #922836 by Jens: much faster disk writing, in assembler. The code is in, but is disabled for now by #define PREFER_C_WRITING until it is proven safe for all disks.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@4460 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'firmware/drivers/ata.c')
-rw-r--r--firmware/drivers/ata.c139
1 files changed, 126 insertions, 13 deletions
diff --git a/firmware/drivers/ata.c b/firmware/drivers/ata.c
index d12dfbec31..b991387074 100644
--- a/firmware/drivers/ata.c
+++ b/firmware/drivers/ata.c
@@ -32,6 +32,8 @@
32 32
33/* use plain C code in copy_read_sectors(), instead of tweaked assembler */ 33/* use plain C code in copy_read_sectors(), instead of tweaked assembler */
34#define PREFER_C /* mystery: assembler caused problems with some disks */ 34#define PREFER_C /* mystery: assembler caused problems with some disks */
35/* use plain C code in copy_write_sectors(), instead of tweaked assembler */
36#define PREFER_C_WRITING /* we don't know yet about this one */
35 37
36#define SECTOR_SIZE 512 38#define SECTOR_SIZE 512
37#define ATA_DATA (*((volatile unsigned short*)0x06104100)) 39#define ATA_DATA (*((volatile unsigned short*)0x06104100))
@@ -208,11 +210,9 @@ static void copy_read_sectors(unsigned char* buf, int wordcount)
208 "tst #1,r0 \n" /* 16-bit aligned ? */ 210 "tst #1,r0 \n" /* 16-bit aligned ? */
209 "bt .aligned \n" /* yes, do word copy */ 211 "bt .aligned \n" /* yes, do word copy */
210 212
211 ".align 2 \n"
212 /* not 16-bit aligned */ 213 /* not 16-bit aligned */
213 "mov #-1,r3 \n" /* prepare a bit mask for high byte */ 214 "mov #-1,r3 \n" /* prepare a bit mask for high byte */
214 "extu.b r3,r3 \n" 215 "shll8 r3 \n" /* r3 = 0xFFFFFF00 */
215 "swap.b r3,r3 \n" /* r3 = 0x0000FF00 */
216 216
217 "mov.w @%2,r2 \n" /* read first word (1st round) */ 217 "mov.w @%2,r2 \n" /* read first word (1st round) */
218 "add #-12,%1 \n" /* adjust end address for offsets */ 218 "add #-12,%1 \n" /* adjust end address for offsets */
@@ -220,6 +220,7 @@ static void copy_read_sectors(unsigned char* buf, int wordcount)
220 "bra .start4_b \n" /* jump into loop after next instr. */ 220 "bra .start4_b \n" /* jump into loop after next instr. */
221 "add #-5,%0 \n" /* adjust for dest. offsets; now even */ 221 "add #-5,%0 \n" /* adjust for dest. offsets; now even */
222 222
223 ".align 2 \n"
223 ".loop4_b: \n" /* main loop: copy 4 words in a row */ 224 ".loop4_b: \n" /* main loop: copy 4 words in a row */
224 "mov.w @%2,r2 \n" /* read first word (2+ round) */ 225 "mov.w @%2,r2 \n" /* read first word (2+ round) */
225 "and r3,r1 \n" /* get high byte of fourth word (2+ round) */ 226 "and r3,r1 \n" /* get high byte of fourth word (2+ round) */
@@ -250,10 +251,8 @@ static void copy_read_sectors(unsigned char* buf, int wordcount)
250 /* avg. 6.5 cycles per word - 100% faster */ 251 /* avg. 6.5 cycles per word - 100% faster */
251 252
252 "swap.b r1,r0 \n" /* get high byte of last word */ 253 "swap.b r1,r0 \n" /* get high byte of last word */
253 "mov.b r0,@(4,%0) \n" /* and store it */
254
255 "bra .exit \n" 254 "bra .exit \n"
256 "nop \n" 255 "mov.b r0,@(4,%0) \n" /* and store it */
257 256
258 ".align 2 \n" 257 ".align 2 \n"
259 /* 16-bit aligned, loop(read and store word) */ 258 /* 16-bit aligned, loop(read and store word) */
@@ -287,7 +286,7 @@ static void copy_read_sectors(unsigned char* buf, int wordcount)
287 "swap.b r1,r0 \n" /* swap fourth word (last round) */ 286 "swap.b r1,r0 \n" /* swap fourth word (last round) */
288 "mov.w r0,@(4,%0) \n" /* and store it */ 287 "mov.w r0,@(4,%0) \n" /* and store it */
289 288
290 ".exit: \n" 289 ".exit: \n"
291 : /* outputs */ 290 : /* outputs */
292 : /* inputs */ 291 : /* inputs */
293 /* %0 */ "r"(buf), 292 /* %0 */ "r"(buf),
@@ -447,6 +446,124 @@ int ata_read_sectors(unsigned long start,
447 return ret; 446 return ret;
448} 447}
449 448
449/* the tight loop of ata_write_sectors(): writes wordcount 16-bit words from buf to ATA_DATA; split out so that only this loop, not the whole function, goes into IRAM */
450static void copy_write_sectors(unsigned char* buf,
451 int wordcount)
452 __attribute__ ((section (".icode")));
453
454static void copy_write_sectors(unsigned char* buf, int wordcount)
455{
456#ifdef PREFER_C_WRITING
457
458 if ( (unsigned int)buf & 1)
459 { /* not 16-bit aligned, copy byte by byte */
460 unsigned short tmp = 0;
461 unsigned char* bufend = buf + wordcount*2;
462 do
463 { /* loop compiles to 8 assembler instructions */
464 /* takes 12 clock cycles because of 2 pipeline stalls */
465 tmp = (unsigned short) *buf++;
466 tmp |= (unsigned short) *buf++ << 8; /* I assume big endian */
467 ATA_DATA = tmp; /* and don't use the SWAB16 macro */
468 } while (buf < bufend); /* tail loop is faster */
469 }
470 else
471 { /* 16-bit aligned, can do faster copy */
472 unsigned short* wbuf = (unsigned short*)buf;
473 unsigned short* wbufend = wbuf + wordcount;
474 do
475 { /* loop compiles to 5 assembler instructions */
476 /* takes 9 clock cycles because of 2 pipeline stalls */
477 ATA_DATA = SWAB16(*wbuf);
478 } while (++wbuf < wbufend); /* tail loop is faster */
479 }
480#else
481 /* optimized assembler version */
482 /* this assumes wordcount to be a multiple of 2 */
483
484/* writing is not unrolled as much as reading, for several reasons:
485 * - a similar instruction sequence is faster for writing than for reading
486 * because the auto-incrementing load instructions can be used
487 * - writing profits from warp mode
488 * Both of these add up to have writing faster than the more unrolled reading.
489 */
490 asm (
491 "add %1,%1 \n" /* wordcount -> bytecount */
492 "add %0,%1 \n" /* bytecount -> bufend */
493 "mov %0,r0 \n" /* copy buf address to r0 for the alignment test */
494 "tst #1,r0 \n" /* 16-bit aligned ? */
495 "bt .w_aligned \n" /* yes, do word copy */
496
497 /* not 16-bit aligned */
498 "mov #-1,r6 \n" /* prepare a bit mask for high byte */
499 "shll8 r6 \n" /* r6 = 0xFFFFFF00 */
500
501 "mov.b @%0+,r2 \n" /* load (initial old second) first byte */
502 "add #-4,%1 \n" /* adjust end address for early check */
503 "mov.w @%0+,r3 \n" /* load (initial) first word */
504 "bra .w_start2_b \n" /* jump into loop after next instr. */
505 "extu.b r2,r0 \n" /* extend unsigned */
506
507 ".align 2 \n"
508 ".w_loop2_b: \n" /* main loop: copy 2 words in a row */
509 "mov.w @%0+,r3 \n" /* load first word (2+ round) */
510 "extu.b r2,r0 \n" /* put away low byte of second word (2+ round) */
511 "and r6,r2 \n" /* get high byte of second word (2+ round) */
512 "or r1,r2 \n" /* combine with low byte of old first word */
513 "mov.w r2,@%2 \n" /* write that */
514 ".w_start2_b: \n"
515 "cmp/hi %0,%1 \n" /* check for end */
516 "mov.w @%0+,r2 \n" /* load second word */
517 "extu.b r3,r1 \n" /* put away low byte of first word */
518 "and r6,r3 \n" /* get high byte of first word */
519 "or r0,r3 \n" /* combine with high byte of old second word */
520 "mov.w r3,@%2 \n" /* write that */
521 "bt .w_loop2_b \n"
522 /* 12 instructions for 2 copies, takes 14 clock cycles */
523 /* avg. 7 cycles per word - 71% faster */
524
525 /* the loop "overreads" 1 byte past the buffer end, however, the last */
526 /* byte is not written to disk */
527 "and r6,r2 \n" /* get high byte of last word */
528 "or r1,r2 \n" /* combine with low byte of old first word */
529 "bra .w_exit \n" /* leave after next instr. */
530 "mov.w r2,@%2 \n" /* write last word */
531
532 /* 16-bit aligned, loop(load and write word) */
533 ".w_aligned: \n"
534 "mov.w @%0+,r2 \n" /* load first word (1st round) */
535 "bra .w_start2_w \n" /* jump into loop after next instr. */
536 "add #-4,%1 \n" /* adjust end address for early check */
537
538 ".align 2 \n"
539 ".w_loop2_w: \n" /* main loop: copy 2 words in a row */
540 "mov.w @%0+,r2 \n" /* load first word (2+ round) */
541 "swap.b r1,r0 \n" /* swap second word (2+ round) */
542 "mov.w r0,@%2 \n" /* write second word (2+ round) */
543 ".w_start2_w: \n"
544 "cmp/hi %0,%1 \n" /* check for end */
545 "mov.w @%0+,r1 \n" /* load second word */
546 "swap.b r2,r0 \n" /* swap first word */
547 "mov.w r0,@%2 \n" /* write first word */
548 "bt .w_loop2_w \n"
549 /* 8 instructions for 2 copies, takes 10 clock cycles */
550 /* avg. 5 cycles per word - 80% faster */
551
552 "swap.b r1,r0 \n" /* swap second word (last round) */
553 "mov.w r0,@%2 \n" /* and write it */
554
555 ".w_exit: \n"
556 : /* outputs */
557 : /* inputs -- NOTE(review): %0 and %1 are modified by the code above (add, post-increment loads) although declared input-only; confirm this is safe with the compiler in use */
558 /* %0 */ "r"(buf),
559 /* %1 */ "r"(wordcount),
560 /* %2 */ "r"(&ATA_DATA)
561 : /*trashed */
562 "r0","r1","r2","r3","r6"
563 );
564#endif
565}
566
450int ata_write_sectors(unsigned long start, 567int ata_write_sectors(unsigned long start,
451 int count, 568 int count,
452 void* buf) 569 void* buf)
@@ -502,7 +619,7 @@ int ata_write_sectors(unsigned long start,
502 ATA_COMMAND = CMD_WRITE_SECTORS; 619 ATA_COMMAND = CMD_WRITE_SECTORS;
503 620
504 for (i=0; i<count; i++) { 621 for (i=0; i<count; i++) {
505 int j; 622
506 if (!wait_for_start_of_transfer()) { 623 if (!wait_for_start_of_transfer()) {
507 ret = -3; 624 ret = -3;
508 break; 625 break;
@@ -515,11 +632,7 @@ int ata_write_sectors(unsigned long start,
515 poweroff = false; 632 poweroff = false;
516 } 633 }
517 634
518 for (j=0; j<SECTOR_SIZE/2; j++) { 635 copy_write_sectors(buf, SECTOR_SIZE/2);
519 ATA_DATA = (unsigned short)
520 (((unsigned char *)buf)[j*2+1] << 8) |
521 ((unsigned char *)buf)[j*2];
522 }
523 636
524#ifdef USE_INTERRUPT 637#ifdef USE_INTERRUPT
525 /* reading the status register clears the interrupt */ 638 /* reading the status register clears the interrupt */