patch #922836 by Jens: way faster disk writing, in assembler. The code is in, but yet disabled by #define PREFER_C_WRITING unless it's proven safe for all disks.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@4460 a1c6a512-1295-4272-9138-f99709370657
author: Jörg Hohensohn <hohensoh@rockbox.org> 2004-04-01 05:46:31 +0000
committer: Jörg Hohensohn <hohensoh@rockbox.org> 2004-04-01 05:46:31 +0000
commit: 9c52b24b008b522b665c0fd9c57a7d82395afcec (patch)
tree: d28fa04acdfef57ebbbd27fabe761e0d09e9a987 /firmware/drivers/ata.c
parent: cbd992b440d9eb79303c0b2c463306a96f27418c (diff)
download: rockbox-9c52b24b008b522b665c0fd9c57a7d82395afcec.tar.gz
rockbox-9c52b24b008b522b665c0fd9c57a7d82395afcec.zip
1 files changed, 126 insertions, 13 deletions
diff --git a/firmware/drivers/ata.c b/firmware/drivers/ata.c
index d12dfbec31..b991387074 100644
--- a/firmware/drivers/ata.c
+++ b/firmware/drivers/ata.c
@@ -32,6 +32,8 @@
 /* use plain C code in copy_read_sectors(), instead of tweaked assembler */
 #define PREFER_C /* mystery: assembler caused problems with some disks */
+/* use plain C code in copy_write_sectors(), instead of tweaked assembler */
+#define PREFER_C_WRITING /* we don't know yet about this one */
 #define SECTOR_SIZE     512
 #define ATA_DATA        (*((volatile unsigned short*)0x06104100))
@@ -208,11 +210,9 @@ static void copy_read_sectors(unsigned char* buf, int wordcount)
        "tst     #1,r0       \n"  /* 16-bit aligned ? */
        "bt      .aligned    \n"  /* yes, do word copy */
-        ".align  2           \n"
        /* not 16-bit aligned */
        "mov     #-1,r3      \n"  /* prepare a bit mask for high byte */
-        "extu.b  r3,r3       \n"
+        "shll8   r3          \n"  /* r3 = 0xFFFFFF00 */
-        "swap.b  r3,r3       \n"  /* r3 = 0x0000FF00 */
        "mov.w   @%2,r2      \n"  /* read first word (1st round) */
        "add     #-12,%1     \n"  /* adjust end address for offsets */
@@ -220,6 +220,7 @@ static void copy_read_sectors(unsigned char* buf, int wordcount)
        "bra     .start4_b   \n"  /* jump into loop after next instr. */
        "add     #-5,%0      \n"  /* adjust for dest. offsets; now even */
+        ".align  2           \n"
    ".loop4_b:               \n"  /* main loop: copy 4 words in a row */
        "mov.w   @%2,r2      \n"  /* read first word (2+ round) */
        "and     r3,r1       \n"  /* get high byte of fourth word (2+ round) */
@@ -250,10 +251,8 @@ static void copy_read_sectors(unsigned char* buf, int wordcount)
        /* avg. 6.5 cycles per word - 100% faster */
        "swap.b  r1,r0       \n"  /* get high byte of last word */
-        "mov.b   r0,@(4,%0)  \n"  /* and store it */
        "bra     .exit       \n"
-        "nop                 \n"
+        "mov.b   r0,@(4,%0)  \n"  /* and store it */
        ".align  2           \n"
        /* 16-bit aligned, loop(read and store word) */
@@ -287,7 +286,7 @@ static void copy_read_sectors(unsigned char* buf, int wordcount)
        "swap.b  r1,r0       \n"  /* swap fourth word (last round) */
        "mov.w   r0,@(4,%0)  \n"  /* and store it */
-        ".exit:              \n"
+    ".exit:                  \n"
        : /* outputs */
        : /* inputs */
        /* %0 */ "r"(buf),
@@ -447,6 +446,124 @@ int ata_read_sectors(unsigned long start,
    return ret;
 }
+/* the tight loop of ata_write_sectors(), to avoid the whole in IRAM */
+static void copy_write_sectors(unsigned char* buf,
+                               int wordcount)
+                               __attribute__ ((section (".icode")));
+static void copy_write_sectors(unsigned char* buf, int wordcount)
+{
+#ifdef PREFER_C_WRITING
+    if ( (unsigned int)buf & 1)
+    {   /* not 16-bit aligned, copy byte by byte */
+        unsigned short tmp = 0;
+        unsigned char* bufend = buf + wordcount*2;
+        do
+        {   /* loop compiles to 8 assembler instructions */
+            /* takes 12 clock cycles because of 2 pipeline stalls */
+            tmp = (unsigned short) *buf++;       
+            tmp |= (unsigned short) *buf++ << 8; /* I assume big endian */
+            ATA_DATA = tmp;           /* and don't use the SWAB16 macro */
+        } while (buf < bufend); /* tail loop is faster */
+    }
+    else
+    {   /* 16-bit aligned, can do faster copy */
+        unsigned short* wbuf = (unsigned short*)buf;
+        unsigned short* wbufend = wbuf + wordcount;
+        do
+        {   /* loop compiles to 5 assembler instructions */
+            /* takes 9 clock cycles because of 2 pipeline stalls */
+            ATA_DATA = SWAB16(*wbuf);
+        } while (++wbuf < wbufend); /* tail loop is faster */
+    }
+#else
+    /* optimized assembler version */
+    /* this assumes wordcount to be a multiple of 2 */
+/* writing is not unrolled as much as reading, for several reasons:
+ * - a similar instruction sequence is faster for writing than for reading
+ *   because the auto-incrementing load inctructions can be used
+ * - writing profits from warp mode
+ * Both of these add up to have writing faster than the more unrolled reading.
+ */
+    asm (
+        "add     %1,%1       \n"  /* wordcount -> bytecount */
+        "add     %0,%1       \n"  /* bytecount -> bufend */
+        "mov     %0,r0       \n"
+        "tst     #1,r0       \n"  /* 16-bit aligned ? */
+        "bt      .w_aligned  \n"  /* yes, do word copy */
+        /* not 16-bit aligned */
+        "mov     #-1,r6      \n"  /* prepare a bit mask for high byte */
+        "shll8   r6          \n"  /* r6 = 0xFFFFFF00 */
+        "mov.b   @%0+,r2     \n"  /* load (initial old second) first byte */
+        "add     #-4,%1      \n"  /* adjust end address for early check */
+        "mov.w   @%0+,r3     \n"  /* load (initial) first word */
+        "bra     .w_start2_b \n"
+        "extu.b  r2,r0       \n"  /* extend unsigned */
+        ".align  2           \n"
+    ".w_loop2_b:             \n"  /* main loop: copy 2 words in a row */
+        "mov.w   @%0+,r3     \n"  /* load first word (2+ round) */
+        "extu.b  r2,r0       \n"  /* put away low byte of second word (2+ round) */
+        "and     r6,r2       \n"  /* get high byte of second word (2+ round) */
+        "or      r1,r2       \n"  /* combine with low byte of old first word */
+        "mov.w   r2,@%2      \n"  /* write that */
+    ".w_start2_b:            \n"
+        "cmp/hi  %0,%1       \n"  /* check for end */
+        "mov.w   @%0+,r2     \n"  /* load second word */
+        "extu.b  r3,r1       \n"  /* put away low byte of first word */
+        "and     r6,r3       \n"  /* get high byte of first word */
+        "or      r0,r3       \n"  /* combine with high byte of old second word */
+        "mov.w   r3,@%2      \n"  /* write that */
+        "bt      .w_loop2_b  \n"
+        /* 12 instructions for 2 copies, takes 14 clock cycles */
+        /* avg. 7 cycles per word - 71% faster */
+        /* the loop "overreads" 1 byte past the buffer end, however, the last */
+        /* byte is not written to disk */
+        "and     r6,r2       \n"  /* get high byte of last word */
+        "or      r1,r2       \n"  /* combine with low byte of old first word */
+        "bra     .w_exit     \n"
+        "mov.w   r2,@%2      \n"  /* write last word */
+        /* 16-bit aligned, loop(load and write word) */
+    ".w_aligned:             \n"
+        "mov.w   @%0+,r2     \n"  /* load first word (1st round) */
+        "bra     .w_start2_w \n"  /* jump into loop after next instr. */
+        "add     #-4,%1      \n"  /* adjust end address for early check */
+        ".align  2           \n"
+    ".w_loop2_w:             \n"  /* main loop: copy 2 words in a row */
+        "mov.w   @%0+,r2     \n"  /* load first word (2+ round) */
+        "swap.b  r1,r0       \n"  /* swap second word (2+ round) */
+        "mov.w   r0,@%2      \n"  /* write second word (2+ round) */
+    ".w_start2_w:            \n"
+        "cmp/hi  %0,%1       \n"  /* check for end */
+        "mov.w   @%0+,r1     \n"  /* load second word */
+        "swap.b  r2,r0       \n"  /* swap first word */
+        "mov.w   r0,@%2      \n"  /* write first word */
+        "bt      .w_loop2_w  \n"
+        /* 8 instructions for 2 copies, takes 10 clock cycles */
+        /* avg. 5 cycles per word - 80% faster */
+        "swap.b  r1,r0       \n"  /* swap second word (last round) */
+        "mov.w   r0,@%2      \n"  /* and write it */
+    ".w_exit:                \n"
+        : /* outputs */
+        : /* inputs */
+        /* %0 */ "r"(buf),
+        /* %1 */ "r"(wordcount),
+        /* %2 */ "r"(&ATA_DATA)
+        : /*trashed */
+        "r0","r1","r2","r3","r6"
+    );
+#endif
+}
 int ata_write_sectors(unsigned long start,
                      int count,
                      void* buf)
@@ -502,7 +619,7 @@ int ata_write_sectors(unsigned long start,
    ATA_COMMAND = CMD_WRITE_SECTORS;
    for (i=0; i<count; i++) {
-        int j;
        if (!wait_for_start_of_transfer()) {
            ret = -3;
            break;
@@ -515,11 +632,7 @@ int ata_write_sectors(unsigned long start,
            poweroff = false;
        }
-        for (j=0; j<SECTOR_SIZE/2; j++) {
+        copy_write_sectors(buf, SECTOR_SIZE/2);
-            ATA_DATA = (unsigned short)
-                (((unsigned char *)buf)[j*2+1] << 8) |
-                ((unsigned char *)buf)[j*2];
-        }
 #ifdef USE_INTERRUPT
        /* reading the status register clears the interrupt */
author	Jörg Hohensohn <hohensoh@rockbox.org>	2004-04-01 05:46:31 +0000
committer	Jörg Hohensohn <hohensoh@rockbox.org>	2004-04-01 05:46:31 +0000
commit	9c52b24b008b522b665c0fd9c57a7d82395afcec (patch)
tree	d28fa04acdfef57ebbbd27fabe761e0d09e9a987 /firmware/drivers/ata.c
parent	cbd992b440d9eb79303c0b2c463306a96f27418c (diff)
download	rockbox-9c52b24b008b522b665c0fd9c57a7d82395afcec.tar.gz rockbox-9c52b24b008b522b665c0fd9c57a7d82395afcec.zip

diff --git a/firmware/drivers/ata.c b/firmware/drivers/ata.c index d12dfbec31..b991387074 100644 --- a/firmware/drivers/ata.c +++ b/firmware/drivers/ata.c
@@ -32,6 +32,8 @@
32		32
33	/* use plain C code in copy_read_sectors(), instead of tweaked assembler */	33	/* use plain C code in copy_read_sectors(), instead of tweaked assembler */
34	#define PREFER_C /* mystery: assembler caused problems with some disks */	34	#define PREFER_C /* mystery: assembler caused problems with some disks */
		35	/* use plain C code in copy_write_sectors(), instead of tweaked assembler */
		36	#define PREFER_C_WRITING /* we don't know yet about this one */
35		37
36	#define SECTOR_SIZE 512	38	#define SECTOR_SIZE 512
37	#define ATA_DATA (*((volatile unsigned short*)0x06104100))	39	#define ATA_DATA (*((volatile unsigned short*)0x06104100))
@@ -208,11 +210,9 @@ static void copy_read_sectors(unsigned char* buf, int wordcount)
208	"tst #1,r0 \n" /* 16-bit aligned ? */	210	"tst #1,r0 \n" /* 16-bit aligned ? */
209	"bt .aligned \n" /* yes, do word copy */	211	"bt .aligned \n" /* yes, do word copy */
210		212
211	".align 2 \n"
212	/* not 16-bit aligned */	213	/* not 16-bit aligned */
213	"mov #-1,r3 \n" /* prepare a bit mask for high byte */	214	"mov #-1,r3 \n" /* prepare a bit mask for high byte */
214	"extu.b r3,r3 \n"	215	"shll8 r3 \n" /* r3 = 0xFFFFFF00 */
215	"swap.b r3,r3 \n" /* r3 = 0x0000FF00 */
216		216
217	"mov.w @%2,r2 \n" /* read first word (1st round) */	217	"mov.w @%2,r2 \n" /* read first word (1st round) */
218	"add #-12,%1 \n" /* adjust end address for offsets */	218	"add #-12,%1 \n" /* adjust end address for offsets */
@@ -220,6 +220,7 @@ static void copy_read_sectors(unsigned char* buf, int wordcount)
220	"bra .start4_b \n" /* jump into loop after next instr. */	220	"bra .start4_b \n" /* jump into loop after next instr. */
221	"add #-5,%0 \n" /* adjust for dest. offsets; now even */	221	"add #-5,%0 \n" /* adjust for dest. offsets; now even */
222		222
		223	".align 2 \n"
223	".loop4_b: \n" /* main loop: copy 4 words in a row */	224	".loop4_b: \n" /* main loop: copy 4 words in a row */
224	"mov.w @%2,r2 \n" /* read first word (2+ round) */	225	"mov.w @%2,r2 \n" /* read first word (2+ round) */
225	"and r3,r1 \n" /* get high byte of fourth word (2+ round) */	226	"and r3,r1 \n" /* get high byte of fourth word (2+ round) */
@@ -250,10 +251,8 @@ static void copy_read_sectors(unsigned char* buf, int wordcount)
250	/* avg. 6.5 cycles per word - 100% faster */	251	/* avg. 6.5 cycles per word - 100% faster */
251		252
252	"swap.b r1,r0 \n" /* get high byte of last word */	253	"swap.b r1,r0 \n" /* get high byte of last word */
253	"mov.b r0,@(4,%0) \n" /* and store it */
254
255	"bra .exit \n"	254	"bra .exit \n"
256	"nop \n"	255	"mov.b r0,@(4,%0) \n" /* and store it */
257		256
258	".align 2 \n"	257	".align 2 \n"
259	/* 16-bit aligned, loop(read and store word) */	258	/* 16-bit aligned, loop(read and store word) */
@@ -287,7 +286,7 @@ static void copy_read_sectors(unsigned char* buf, int wordcount)
287	"swap.b r1,r0 \n" /* swap fourth word (last round) */	286	"swap.b r1,r0 \n" /* swap fourth word (last round) */
288	"mov.w r0,@(4,%0) \n" /* and store it */	287	"mov.w r0,@(4,%0) \n" /* and store it */
289		288
290	".exit: \n"	289	".exit: \n"
291	: /* outputs */	290	: /* outputs */
292	: /* inputs */	291	: /* inputs */
293	/* %0 */ "r"(buf),	292	/* %0 */ "r"(buf),
@@ -447,6 +446,124 @@ int ata_read_sectors(unsigned long start,
447	return ret;	446	return ret;
448	}	447	}
449		448
		449	/* the tight loop of ata_write_sectors(), to avoid the whole in IRAM */
		450	static void copy_write_sectors(unsigned char* buf,
		451	int wordcount)
		452	__attribute__ ((section (".icode")));
		453
		454	static void copy_write_sectors(unsigned char* buf, int wordcount)
		455	{
		456	#ifdef PREFER_C_WRITING
		457
		458	if ( (unsigned int)buf & 1)
		459	{ /* not 16-bit aligned, copy byte by byte */
		460	unsigned short tmp = 0;
		461	unsigned char* bufend = buf + wordcount*2;
		462	do
		463	{ /* loop compiles to 8 assembler instructions */
		464	/* takes 12 clock cycles because of 2 pipeline stalls */
		465	tmp = (unsigned short) *buf++;
		466	tmp |= (unsigned short) *buf++ << 8; /* I assume big endian */
		467	ATA_DATA = tmp; /* and don't use the SWAB16 macro */
		468	} while (buf < bufend); /* tail loop is faster */
		469	}
		470	else
		471	{ /* 16-bit aligned, can do faster copy */
		472	unsigned short* wbuf = (unsigned short*)buf;
		473	unsigned short* wbufend = wbuf + wordcount;
		474	do
		475	{ /* loop compiles to 5 assembler instructions */
		476	/* takes 9 clock cycles because of 2 pipeline stalls */
		477	ATA_DATA = SWAB16(*wbuf);
		478	} while (++wbuf < wbufend); /* tail loop is faster */
		479	}
		480	#else
		481	/* optimized assembler version */
		482	/* this assumes wordcount to be a multiple of 2 */
		483
		484	/* writing is not unrolled as much as reading, for several reasons:
		485	* - a similar instruction sequence is faster for writing than for reading
		486	* because the auto-incrementing load inctructions can be used
		487	* - writing profits from warp mode
		488	* Both of these add up to have writing faster than the more unrolled reading.
		489	*/
		490	asm (
		491	"add %1,%1 \n" /* wordcount -> bytecount */
		492	"add %0,%1 \n" /* bytecount -> bufend */
		493	"mov %0,r0 \n"
		494	"tst #1,r0 \n" /* 16-bit aligned ? */
		495	"bt .w_aligned \n" /* yes, do word copy */
		496
		497	/* not 16-bit aligned */
		498	"mov #-1,r6 \n" /* prepare a bit mask for high byte */
		499	"shll8 r6 \n" /* r6 = 0xFFFFFF00 */
		500
		501	"mov.b @%0+,r2 \n" /* load (initial old second) first byte */
		502	"add #-4,%1 \n" /* adjust end address for early check */
		503	"mov.w @%0+,r3 \n" /* load (initial) first word */
		504	"bra .w_start2_b \n"
		505	"extu.b r2,r0 \n" /* extend unsigned */
		506
		507	".align 2 \n"
		508	".w_loop2_b: \n" /* main loop: copy 2 words in a row */
		509	"mov.w @%0+,r3 \n" /* load first word (2+ round) */
		510	"extu.b r2,r0 \n" /* put away low byte of second word (2+ round) */
		511	"and r6,r2 \n" /* get high byte of second word (2+ round) */
		512	"or r1,r2 \n" /* combine with low byte of old first word */
		513	"mov.w r2,@%2 \n" /* write that */
		514	".w_start2_b: \n"
		515	"cmp/hi %0,%1 \n" /* check for end */
		516	"mov.w @%0+,r2 \n" /* load second word */
		517	"extu.b r3,r1 \n" /* put away low byte of first word */
		518	"and r6,r3 \n" /* get high byte of first word */
		519	"or r0,r3 \n" /* combine with high byte of old second word */
		520	"mov.w r3,@%2 \n" /* write that */
		521	"bt .w_loop2_b \n"
		522	/* 12 instructions for 2 copies, takes 14 clock cycles */
		523	/* avg. 7 cycles per word - 71% faster */
		524
		525	/* the loop "overreads" 1 byte past the buffer end, however, the last */
		526	/* byte is not written to disk */
		527	"and r6,r2 \n" /* get high byte of last word */
		528	"or r1,r2 \n" /* combine with low byte of old first word */
		529	"bra .w_exit \n"
		530	"mov.w r2,@%2 \n" /* write last word */
		531
		532	/* 16-bit aligned, loop(load and write word) */
		533	".w_aligned: \n"
		534	"mov.w @%0+,r2 \n" /* load first word (1st round) */
		535	"bra .w_start2_w \n" /* jump into loop after next instr. */
		536	"add #-4,%1 \n" /* adjust end address for early check */
		537
		538	".align 2 \n"
		539	".w_loop2_w: \n" /* main loop: copy 2 words in a row */
		540	"mov.w @%0+,r2 \n" /* load first word (2+ round) */
		541	"swap.b r1,r0 \n" /* swap second word (2+ round) */
		542	"mov.w r0,@%2 \n" /* write second word (2+ round) */
		543	".w_start2_w: \n"
		544	"cmp/hi %0,%1 \n" /* check for end */
		545	"mov.w @%0+,r1 \n" /* load second word */
		546	"swap.b r2,r0 \n" /* swap first word */
		547	"mov.w r0,@%2 \n" /* write first word */
		548	"bt .w_loop2_w \n"
		549	/* 8 instructions for 2 copies, takes 10 clock cycles */
		550	/* avg. 5 cycles per word - 80% faster */
		551
		552	"swap.b r1,r0 \n" /* swap second word (last round) */
		553	"mov.w r0,@%2 \n" /* and write it */
		554
		555	".w_exit: \n"
		556	: /* outputs */
		557	: /* inputs */
		558	/* %0 */ "r"(buf),
		559	/* %1 */ "r"(wordcount),
		560	/* %2 */ "r"(&ATA_DATA)
		561	: /*trashed */
		562	"r0","r1","r2","r3","r6"
		563	);
		564	#endif
		565	}
		566
450	int ata_write_sectors(unsigned long start,	567	int ata_write_sectors(unsigned long start,
451	int count,	568	int count,
452	void* buf)	569	void* buf)
@@ -502,7 +619,7 @@ int ata_write_sectors(unsigned long start,
502	ATA_COMMAND = CMD_WRITE_SECTORS;	619	ATA_COMMAND = CMD_WRITE_SECTORS;
503		620
504	for (i=0; i<count; i++) {	621	for (i=0; i<count; i++) {
505	int j;	622
506	if (!wait_for_start_of_transfer()) {	623	if (!wait_for_start_of_transfer()) {
507	ret = -3;	624	ret = -3;
508	break;	625	break;
@@ -515,11 +632,7 @@ int ata_write_sectors(unsigned long start,
515	poweroff = false;	632	poweroff = false;
516	}	633	}
517		634
518	for (j=0; j<SECTOR_SIZE/2; j++) {	635	copy_write_sectors(buf, SECTOR_SIZE/2);
519	ATA_DATA = (unsigned short)
520	(((unsigned char *)buf)[j*2+1] << 8) |
521	((unsigned char *)buf)[j*2];
522	}
523		636
524	#ifdef USE_INTERRUPT	637	#ifdef USE_INTERRUPT
525	/* reading the status register clears the interrupt */	638	/* reading the status register clears the interrupt */