1 files changed, 105 insertions, 72 deletions
diff --git a/firmware/drivers/ata.c b/firmware/drivers/ata.c
index 5653466900..7641be7ec1 100644
--- a/firmware/drivers/ata.c
+++ b/firmware/drivers/ata.c
@@ -174,92 +174,129 @@ static void copy_read_sectors(unsigned char* buf,
                         __attribute__ ((section (".icode")));
 static void copy_read_sectors(unsigned char* buf, int wordcount)
 {
-    unsigned short tmp = 0; /* have to init to prevent warning? */
+#ifdef PREFER_C
+    unsigned short tmp = 0;
-    if ( (unsigned int)buf & 1) 
+    if ( (unsigned int)buf & 1)
    {   /* not 16-bit aligned, copy byte by byte */
        unsigned char* bufend = buf + wordcount*2;
-#ifdef PREFER_C
        do
        {   /* loop compiles to 9 assembler instructions */
+            /* takes 13 clock cycles because of 2 pipeline stalls */
            tmp = ATA_DATA;
            *buf++ = tmp & 0xff; /* I assume big endian */
            *buf++ = tmp >> 8;   /*  and don't use the SWAB16 macro */
        } while (buf < bufend); /* tail loop is faster */
-#else
-        /* I can bring it down to 7 instructions/loop, and exploit pipeline */
-        asm (
-            "mov    #1, r0 \n"      /* r0 = 1; */
-            /* correct for the "early increment" below */
-            "add        #-2,%2 \n"    /* buf -= 2; */
-            "add        #-2,%3 \n"    /* bufend -= 2; */
-            "loop_b: \n"
-            "mov.w      @%1,%0 \n"      /* tmp = ATA_DATA; */
-            /* Now we're reading from the bus, I do something independent we 
-               need later, to avoid pipeline stall */
-            "add        #0x02,%2 \n"    /* buf += 2; */
-            "cmp/hs     %3,%2 \n"       /* if (buf < bufend) */
-            /* now use the read result */
-            "mov.b      %0,@%2 \n"      /* buf[0] = lowbyte(tmp); */
-            "shlr8      %0 \n"          /* tmp >>= 8; */
-            "mov.b      %0,@(r0,%2) \n" /* buf[r0] = lowbyte(tmp); */
-            "bf     loop_b \n"      /* goto loop_b; */
-            : /* outputs */
-            : /* inputs */
-            /* %0 */ "r"(tmp),
-            /* %1 */ "r"(&ATA_DATA),
-            /* %2 */ "r"(buf),
-            /* %3 */ "r"(bufend)
-            : /* trashed */
-            "r0"
-        );
-#endif
    }
-    else 
+    else
    {   /* 16-bit aligned, can do faster copy */
        unsigned short* wbuf = (unsigned short*)buf;
        unsigned short* wbufend = wbuf + wordcount;
-#ifdef PREFER_C
        do
        {   /* loop compiles to 7 assembler instructions */
+            /* takes 11 clock cycles because of 2 pipeline stalls */
            *wbuf = SWAB16(ATA_DATA);
        } while (++wbuf < wbufend); /* tail loop is faster */
+    }
 #else
-        /* I can bring it down to 9 instructions for 2 loops, and pipeline */
+    /* turbo-charged assembler version */
-        asm (
+    /* this assumes wordcount to be a multiple of 4 */
-            "mov    #2, r0 \n"      /* r0 = 2 */
+    asm (
-            /* correct for the "early increment" below */
+        "add     %1,%1       \n"  /* wordcount -> bytecount */
-            "add        #-4,%2 \n"      /* wbuf -= 4; */
+        "add     %0,%1       \n"  /* bytecount -> bufend */
-            "bra enter_loop \n"     /* goto enter_loop, after next instr. */
+        "mov     %0,r0       \n"
-            "add        #-4,%3 \n"      /* wbufend -= 4; */
+        "tst     #1,r0       \n"  /* 16-bit aligned ? */
-            "loop_w: \n"
+        "bt      .aligned    \n"  /* yes, do word copy */
-            /* use read result and store, from last round */
-            "swap.b     %0,%0 \n"       /* endian_swap(tmp); */
+        ".align  2           \n"
-            "mov.w      %0,@(r0,%2) \n" /* wbuf[r0] = tmp; */
+        /* not 16-bit aligned */
-            "enter_loop: \n"
+        "mov     #-1,r3      \n"  /* prepare a bit mask for high byte */
-            "mov.w      @%1,%0 \n"      /* tmp = ATA_DATA; */
+        "extu.b  r3,r3       \n"
-            /* keep the pipeline busy with 2 independent instructions */
+        "swap.b  r3,r3       \n"  /* r3 = 0x0000FF00 */
-            "add        #0x04,%2 \n"    /* wbuf += 4; */
-            "cmp/hs     %3,%2 \n"       /* if (wbuf < wbufend) */
+        "mov.w   @%2,r2      \n"  /* read first word (1st round) */
-            "swap.b     %0,%0 \n"       /* endian_swap(tmp); */
+        "add     #-12,%1     \n"  /* adjust end address for offsets */
-            "mov.w      %0,@%2 \n"      /* wbuf[0] = tmp; */
+        "mov.b   r2,@%0      \n"  /* store low byte of first word */
-            /* unrolled, do one more */
+        "bra     .start4_b   \n"  /* jump into loop after next instr. */
-            "mov.w      @%1,%0 \n"      /* tmp = ATA_DATA; */
+        "add     #-5,%0      \n"  /* adjust for dest. offsets; now even */
-            /* use and store later, to keep pipeline busy */
-            "bf     loop_w \n"      /* goto loop_w; */
+    ".loop4_b:               \n"  /* main loop: copy 4 words in a row */
-            "swap.b     %0,%0 \n"       /* endian_swap(tmp); */
+        "mov.w   @%2,r2      \n"  /* read first word (2+ round) */
-            "mov.w      %0,@(r0,%2) \n" /* wbuf[r0] = tmp; */
+        "and     r3,r1       \n"  /* get high byte of fourth word (2+ round) */
-            : /* outputs */
+        "extu.b  r2,r0       \n"  /* get low byte of first word (2+ round) */
-            : /* inputs */
+        "or      r1,r0       \n"  /* combine with high byte of fourth word */
-            /* %0 */ "r"(tmp),
+        "mov.w   r0,@(4,%0)  \n"  /* store at buf[4] */
-            /* %1 */ "r"(&ATA_DATA),
+        "nop                 \n"  /* maintain alignment */
-            /* %2 */ "r"(wbuf),
+    ".start4_b:              \n"
-            /* %3 */ "r"(wbufend)
+        "mov.w   @%2,r1      \n"  /* read second word */
-            : /* trashed */
+        "and     r3,r2       \n"  /* get high byte of first word */
-            "r0"
+        "extu.b  r1,r0       \n"  /* get low byte of second word */
-        );
+        "or      r2,r0       \n"  /* combine with high byte of first word */
+        "mov.w   r0,@(6,%0)  \n"  /* store at buf[6] */
+        "add     #8,%0       \n"  /* buf += 8 */
+        "mov.w   @%2,r2      \n"  /* read third word */
+        "and     r3,r1       \n"  /* get high byte of second word */
+        "extu.b  r2,r0       \n"  /* get low byte of third word */
+        "or      r1,r0       \n"  /* combine with high byte of second word */
+        "mov.w   r0,@%0      \n"  /* store at buf[0] */
+        "cmp/hi  %0,%1       \n"  /* check for end */
+        "mov.w   @%2,r1      \n"  /* read fourth word */
+        "and     r3,r2       \n"  /* get high byte of third word */
+        "extu.b  r1,r0       \n"  /* get low byte of fourth word */
+        "or      r2,r0       \n"  /* combine with high byte of third word */
+        "mov.w   r0,@(2,%0)  \n"  /* store at buf[2] */
+        "bt      .loop4_b    \n"
+        /* 24 instructions for 4 copies, takes 26 clock cycles */
+        /* avg. 6.5 cycles per word - 100% faster */
+        "swap.b  r1,r0       \n"  /* get high byte of last word */
+        "mov.b   r0,@(4,%0)  \n"  /* and store it */
+        "bra     .exit       \n"
+        "nop                 \n"
+        ".align  2           \n"
+        /* 16-bit aligned, loop(read and store word) */
+    ".aligned:               \n"
+        "mov.w   @%2,r2      \n"  /* read first word (1st round) */
+        "add     #-12,%1     \n"  /* adjust end address for offsets */
+        "bra     .start4_w   \n"  /* jump into loop after next instr. */
+        "add     #-6,%0      \n"  /* adjust for destination offsets */
+    ".loop4_w:               \n"  /* main loop: copy 4 words in a row */
+        "mov.w   @%2,r2      \n"  /* read first word (2+ round) */
+        "swap.b  r1,r0       \n"  /* swap fourth word (2+ round) */
+        "mov.w   r0,@(4,%0)  \n"  /* store fourth word (2+ round) */
+        "nop                 \n"  /* maintain alignment */
+    ".start4_w:              \n"
+        "mov.w   @%2,r1      \n"  /* read second word */
+        "swap.b  r2,r0       \n"  /* swap first word */
+        "mov.w   r0,@(6,%0)  \n"  /* store first word in buf[6] */
+        "add     #8,%0       \n"  /* buf += 8 */
+        "mov.w   @%2,r2      \n"  /* read third word */
+        "swap.b  r1,r0       \n"  /* swap second word */
+        "mov.w   r0,@%0      \n"  /* store second word in buf[0] */
+        "cmp/hi  %0,%1       \n"  /* check for end */
+        "mov.w   @%2,r1      \n"  /* read fourth word */
+        "swap.b  r2,r0       \n"  /* swap third word */
+        "mov.w   r0,@(2,%0)  \n"  /* store third word */
+        "bt      .loop4_w    \n"
+        /* 16 instructions for 4 copies, takes 18 clock cycles */
+        /* avg. 4.5 cycles per word - 144% faster */
+        "swap.b  r1,r0       \n"  /* swap fourth word (last round) */
+        "mov.w   r0,@(4,%0)  \n"  /* and store it */
+        ".exit:              \n"
+        : /* outputs */
+        : /* inputs */
+        /* %0 */ "r"(buf),
+        /* %1 */ "r"(wordcount),
+        /* %2 */ "r"(&ATA_DATA)
+        : /*trashed */
+        "r0","r1","r2","r3"
+    );
 #endif
-    }
 }
 int ata_read_sectors(unsigned long start,
@@ -958,14 +995,10 @@ int ata_init(void)
        if (coldstart)
        {
-            /* Reset both master and slave, we don't yet know what's in */
+            /* This should reset both master and slave, we don't yet know what's in */
-            /* this is safe because non-present devices don't report busy */
            ata_device = 0;
            if (ata_hard_reset())
                return -1;
-            ata_device = SELECT_DEVICE1;
-            if (ata_hard_reset())
-                return -2;
        }
        rc = master_slave_detect();

diff --git a/firmware/drivers/ata.c b/firmware/drivers/ata.c index 5653466900..7641be7ec1 100644 --- a/firmware/drivers/ata.c +++ b/firmware/drivers/ata.c
@@ -174,92 +174,129 @@ static void copy_read_sectors(unsigned char* buf,
174	__attribute__ ((section (".icode")));	174	__attribute__ ((section (".icode")));
175	static void copy_read_sectors(unsigned char* buf, int wordcount)	175	static void copy_read_sectors(unsigned char* buf, int wordcount)
176	{	176	{
177	unsigned short tmp = 0; /* have to init to prevent warning? */	177	#ifdef PREFER_C
		178	unsigned short tmp = 0;
178		179
179	if ( (unsigned int)buf & 1)	180	if ( (unsigned int)buf & 1)
180	{ /* not 16-bit aligned, copy byte by byte */	181	{ /* not 16-bit aligned, copy byte by byte */
181	unsigned char* bufend = buf + wordcount*2;	182	unsigned char* bufend = buf + wordcount*2;
182	#ifdef PREFER_C
183	do	183	do
184	{ /* loop compiles to 9 assembler instructions */	184	{ /* loop compiles to 9 assembler instructions */
		185	/* takes 13 clock cycles because of 2 pipeline stalls */
185	tmp = ATA_DATA;	186	tmp = ATA_DATA;
186	*buf++ = tmp & 0xff; /* I assume big endian */	187	*buf++ = tmp & 0xff; /* I assume big endian */
187	*buf++ = tmp >> 8; /* and don't use the SWAB16 macro */	188	*buf++ = tmp >> 8; /* and don't use the SWAB16 macro */
188	} while (buf < bufend); /* tail loop is faster */	189	} while (buf < bufend); /* tail loop is faster */
189	#else
190	/* I can bring it down to 7 instructions/loop, and exploit pipeline */
191	asm (
192	"mov #1, r0 \n" /* r0 = 1; */
193	/* correct for the "early increment" below */
194	"add #-2,%2 \n" /* buf -= 2; */
195	"add #-2,%3 \n" /* bufend -= 2; */
196	"loop_b: \n"
197	"mov.w @%1,%0 \n" /* tmp = ATA_DATA; */
198	/* Now we're reading from the bus, I do something independent we
199	need later, to avoid pipeline stall */
200	"add #0x02,%2 \n" /* buf += 2; */
201	"cmp/hs %3,%2 \n" /* if (buf < bufend) */
202	/* now use the read result */
203	"mov.b %0,@%2 \n" /* buf[0] = lowbyte(tmp); */
204	"shlr8 %0 \n" /* tmp >>= 8; */
205	"mov.b %0,@(r0,%2) \n" /* buf[r0] = lowbyte(tmp); */
206	"bf loop_b \n" /* goto loop_b; */
207	: /* outputs */
208	: /* inputs */
209	/* %0 */ "r"(tmp),
210	/* %1 */ "r"(&ATA_DATA),
211	/* %2 */ "r"(buf),
212	/* %3 */ "r"(bufend)
213	: /* trashed */
214	"r0"
215	);
216	#endif
217	}	190	}
218	else	191	else
219	{ /* 16-bit aligned, can do faster copy */	192	{ /* 16-bit aligned, can do faster copy */
220	unsigned short* wbuf = (unsigned short*)buf;	193	unsigned short* wbuf = (unsigned short*)buf;
221	unsigned short* wbufend = wbuf + wordcount;	194	unsigned short* wbufend = wbuf + wordcount;
222	#ifdef PREFER_C
223	do	195	do
224	{ /* loop compiles to 7 assembler instructions */	196	{ /* loop compiles to 7 assembler instructions */
		197	/* takes 11 clock cycles because of 2 pipeline stalls */
225	*wbuf = SWAB16(ATA_DATA);	198	*wbuf = SWAB16(ATA_DATA);
226	} while (++wbuf < wbufend); /* tail loop is faster */	199	} while (++wbuf < wbufend); /* tail loop is faster */
		200	}
227	#else	201	#else
228	/* I can bring it down to 9 instructions for 2 loops, and pipeline */	202	/* turbo-charged assembler version */
229	asm (	203	/* this assumes wordcount to be a multiple of 4 */
230	"mov #2, r0 \n" /* r0 = 2 */	204	asm (
231	/* correct for the "early increment" below */	205	"add %1,%1 \n" /* wordcount -> bytecount */
232	"add #-4,%2 \n" /* wbuf -= 4; */	206	"add %0,%1 \n" /* bytecount -> bufend */
233	"bra enter_loop \n" /* goto enter_loop, after next instr. */	207	"mov %0,r0 \n"
234	"add #-4,%3 \n" /* wbufend -= 4; */	208	"tst #1,r0 \n" /* 16-bit aligned ? */
235	"loop_w: \n"	209	"bt .aligned \n" /* yes, do word copy */
236	/* use read result and store, from last round */	210
237	"swap.b %0,%0 \n" /* endian_swap(tmp); */	211	".align 2 \n"
238	"mov.w %0,@(r0,%2) \n" /* wbuf[r0] = tmp; */	212	/* not 16-bit aligned */
239	"enter_loop: \n"	213	"mov #-1,r3 \n" /* prepare a bit mask for high byte */
240	"mov.w @%1,%0 \n" /* tmp = ATA_DATA; */	214	"extu.b r3,r3 \n"
241	/* keep the pipeline busy with 2 independent instructions */	215	"swap.b r3,r3 \n" /* r3 = 0x0000FF00 */
242	"add #0x04,%2 \n" /* wbuf += 4; */	216
243	"cmp/hs %3,%2 \n" /* if (wbuf < wbufend) */	217	"mov.w @%2,r2 \n" /* read first word (1st round) */
244	"swap.b %0,%0 \n" /* endian_swap(tmp); */	218	"add #-12,%1 \n" /* adjust end address for offsets */
245	"mov.w %0,@%2 \n" /* wbuf[0] = tmp; */	219	"mov.b r2,@%0 \n" /* store low byte of first word */
246	/* unrolled, do one more */	220	"bra .start4_b \n" /* jump into loop after next instr. */
247	"mov.w @%1,%0 \n" /* tmp = ATA_DATA; */	221	"add #-5,%0 \n" /* adjust for dest. offsets; now even */
248	/* use and store later, to keep pipeline busy */	222
249	"bf loop_w \n" /* goto loop_w; */	223	".loop4_b: \n" /* main loop: copy 4 words in a row */
250	"swap.b %0,%0 \n" /* endian_swap(tmp); */	224	"mov.w @%2,r2 \n" /* read first word (2+ round) */
251	"mov.w %0,@(r0,%2) \n" /* wbuf[r0] = tmp; */	225	"and r3,r1 \n" /* get high byte of fourth word (2+ round) */
252	: /* outputs */	226	"extu.b r2,r0 \n" /* get low byte of first word (2+ round) */
253	: /* inputs */	227	"or r1,r0 \n" /* combine with high byte of fourth word */
254	/* %0 */ "r"(tmp),	228	"mov.w r0,@(4,%0) \n" /* store at buf[4] */
255	/* %1 */ "r"(&ATA_DATA),	229	"nop \n" /* maintain alignment */
256	/* %2 */ "r"(wbuf),	230	".start4_b: \n"
257	/* %3 */ "r"(wbufend)	231	"mov.w @%2,r1 \n" /* read second word */
258	: /* trashed */	232	"and r3,r2 \n" /* get high byte of first word */
259	"r0"	233	"extu.b r1,r0 \n" /* get low byte of second word */
260	);	234	"or r2,r0 \n" /* combine with high byte of first word */
		235	"mov.w r0,@(6,%0) \n" /* store at buf[6] */
		236	"add #8,%0 \n" /* buf += 8 */
		237	"mov.w @%2,r2 \n" /* read third word */
		238	"and r3,r1 \n" /* get high byte of second word */
		239	"extu.b r2,r0 \n" /* get low byte of third word */
		240	"or r1,r0 \n" /* combine with high byte of second word */
		241	"mov.w r0,@%0 \n" /* store at buf[0] */
		242	"cmp/hi %0,%1 \n" /* check for end */
		243	"mov.w @%2,r1 \n" /* read fourth word */
		244	"and r3,r2 \n" /* get high byte of third word */
		245	"extu.b r1,r0 \n" /* get low byte of fourth word */
		246	"or r2,r0 \n" /* combine with high byte of third word */
		247	"mov.w r0,@(2,%0) \n" /* store at buf[2] */
		248	"bt .loop4_b \n"
		249	/* 24 instructions for 4 copies, takes 26 clock cycles */
		250	/* avg. 6.5 cycles per word - 100% faster */
		251
		252	"swap.b r1,r0 \n" /* get high byte of last word */
		253	"mov.b r0,@(4,%0) \n" /* and store it */
		254
		255	"bra .exit \n"
		256	"nop \n"
		257
		258	".align 2 \n"
		259	/* 16-bit aligned, loop(read and store word) */
		260	".aligned: \n"
		261	"mov.w @%2,r2 \n" /* read first word (1st round) */
		262	"add #-12,%1 \n" /* adjust end address for offsets */
		263	"bra .start4_w \n" /* jump into loop after next instr. */
		264	"add #-6,%0 \n" /* adjust for destination offsets */
		265
		266	".loop4_w: \n" /* main loop: copy 4 words in a row */
		267	"mov.w @%2,r2 \n" /* read first word (2+ round) */
		268	"swap.b r1,r0 \n" /* swap fourth word (2+ round) */
		269	"mov.w r0,@(4,%0) \n" /* store fourth word (2+ round) */
		270	"nop \n" /* maintain alignment */
		271	".start4_w: \n"
		272	"mov.w @%2,r1 \n" /* read second word */
		273	"swap.b r2,r0 \n" /* swap first word */
		274	"mov.w r0,@(6,%0) \n" /* store first word in buf[6] */
		275	"add #8,%0 \n" /* buf += 8 */
		276	"mov.w @%2,r2 \n" /* read third word */
		277	"swap.b r1,r0 \n" /* swap second word */
		278	"mov.w r0,@%0 \n" /* store second word in buf[0] */
		279	"cmp/hi %0,%1 \n" /* check for end */
		280	"mov.w @%2,r1 \n" /* read fourth word */
		281	"swap.b r2,r0 \n" /* swap third word */
		282	"mov.w r0,@(2,%0) \n" /* store third word */
		283	"bt .loop4_w \n"
		284	/* 16 instructions for 4 copies, takes 18 clock cycles */
		285	/* avg. 4.5 cycles per word - 144% faster */
		286
		287	"swap.b r1,r0 \n" /* swap fourth word (last round) */
		288	"mov.w r0,@(4,%0) \n" /* and store it */
		289
		290	".exit: \n"
		291	: /* outputs */
		292	: /* inputs */
		293	/* %0 */ "r"(buf),
		294	/* %1 */ "r"(wordcount),
		295	/* %2 */ "r"(&ATA_DATA)
		296	: /*trashed */
		297	"r0","r1","r2","r3"
		298	);
261	#endif	299	#endif
262	}
263	}	300	}
264		301
265	int ata_read_sectors(unsigned long start,	302	int ata_read_sectors(unsigned long start,
@@ -958,14 +995,10 @@ int ata_init(void)
958		995
959	if (coldstart)	996	if (coldstart)
960	{	997	{
961	/* Reset both master and slave, we don't yet know what's in */	998	/* This should reset both master and slave, we don't yet know what's in */
962	/* this is safe because non-present devices don't report busy */
963	ata_device = 0;	999	ata_device = 0;
964	if (ata_hard_reset())	1000	if (ata_hard_reset())
965	return -1;	1001	return -1;
966	ata_device = SELECT_DEVICE1;
967	if (ata_hard_reset())
968	return -2;
969	}	1002	}
970		1003
971	rc = master_slave_detect();	1004	rc = master_slave_detect();