1 files changed, 103 insertions, 7 deletions
diff --git a/firmware/drivers/ata.c b/firmware/drivers/ata.c
index 63e603f76f..b6292e5d5c 100644
--- a/firmware/drivers/ata.c
+++ b/firmware/drivers/ata.c
@@ -162,33 +162,129 @@ static int wait_for_end_of_transfer(void)
 }    
+/*
+0x090156A8: 0x4F22      sts.l   pr,@-r15
+0x090156AA: 0x6243      mov     r4,r2
+0x090156AC: 0x6023      mov     r2,r0
+0x090156AE: 0xC901      and     #0x01,r0
+0x090156B0: 0x2008      tst     r0,r0
+0x090156B2: 0x8911      bt      0x090156D8
+0x090156B4: 0x6153      mov     r5,r1
+0x090156B6: 0x311C      add     r1,r1
+0x090156B8: 0x6523      mov     r2,r5
+0x090156BA: 0x351C      add     r1,r5
+0x090156BC: 0xD30E      mov.l   @(0x03C,pc),r3  ; 0x090156F8 (0x06104100) 
+0x090156BE: 0x0009      nop
+0x090156C0: 0x6131      mov.w   @r3,r1
+0x090156C2: 0x611D      extu.w  r1,r1
+0x090156C4: 0x2210      mov.b   r1,@r2
+0x090156C6: 0x7201      add     #0x01,r2
+0x090156C8: 0x4119      shlr8   r1
+0x090156CA: 0x2210      mov.b   r1,@r2
+0x090156CC: 0x7201      add     #0x01,r2
+0x090156CE: 0x3252      cmp/hs  r5,r2
+0x090156D0: 0x8BF6      bf      0x090156C0
+0x090156D2: 0xA00F      bra     0x090156F4
+0x090156D4: 0x4F26      lds.l   @r15+,pr
+0x090156D6: 0x0009      nop
+0x090156D8: 0x6423      mov     r2,r4
+0x090156DA: 0x6153      mov     r5,r1
+0x090156DC: 0x311C      add     r1,r1
+0x090156DE: 0x6543      mov     r4,r5
+0x090156E0: 0x351C      add     r1,r5
+0x090156E2: 0xD205      mov.l   @(0x018,pc),r2  ; 0x090156F8 (0x06104100) 
+0x090156E4: 0x6121      mov.w   @r2,r1
+0x090156E6: 0x611F      exts.w  r1,r1
+0x090156E8: 0x6118      swap.b  r1,r1
+0x090156EA: 0x2411      mov.w   r1,@r4
+0x090156EC: 0x7402      add     #0x02,r4
+0x090156EE: 0x3452      cmp/hs  r5,r4
+0x090156F0: 0x8BF8      bf      0x090156E4
+0x090156F2: 0x4F26      lds.l   @r15+,pr
+0x090156F4: 0x000B      rts
+0x090156F6: 0x0009      nop
+0x090156F8: 0x0610      .long   0x06104100      ; 0x090156E0
+0x090156FA: 0x4100      
+*/
 /* the tight loop of ata_read_sectors(), to avoid the whole in IRAM */
 static void copy_read_sectors(unsigned char* buf,
                         int wordcount)
                         __attribute__ ((section (".icode")));
 static void copy_read_sectors(unsigned char* buf, int wordcount)
 {
-    if (wordcount <= 0)
+    unsigned short tmp = 0; /* have to init to prevent warning? */
-        return; /* should never happen, but to protect my tail loop */
-    if ( (unsigned int)buf & 1 ) 
+    if ( (unsigned int)buf & 1) 
-    {
+    {   /* not 16-bit aligned, copy byte by byte */
        unsigned char* bufend = buf + wordcount*2;
+#ifdef PREFER_C
        do
-        {   /* loop compiles to 8 assembler instructions */
+        {   /* loop compiles to 9 assembler instructions */
-            unsigned short tmp = ATA_DATA;
+            tmp = ATA_DATA;
            *buf++ = tmp & 0xff; /* I assume big endian */
            *buf++ = tmp >> 8;   /*  and don't use the SWAB16 macro */
        } while (buf < bufend); /* tail loop is faster */
+#else
+        asm (
+            "mov    #1, r0 \n"
+            "loop_b: \n"
+            "mov.w      @%1,%0 \n"
+            "mov.b      %0,@%2 \n"
+            "shlr8      %0 \n"
+            "mov.b      %0,@(r0,%2) \n"
+            "add        #0x02,%2 \n"
+            "cmp/hs     %3,%2 \n"
+            "bf     loop_b \n"
+            : /* outputs */
+            : /* inputs */
+            /* %0 */ "r"(tmp),
+            /* %1 */ "r"(&ATA_DATA),
+            /* %2 */ "r"(buf),
+            /* %3 */ "r"(bufend)
+            : /* trashed */
+            "r0"
+        );
+#endif
    }
    else 
-    {
+    {   /* 16-bit aligned, can do faster copy */
        unsigned short* wbuf = (unsigned short*)buf;
        unsigned short* wbufend = wbuf + wordcount;
+#ifdef PREFER_C
        do
        {   /* loop compiles to 7 assembler instructions */
            *wbuf = SWAB16(ATA_DATA);
        } while (++wbuf < wbufend); /* tail loop is faster */
+#else
+        asm (
+            "mov    #2, r0 \n"
+            "loop_w: \n"
+            "mov.w      @%1,%0 \n"
+            "swap.b     %0,%0 \n"
+            "mov.w      %0,@%2 \n"
+            "mov.w      @%1,%0 \n" /* unrolled, do one more */
+            "swap.b     %0,%0 \n"
+            "mov.w      %0,@(r0,%2) \n"
+            "add        #0x04,%2 \n"
+            "cmp/hs     %3,%2 \n"
+            "bf     loop_w \n"
+            : /* outputs */
+            : /* inputs */
+            /* %0 */ "r"(tmp),
+            /* %1 */ "r"(&ATA_DATA),
+            /* %2 */ "r"(wbuf),
+            /* %3 */ "r"(wbufend)
+            : /* trashed */
+            "r0"
+        );
+#endif
    }
 }

diff --git a/firmware/drivers/ata.c b/firmware/drivers/ata.c
index 63e603f76f..b6292e5d5c 100644
--- a/firmware/drivers/ata.c
+++ b/firmware/drivers/ata.c
@@ -162,33 +162,129 @@ static int wait_for_end_of_transfer(void)
162	}	162	}
163		163
164		164
		165	/*
		166	0x090156A8: 0x4F22 sts.l pr,@-r15
		167	0x090156AA: 0x6243 mov r4,r2
		168	0x090156AC: 0x6023 mov r2,r0
		169	0x090156AE: 0xC901 and #0x01,r0
		170	0x090156B0: 0x2008 tst r0,r0
		171	0x090156B2: 0x8911 bt 0x090156D8
		172	0x090156B4: 0x6153 mov r5,r1
		173	0x090156B6: 0x311C add r1,r1
		174	0x090156B8: 0x6523 mov r2,r5
		175	0x090156BA: 0x351C add r1,r5
		176	0x090156BC: 0xD30E mov.l @(0x03C,pc),r3 ; 0x090156F8 (0x06104100)
		177	0x090156BE: 0x0009 nop
		178
		179	0x090156C0: 0x6131 mov.w @r3,r1
		180	0x090156C2: 0x611D extu.w r1,r1
		181	0x090156C4: 0x2210 mov.b r1,@r2
		182	0x090156C6: 0x7201 add #0x01,r2
		183	0x090156C8: 0x4119 shlr8 r1
		184	0x090156CA: 0x2210 mov.b r1,@r2
		185	0x090156CC: 0x7201 add #0x01,r2
		186	0x090156CE: 0x3252 cmp/hs r5,r2
		187	0x090156D0: 0x8BF6 bf 0x090156C0
		188
		189	0x090156D2: 0xA00F bra 0x090156F4
		190	0x090156D4: 0x4F26 lds.l @r15+,pr
		191	0x090156D6: 0x0009 nop
		192	0x090156D8: 0x6423 mov r2,r4
		193	0x090156DA: 0x6153 mov r5,r1
		194	0x090156DC: 0x311C add r1,r1
		195	0x090156DE: 0x6543 mov r4,r5
		196	0x090156E0: 0x351C add r1,r5
		197	0x090156E2: 0xD205 mov.l @(0x018,pc),r2 ; 0x090156F8 (0x06104100)
		198
		199	0x090156E4: 0x6121 mov.w @r2,r1
		200	0x090156E6: 0x611F exts.w r1,r1
		201	0x090156E8: 0x6118 swap.b r1,r1
		202	0x090156EA: 0x2411 mov.w r1,@r4
		203	0x090156EC: 0x7402 add #0x02,r4
		204	0x090156EE: 0x3452 cmp/hs r5,r4
		205	0x090156F0: 0x8BF8 bf 0x090156E4
		206
		207	0x090156F2: 0x4F26 lds.l @r15+,pr
		208	0x090156F4: 0x000B rts
		209	0x090156F6: 0x0009 nop
		210	0x090156F8: 0x0610 .long 0x06104100 ; 0x090156E0
		211	0x090156FA: 0x4100
		212	*/
		213
		214
		215
165	/* the tight loop of ata_read_sectors(), to avoid the whole in IRAM */	216	/* the tight loop of ata_read_sectors(), to avoid the whole in IRAM */
166	static void copy_read_sectors(unsigned char* buf,	217	static void copy_read_sectors(unsigned char* buf,
167	int wordcount)	218	int wordcount)
168	__attribute__ ((section (".icode")));	219	__attribute__ ((section (".icode")));
169	static void copy_read_sectors(unsigned char* buf, int wordcount)	220	static void copy_read_sectors(unsigned char* buf, int wordcount)
170	{	221	{
171	if (wordcount <= 0)	222	unsigned short tmp = 0; /* have to init to prevent warning? */
172	return; /* should never happen, but to protect my tail loop */
173		223
174	if ( (unsigned int)buf & 1 )	224	if ( (unsigned int)buf & 1)
175	{	225	{ /* not 16-bit aligned, copy byte by byte */
176	unsigned char* bufend = buf + wordcount*2;	226	unsigned char* bufend = buf + wordcount*2;
		227	#ifdef PREFER_C
177	do	228	do
178	{ /* loop compiles to 8 assembler instructions */	229	{ /* loop compiles to 9 assembler instructions */
179	unsigned short tmp = ATA_DATA;	230	tmp = ATA_DATA;
180	*buf++ = tmp & 0xff; /* I assume big endian */	231	*buf++ = tmp & 0xff; /* I assume big endian */
181	*buf++ = tmp >> 8; /* and don't use the SWAB16 macro */	232	*buf++ = tmp >> 8; /* and don't use the SWAB16 macro */
182	} while (buf < bufend); /* tail loop is faster */	233	} while (buf < bufend); /* tail loop is faster */
		234	#else
		235	asm (
		236	"mov #1, r0 \n"
		237	"loop_b: \n"
		238	"mov.w @%1,%0 \n"
		239	"mov.b %0,@%2 \n"
		240	"shlr8 %0 \n"
		241	"mov.b %0,@(r0,%2) \n"
		242	"add #0x02,%2 \n"
		243	"cmp/hs %3,%2 \n"
		244	"bf loop_b \n"
		245	: /* outputs */
		246	: /* inputs */
		247	/* %0 */ "r"(tmp),
		248	/* %1 */ "r"(&ATA_DATA),
		249	/* %2 */ "r"(buf),
		250	/* %3 */ "r"(bufend)
		251	: /* trashed */
		252	"r0"
		253	);
		254	#endif
183	}	255	}
184	else	256	else
185	{	257	{ /* 16-bit aligned, can do faster copy */
186	unsigned short* wbuf = (unsigned short*)buf;	258	unsigned short* wbuf = (unsigned short*)buf;
187	unsigned short* wbufend = wbuf + wordcount;	259	unsigned short* wbufend = wbuf + wordcount;
		260	#ifdef PREFER_C
188	do	261	do
189	{ /* loop compiles to 7 assembler instructions */	262	{ /* loop compiles to 7 assembler instructions */
190	*wbuf = SWAB16(ATA_DATA);	263	*wbuf = SWAB16(ATA_DATA);
191	} while (++wbuf < wbufend); /* tail loop is faster */	264	} while (++wbuf < wbufend); /* tail loop is faster */
		265	#else
		266	asm (
		267	"mov #2, r0 \n"
		268	"loop_w: \n"
		269	"mov.w @%1,%0 \n"
		270	"swap.b %0,%0 \n"
		271	"mov.w %0,@%2 \n"
		272	"mov.w @%1,%0 \n" /* unrolled, do one more */
		273	"swap.b %0,%0 \n"
		274	"mov.w %0,@(r0,%2) \n"
		275	"add #0x04,%2 \n"
		276	"cmp/hs %3,%2 \n"
		277	"bf loop_w \n"
		278	: /* outputs */
		279	: /* inputs */
		280	/* %0 */ "r"(tmp),
		281	/* %1 */ "r"(&ATA_DATA),
		282	/* %2 */ "r"(wbuf),
		283	/* %3 */ "r"(wbufend)
		284	: /* trashed */
		285	"r0"
		286	);
		287	#endif
192	}	288	}
193	}	289	}
194		290