summary refs log tree commit diff
path: root/firmware/drivers
diff options
context:
space:
mode:
Diffstat (limited to 'firmware/drivers')
-rw-r--r-- firmware/drivers/ata.c 177
1 files changed, 105 insertions, 72 deletions
diff --git a/firmware/drivers/ata.c b/firmware/drivers/ata.c
index 5653466900..7641be7ec1 100644
--- a/firmware/drivers/ata.c
+++ b/firmware/drivers/ata.c
@@ -174,92 +174,129 @@ static void copy_read_sectors(unsigned char* buf,
174 __attribute__ ((section (".icode"))); 174 __attribute__ ((section (".icode")));
175static void copy_read_sectors(unsigned char* buf, int wordcount) 175static void copy_read_sectors(unsigned char* buf, int wordcount)
176{ 176{
177 unsigned short tmp = 0; /* have to init to prevent warning? */ 177#ifdef PREFER_C
178 unsigned short tmp = 0;
178 179
179 if ( (unsigned int)buf & 1) 180 if ( (unsigned int)buf & 1)
180 { /* not 16-bit aligned, copy byte by byte */ 181 { /* not 16-bit aligned, copy byte by byte */
181 unsigned char* bufend = buf + wordcount*2; 182 unsigned char* bufend = buf + wordcount*2;
182#ifdef PREFER_C
183 do 183 do
184 { /* loop compiles to 9 assembler instructions */ 184 { /* loop compiles to 9 assembler instructions */
185 /* takes 13 clock cycles because of 2 pipeline stalls */
185 tmp = ATA_DATA; 186 tmp = ATA_DATA;
186 *buf++ = tmp & 0xff; /* I assume big endian */ 187 *buf++ = tmp & 0xff; /* I assume big endian */
187 *buf++ = tmp >> 8; /* and don't use the SWAB16 macro */ 188 *buf++ = tmp >> 8; /* and don't use the SWAB16 macro */
188 } while (buf < bufend); /* tail loop is faster */ 189 } while (buf < bufend); /* tail loop is faster */
189#else
190 /* I can bring it down to 7 instructions/loop, and exploit pipeline */
191 asm (
192 "mov #1, r0 \n" /* r0 = 1; */
193 /* correct for the "early increment" below */
194 "add #-2,%2 \n" /* buf -= 2; */
195 "add #-2,%3 \n" /* bufend -= 2; */
196 "loop_b: \n"
197 "mov.w @%1,%0 \n" /* tmp = ATA_DATA; */
198 /* Now we're reading from the bus, I do something independent we
199 need later, to avoid pipeline stall */
200 "add #0x02,%2 \n" /* buf += 2; */
201 "cmp/hs %3,%2 \n" /* if (buf < bufend) */
202 /* now use the read result */
203 "mov.b %0,@%2 \n" /* buf[0] = lowbyte(tmp); */
204 "shlr8 %0 \n" /* tmp >>= 8; */
205 "mov.b %0,@(r0,%2) \n" /* buf[r0] = lowbyte(tmp); */
206 "bf loop_b \n" /* goto loop_b; */
207 : /* outputs */
208 : /* inputs */
209 /* %0 */ "r"(tmp),
210 /* %1 */ "r"(&ATA_DATA),
211 /* %2 */ "r"(buf),
212 /* %3 */ "r"(bufend)
213 : /* trashed */
214 "r0"
215 );
216#endif
217 } 190 }
218 else 191 else
219 { /* 16-bit aligned, can do faster copy */ 192 { /* 16-bit aligned, can do faster copy */
220 unsigned short* wbuf = (unsigned short*)buf; 193 unsigned short* wbuf = (unsigned short*)buf;
221 unsigned short* wbufend = wbuf + wordcount; 194 unsigned short* wbufend = wbuf + wordcount;
222#ifdef PREFER_C
223 do 195 do
224 { /* loop compiles to 7 assembler instructions */ 196 { /* loop compiles to 7 assembler instructions */
197 /* takes 11 clock cycles because of 2 pipeline stalls */
225 *wbuf = SWAB16(ATA_DATA); 198 *wbuf = SWAB16(ATA_DATA);
226 } while (++wbuf < wbufend); /* tail loop is faster */ 199 } while (++wbuf < wbufend); /* tail loop is faster */
200 }
227#else 201#else
228 /* I can bring it down to 9 instructions for 2 loops, and pipeline */ 202 /* turbo-charged assembler version */
229 asm ( 203 /* this assumes wordcount to be a multiple of 4 */
230 "mov #2, r0 \n" /* r0 = 2 */ 204 asm (
231 /* correct for the "early increment" below */ 205 "add %1,%1 \n" /* wordcount -> bytecount */
232 "add #-4,%2 \n" /* wbuf -= 4; */ 206 "add %0,%1 \n" /* bytecount -> bufend */
233 "bra enter_loop \n" /* goto enter_loop, after next instr. */ 207 "mov %0,r0 \n"
234 "add #-4,%3 \n" /* wbufend -= 4; */ 208 "tst #1,r0 \n" /* 16-bit aligned ? */
235 "loop_w: \n" 209 "bt .aligned \n" /* yes, do word copy */
236 /* use read result and store, from last round */ 210
237 "swap.b %0,%0 \n" /* endian_swap(tmp); */ 211 ".align 2 \n"
238 "mov.w %0,@(r0,%2) \n" /* wbuf[r0] = tmp; */ 212 /* not 16-bit aligned */
239 "enter_loop: \n" 213 "mov #-1,r3 \n" /* prepare a bit mask for high byte */
240 "mov.w @%1,%0 \n" /* tmp = ATA_DATA; */ 214 "extu.b r3,r3 \n"
241 /* keep the pipeline busy with 2 independent instructions */ 215 "swap.b r3,r3 \n" /* r3 = 0x0000FF00 */
242 "add #0x04,%2 \n" /* wbuf += 4; */ 216
243 "cmp/hs %3,%2 \n" /* if (wbuf < wbufend) */ 217 "mov.w @%2,r2 \n" /* read first word (1st round) */
244 "swap.b %0,%0 \n" /* endian_swap(tmp); */ 218 "add #-12,%1 \n" /* adjust end address for offsets */
245 "mov.w %0,@%2 \n" /* wbuf[0] = tmp; */ 219 "mov.b r2,@%0 \n" /* store low byte of first word */
246 /* unrolled, do one more */ 220 "bra .start4_b \n" /* jump into loop after next instr. */
247 "mov.w @%1,%0 \n" /* tmp = ATA_DATA; */ 221 "add #-5,%0 \n" /* adjust for dest. offsets; now even */
248 /* use and store later, to keep pipeline busy */ 222
249 "bf loop_w \n" /* goto loop_w; */ 223 ".loop4_b: \n" /* main loop: copy 4 words in a row */
250 "swap.b %0,%0 \n" /* endian_swap(tmp); */ 224 "mov.w @%2,r2 \n" /* read first word (2+ round) */
251 "mov.w %0,@(r0,%2) \n" /* wbuf[r0] = tmp; */ 225 "and r3,r1 \n" /* get high byte of fourth word (2+ round) */
252 : /* outputs */ 226 "extu.b r2,r0 \n" /* get low byte of first word (2+ round) */
253 : /* inputs */ 227 "or r1,r0 \n" /* combine with high byte of fourth word */
254 /* %0 */ "r"(tmp), 228 "mov.w r0,@(4,%0) \n" /* store at buf[4] */
255 /* %1 */ "r"(&ATA_DATA), 229 "nop \n" /* maintain alignment */
256 /* %2 */ "r"(wbuf), 230 ".start4_b: \n"
257 /* %3 */ "r"(wbufend) 231 "mov.w @%2,r1 \n" /* read second word */
258 : /* trashed */ 232 "and r3,r2 \n" /* get high byte of first word */
259 "r0" 233 "extu.b r1,r0 \n" /* get low byte of second word */
260 ); 234 "or r2,r0 \n" /* combine with high byte of first word */
235 "mov.w r0,@(6,%0) \n" /* store at buf[6] */
236 "add #8,%0 \n" /* buf += 8 */
237 "mov.w @%2,r2 \n" /* read third word */
238 "and r3,r1 \n" /* get high byte of second word */
239 "extu.b r2,r0 \n" /* get low byte of third word */
240 "or r1,r0 \n" /* combine with high byte of second word */
241 "mov.w r0,@%0 \n" /* store at buf[0] */
242 "cmp/hi %0,%1 \n" /* check for end */
243 "mov.w @%2,r1 \n" /* read fourth word */
244 "and r3,r2 \n" /* get high byte of third word */
245 "extu.b r1,r0 \n" /* get low byte of fourth word */
246 "or r2,r0 \n" /* combine with high byte of third word */
247 "mov.w r0,@(2,%0) \n" /* store at buf[2] */
248 "bt .loop4_b \n"
249 /* 24 instructions for 4 copies, takes 26 clock cycles */
250 /* avg. 6.5 cycles per word - 100% faster */
251
252 "swap.b r1,r0 \n" /* get high byte of last word */
253 "mov.b r0,@(4,%0) \n" /* and store it */
254
255 "bra .exit \n"
256 "nop \n"
257
258 ".align 2 \n"
259 /* 16-bit aligned, loop(read and store word) */
260 ".aligned: \n"
261 "mov.w @%2,r2 \n" /* read first word (1st round) */
262 "add #-12,%1 \n" /* adjust end address for offsets */
263 "bra .start4_w \n" /* jump into loop after next instr. */
264 "add #-6,%0 \n" /* adjust for destination offsets */
265
266 ".loop4_w: \n" /* main loop: copy 4 words in a row */
267 "mov.w @%2,r2 \n" /* read first word (2+ round) */
268 "swap.b r1,r0 \n" /* swap fourth word (2+ round) */
269 "mov.w r0,@(4,%0) \n" /* store fourth word (2+ round) */
270 "nop \n" /* maintain alignment */
271 ".start4_w: \n"
272 "mov.w @%2,r1 \n" /* read second word */
273 "swap.b r2,r0 \n" /* swap first word */
274 "mov.w r0,@(6,%0) \n" /* store first word in buf[6] */
275 "add #8,%0 \n" /* buf += 8 */
276 "mov.w @%2,r2 \n" /* read third word */
277 "swap.b r1,r0 \n" /* swap second word */
278 "mov.w r0,@%0 \n" /* store second word in buf[0] */
279 "cmp/hi %0,%1 \n" /* check for end */
280 "mov.w @%2,r1 \n" /* read fourth word */
281 "swap.b r2,r0 \n" /* swap third word */
282 "mov.w r0,@(2,%0) \n" /* store third word */
283 "bt .loop4_w \n"
284 /* 16 instructions for 4 copies, takes 18 clock cycles */
285 /* avg. 4.5 cycles per word - 144% faster */
286
287 "swap.b r1,r0 \n" /* swap fourth word (last round) */
288 "mov.w r0,@(4,%0) \n" /* and store it */
289
290 ".exit: \n"
291 : /* outputs */
292 : /* inputs */
293 /* %0 */ "r"(buf),
294 /* %1 */ "r"(wordcount),
295 /* %2 */ "r"(&ATA_DATA)
296 : /*trashed */
297 "r0","r1","r2","r3"
298 );
261#endif 299#endif
262 }
263} 300}
264 301
265int ata_read_sectors(unsigned long start, 302int ata_read_sectors(unsigned long start,
@@ -958,14 +995,10 @@ int ata_init(void)
958 995
959 if (coldstart) 996 if (coldstart)
960 { 997 {
961 /* Reset both master and slave, we don't yet know what's in */ 998 /* This should reset both master and slave, we don't yet know what's in */
962 /* this is safe because non-present devices don't report busy */
963 ata_device = 0; 999 ata_device = 0;
964 if (ata_hard_reset()) 1000 if (ata_hard_reset())
965 return -1; 1001 return -1;
966 ata_device = SELECT_DEVICE1;
967 if (ata_hard_reset())
968 return -2;
969 } 1002 }
970 1003
971 rc = master_slave_detect(); 1004 rc = master_slave_detect();