diff options
author | Jörg Hohensohn <hohensoh@rockbox.org> | 2004-03-10 14:15:14 +0000 |
---|---|---|
committer | Jörg Hohensohn <hohensoh@rockbox.org> | 2004-03-10 14:15:14 +0000 |
commit | 5fb1e1024fb58d3704261545d63586f1e617f199 (patch) | |
tree | 1ccb23d1a86725272e375f6b154fab742226b5f6 /firmware/drivers | |
parent | dcdb89ca9f5f0c6787be7c072b9476421f87e869 (diff) | |
download | rockbox-5fb1e1024fb58d3704261545d63586f1e617f199.tar.gz rockbox-5fb1e1024fb58d3704261545d63586f1e617f199.zip |
- no second ata_hard_reset() call, saves ~2.5 seconds flash boot time (please test)
- Jens' new assembler code in copy_read_sectors(), but still disabled
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@4358 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'firmware/drivers')
-rw-r--r-- | firmware/drivers/ata.c | 177 |
1 files changed, 105 insertions, 72 deletions
diff --git a/firmware/drivers/ata.c b/firmware/drivers/ata.c
index 5653466900..7641be7ec1 100644
--- a/firmware/drivers/ata.c
+++ b/firmware/drivers/ata.c
@@ -174,92 +174,129 @@ static void copy_read_sectors(unsigned char* buf, | |||
174 | __attribute__ ((section (".icode"))); | 174 | __attribute__ ((section (".icode"))); |
175 | static void copy_read_sectors(unsigned char* buf, int wordcount) | 175 | static void copy_read_sectors(unsigned char* buf, int wordcount) |
176 | { | 176 | { |
177 | unsigned short tmp = 0; /* have to init to prevent warning? */ | 177 | #ifdef PREFER_C |
178 | unsigned short tmp = 0; | ||
178 | 179 | ||
179 | if ( (unsigned int)buf & 1) | 180 | if ( (unsigned int)buf & 1) |
180 | { /* not 16-bit aligned, copy byte by byte */ | 181 | { /* not 16-bit aligned, copy byte by byte */ |
181 | unsigned char* bufend = buf + wordcount*2; | 182 | unsigned char* bufend = buf + wordcount*2; |
182 | #ifdef PREFER_C | ||
183 | do | 183 | do |
184 | { /* loop compiles to 9 assembler instructions */ | 184 | { /* loop compiles to 9 assembler instructions */ |
185 | /* takes 13 clock cycles because of 2 pipeline stalls */ | ||
185 | tmp = ATA_DATA; | 186 | tmp = ATA_DATA; |
186 | *buf++ = tmp & 0xff; /* I assume big endian */ | 187 | *buf++ = tmp & 0xff; /* I assume big endian */ |
187 | *buf++ = tmp >> 8; /* and don't use the SWAB16 macro */ | 188 | *buf++ = tmp >> 8; /* and don't use the SWAB16 macro */ |
188 | } while (buf < bufend); /* tail loop is faster */ | 189 | } while (buf < bufend); /* tail loop is faster */ |
189 | #else | ||
190 | /* I can bring it down to 7 instructions/loop, and exploit pipeline */ | ||
191 | asm ( | ||
192 | "mov #1, r0 \n" /* r0 = 1; */ | ||
193 | /* correct for the "early increment" below */ | ||
194 | "add #-2,%2 \n" /* buf -= 2; */ | ||
195 | "add #-2,%3 \n" /* bufend -= 2; */ | ||
196 | "loop_b: \n" | ||
197 | "mov.w @%1,%0 \n" /* tmp = ATA_DATA; */ | ||
198 | /* Now we're reading from the bus, I do something independent we | ||
199 | need later, to avoid pipeline stall */ | ||
200 | "add #0x02,%2 \n" /* buf += 2; */ | ||
201 | "cmp/hs %3,%2 \n" /* if (buf < bufend) */ | ||
202 | /* now use the read result */ | ||
203 | "mov.b %0,@%2 \n" /* buf[0] = lowbyte(tmp); */ | ||
204 | "shlr8 %0 \n" /* tmp >>= 8; */ | ||
205 | "mov.b %0,@(r0,%2) \n" /* buf[r0] = lowbyte(tmp); */ | ||
206 | "bf loop_b \n" /* goto loop_b; */ | ||
207 | : /* outputs */ | ||
208 | : /* inputs */ | ||
209 | /* %0 */ "r"(tmp), | ||
210 | /* %1 */ "r"(&ATA_DATA), | ||
211 | /* %2 */ "r"(buf), | ||
212 | /* %3 */ "r"(bufend) | ||
213 | : /* trashed */ | ||
214 | "r0" | ||
215 | ); | ||
216 | #endif | ||
217 | } | 190 | } |
218 | else | 191 | else |
219 | { /* 16-bit aligned, can do faster copy */ | 192 | { /* 16-bit aligned, can do faster copy */ |
220 | unsigned short* wbuf = (unsigned short*)buf; | 193 | unsigned short* wbuf = (unsigned short*)buf; |
221 | unsigned short* wbufend = wbuf + wordcount; | 194 | unsigned short* wbufend = wbuf + wordcount; |
222 | #ifdef PREFER_C | ||
223 | do | 195 | do |
224 | { /* loop compiles to 7 assembler instructions */ | 196 | { /* loop compiles to 7 assembler instructions */ |
197 | /* takes 11 clock cycles because of 2 pipeline stalls */ | ||
225 | *wbuf = SWAB16(ATA_DATA); | 198 | *wbuf = SWAB16(ATA_DATA); |
226 | } while (++wbuf < wbufend); /* tail loop is faster */ | 199 | } while (++wbuf < wbufend); /* tail loop is faster */ |
200 | } | ||
227 | #else | 201 | #else |
228 | /* I can bring it down to 9 instructions for 2 loops, and pipeline */ | 202 | /* turbo-charged assembler version */ |
229 | asm ( | 203 | /* this assumes wordcount to be a multiple of 4 */ |
230 | "mov #2, r0 \n" /* r0 = 2 */ | 204 | asm ( |
231 | /* correct for the "early increment" below */ | 205 | "add %1,%1 \n" /* wordcount -> bytecount */ |
232 | "add #-4,%2 \n" /* wbuf -= 4; */ | 206 | "add %0,%1 \n" /* bytecount -> bufend */ |
233 | "bra enter_loop \n" /* goto enter_loop, after next instr. */ | 207 | "mov %0,r0 \n" |
234 | "add #-4,%3 \n" /* wbufend -= 4; */ | 208 | "tst #1,r0 \n" /* 16-bit aligned ? */ |
235 | "loop_w: \n" | 209 | "bt .aligned \n" /* yes, do word copy */ |
236 | /* use read result and store, from last round */ | 210 | |
237 | "swap.b %0,%0 \n" /* endian_swap(tmp); */ | 211 | ".align 2 \n" |
238 | "mov.w %0,@(r0,%2) \n" /* wbuf[r0] = tmp; */ | 212 | /* not 16-bit aligned */ |
239 | "enter_loop: \n" | 213 | "mov #-1,r3 \n" /* prepare a bit mask for high byte */ |
240 | "mov.w @%1,%0 \n" /* tmp = ATA_DATA; */ | 214 | "extu.b r3,r3 \n" |
241 | /* keep the pipeline busy with 2 independent instructions */ | 215 | "swap.b r3,r3 \n" /* r3 = 0x0000FF00 */ |
242 | "add #0x04,%2 \n" /* wbuf += 4; */ | 216 | |
243 | "cmp/hs %3,%2 \n" /* if (wbuf < wbufend) */ | 217 | "mov.w @%2,r2 \n" /* read first word (1st round) */ |
244 | "swap.b %0,%0 \n" /* endian_swap(tmp); */ | 218 | "add #-12,%1 \n" /* adjust end address for offsets */ |
245 | "mov.w %0,@%2 \n" /* wbuf[0] = tmp; */ | 219 | "mov.b r2,@%0 \n" /* store low byte of first word */ |
246 | /* unrolled, do one more */ | 220 | "bra .start4_b \n" /* jump into loop after next instr. */ |
247 | "mov.w @%1,%0 \n" /* tmp = ATA_DATA; */ | 221 | "add #-5,%0 \n" /* adjust for dest. offsets; now even */ |
248 | /* use and store later, to keep pipeline busy */ | 222 | |
249 | "bf loop_w \n" /* goto loop_w; */ | 223 | ".loop4_b: \n" /* main loop: copy 4 words in a row */ |
250 | "swap.b %0,%0 \n" /* endian_swap(tmp); */ | 224 | "mov.w @%2,r2 \n" /* read first word (2+ round) */ |
251 | "mov.w %0,@(r0,%2) \n" /* wbuf[r0] = tmp; */ | 225 | "and r3,r1 \n" /* get high byte of fourth word (2+ round) */ |
252 | : /* outputs */ | 226 | "extu.b r2,r0 \n" /* get low byte of first word (2+ round) */ |
253 | : /* inputs */ | 227 | "or r1,r0 \n" /* combine with high byte of fourth word */ |
254 | /* %0 */ "r"(tmp), | 228 | "mov.w r0,@(4,%0) \n" /* store at buf[4] */ |
255 | /* %1 */ "r"(&ATA_DATA), | 229 | "nop \n" /* maintain alignment */ |
256 | /* %2 */ "r"(wbuf), | 230 | ".start4_b: \n" |
257 | /* %3 */ "r"(wbufend) | 231 | "mov.w @%2,r1 \n" /* read second word */ |
258 | : /* trashed */ | 232 | "and r3,r2 \n" /* get high byte of first word */ |
259 | "r0" | 233 | "extu.b r1,r0 \n" /* get low byte of second word */ |
260 | ); | 234 | "or r2,r0 \n" /* combine with high byte of first word */ |
235 | "mov.w r0,@(6,%0) \n" /* store at buf[6] */ | ||
236 | "add #8,%0 \n" /* buf += 8 */ | ||
237 | "mov.w @%2,r2 \n" /* read third word */ | ||
238 | "and r3,r1 \n" /* get high byte of second word */ | ||
239 | "extu.b r2,r0 \n" /* get low byte of third word */ | ||
240 | "or r1,r0 \n" /* combine with high byte of second word */ | ||
241 | "mov.w r0,@%0 \n" /* store at buf[0] */ | ||
242 | "cmp/hi %0,%1 \n" /* check for end */ | ||
243 | "mov.w @%2,r1 \n" /* read fourth word */ | ||
244 | "and r3,r2 \n" /* get high byte of third word */ | ||
245 | "extu.b r1,r0 \n" /* get low byte of fourth word */ | ||
246 | "or r2,r0 \n" /* combine with high byte of third word */ | ||
247 | "mov.w r0,@(2,%0) \n" /* store at buf[2] */ | ||
248 | "bt .loop4_b \n" | ||
249 | /* 24 instructions for 4 copies, takes 26 clock cycles */ | ||
250 | /* avg. 6.5 cycles per word - 100% faster */ | ||
251 | |||
252 | "swap.b r1,r0 \n" /* get high byte of last word */ | ||
253 | "mov.b r0,@(4,%0) \n" /* and store it */ | ||
254 | |||
255 | "bra .exit \n" | ||
256 | "nop \n" | ||
257 | |||
258 | ".align 2 \n" | ||
259 | /* 16-bit aligned, loop(read and store word) */ | ||
260 | ".aligned: \n" | ||
261 | "mov.w @%2,r2 \n" /* read first word (1st round) */ | ||
262 | "add #-12,%1 \n" /* adjust end address for offsets */ | ||
263 | "bra .start4_w \n" /* jump into loop after next instr. */ | ||
264 | "add #-6,%0 \n" /* adjust for destination offsets */ | ||
265 | |||
266 | ".loop4_w: \n" /* main loop: copy 4 words in a row */ | ||
267 | "mov.w @%2,r2 \n" /* read first word (2+ round) */ | ||
268 | "swap.b r1,r0 \n" /* swap fourth word (2+ round) */ | ||
269 | "mov.w r0,@(4,%0) \n" /* store fourth word (2+ round) */ | ||
270 | "nop \n" /* maintain alignment */ | ||
271 | ".start4_w: \n" | ||
272 | "mov.w @%2,r1 \n" /* read second word */ | ||
273 | "swap.b r2,r0 \n" /* swap first word */ | ||
274 | "mov.w r0,@(6,%0) \n" /* store first word in buf[6] */ | ||
275 | "add #8,%0 \n" /* buf += 8 */ | ||
276 | "mov.w @%2,r2 \n" /* read third word */ | ||
277 | "swap.b r1,r0 \n" /* swap second word */ | ||
278 | "mov.w r0,@%0 \n" /* store second word in buf[0] */ | ||
279 | "cmp/hi %0,%1 \n" /* check for end */ | ||
280 | "mov.w @%2,r1 \n" /* read fourth word */ | ||
281 | "swap.b r2,r0 \n" /* swap third word */ | ||
282 | "mov.w r0,@(2,%0) \n" /* store third word */ | ||
283 | "bt .loop4_w \n" | ||
284 | /* 16 instructions for 4 copies, takes 18 clock cycles */ | ||
285 | /* avg. 4.5 cycles per word - 144% faster */ | ||
286 | |||
287 | "swap.b r1,r0 \n" /* swap fourth word (last round) */ | ||
288 | "mov.w r0,@(4,%0) \n" /* and store it */ | ||
289 | |||
290 | ".exit: \n" | ||
291 | : /* outputs */ | ||
292 | : /* inputs */ | ||
293 | /* %0 */ "r"(buf), | ||
294 | /* %1 */ "r"(wordcount), | ||
295 | /* %2 */ "r"(&ATA_DATA) | ||
296 | : /*trashed */ | ||
297 | "r0","r1","r2","r3" | ||
298 | ); | ||
261 | #endif | 299 | #endif |
262 | } | ||
263 | } | 300 | } |
264 | 301 | ||
265 | int ata_read_sectors(unsigned long start, | 302 | int ata_read_sectors(unsigned long start, |
@@ -958,14 +995,10 @@ int ata_init(void) | |||
958 | 995 | ||
959 | if (coldstart) | 996 | if (coldstart) |
960 | { | 997 | { |
961 | /* Reset both master and slave, we don't yet know what's in */ | 998 | /* This should reset both master and slave, we don't yet know what's in */ |
962 | /* this is safe because non-present devices don't report busy */ | ||
963 | ata_device = 0; | 999 | ata_device = 0; |
964 | if (ata_hard_reset()) | 1000 | if (ata_hard_reset()) |
965 | return -1; | 1001 | return -1; |
966 | ata_device = SELECT_DEVICE1; | ||
967 | if (ata_hard_reset()) | ||
968 | return -2; | ||
969 | } | 1002 | } |
970 | 1003 | ||
971 | rc = master_slave_detect(); | 1004 | rc = master_slave_detect(); |