diff options
Diffstat (limited to 'apps/recorder')
-rw-r--r-- | apps/recorder/jpeg_load.c | 144 |
1 file changed, 141 insertions, 3 deletions
diff --git a/apps/recorder/jpeg_load.c b/apps/recorder/jpeg_load.c index de5c95eab7..f5e0ea2cf3 100644 --- a/apps/recorder/jpeg_load.c +++ b/apps/recorder/jpeg_load.c | |||
@@ -253,6 +253,11 @@ INLINE unsigned range_limit(int value) | |||
253 | #define BUFAC 227 | 253 | #define BUFAC 227 |
254 | #define COMPONENT_SHIFT 15 | 254 | #define COMPONENT_SHIFT 15 |
255 | 255 | ||
256 | /* Some of the below have inline ASM optimizations of the loop contents. To | ||
257 | make comparison with the C versions easier, the C variable names are used | ||
258 | in comments whenever intermediate values are labeled. | ||
259 | */ | ||
260 | |||
256 | /* horizontal-pass 1-point IDCT */ | 261 | /* horizontal-pass 1-point IDCT */ |
257 | static void idct1h(int16_t *ws, unsigned char *out, int rows, int rowstep) | 262 | static void idct1h(int16_t *ws, unsigned char *out, int rows, int rowstep) |
258 | { | 263 | { |
@@ -303,6 +308,64 @@ static void idct4v(int16_t *ws, int cols) | |||
303 | int col; | 308 | int col; |
304 | for (col = 0; col < cols; col++, ws++) | 309 | for (col = 0; col < cols; col++, ws++) |
305 | { | 310 | { |
311 | #if defined(CPU_ARM) | ||
312 | int t0, t1, t2, t3, t4; | ||
313 | #if ARM_ARCH <= 4 | ||
314 | int t5; | ||
315 | #endif | ||
316 | asm volatile( | ||
317 | "ldrsh %[t4], [%[ws]]\n\t" /* t4 = tmp0 (ws[8*0]) */ | ||
318 | "ldrsh %[t1], [%[ws], #32]\n\t" /* t1 = tmp2 (ws[8*2]) */ | ||
319 | "ldrsh %[t2], [%[ws], #16]\n\t" /* t2 = z2 (ws[8*1]) */ | ||
320 | "add %[t0], %[t4], %[t1]\n\t" /* t0 = tmp10 >> 2 | ||
321 | (tmp0 + tmp2) */ | ||
322 | "sub %[t1], %[t4], %[t1]\n\t" /* t1 = tmp12 >> 2 | ||
323 | (tmp0 - tmp2) */ | ||
324 | "ldrsh %[t3], [%[ws], #48]\n\t" /* t3 = z3 (ws[8*3]) */ | ||
325 | "add %[t4], %[t2], %[t3]\n\t" /* t4 = z2 + z3 */ | ||
326 | #if ARM_ARCH > 4 | ||
327 | "smulbb %[t4], %[c1], %[t4]\n\t" | ||
328 | "add %[t4], %[t4], #1024\n\t" /* t4 = z1 */ | ||
329 | "smlatb %[t3], %[c2c3], %[t3], %[t4]\n\t" | ||
330 | "smlabb %[t2], %[c2c3], %[t2], %[t4]\n\t" | ||
331 | "mov %[t3], %[t3], asr #11\n\t" /* t3 = tmp0 */ | ||
332 | "mov %[t2], %[t2], asr #11\n\t" /* t2 = tmp2 */ | ||
333 | #else | ||
334 | "add %[t5], %[t4], %[t4], lsl #3\n\t" | ||
335 | "rsb %[t4], %[t4], %[t5], lsl #4\n\t" | ||
336 | "rsb %[t4], %[t4], %[t4], lsl #5\n\t" | ||
337 | "add %[t4], %[t4], #1024\n\t" /*z1*/ | ||
338 | "mla %[t3], %[c2], %[t3], %[t4]\n\t" | ||
339 | "mla %[t2], %[c3], %[t2], %[t4]\n\t" | ||
340 | "mov %[t3], %[t3], asr #11\n\t" /* t3 = tmp0 */ | ||
341 | "mov %[t2], %[t2], asr #11\n\t" /* t2 = tmp2 */ | ||
342 | #endif | ||
343 | "add %[t4], %[t2], %[t0], lsl #2\n\t" /* t4 = tmp10 + tmp2 */ | ||
344 | "rsb %[t0], %[t2], %[t0], lsl #2\n\t" /* t0 = tmp10 - tmp2 */ | ||
345 | "add %[t2], %[t3], %[t1], lsl #2\n\t" /* t2 = tmp12 + tmp0 */ | ||
346 | "rsb %[t3], %[t3], %[t1], lsl #2\n\t" /* t3 = tmp12 - tmp0 */ | ||
347 | "strh %[t4], [%[ws]]\n\t" | ||
348 | "strh %[t0], [%[ws], #48]\n\t" | ||
349 | "strh %[t2], [%[ws], #16]\n\t" | ||
350 | "strh %[t3], [%[ws], #32]\n\t" | ||
351 | : [t0] "=&r" (t0), | ||
352 | [t1] "=&r" (t1), | ||
353 | [t2] "=&r" (t2), | ||
354 | [t3] "=&r" (t3), | ||
355 | [t4] "=&r" (t4) | ||
356 | #if ARM_ARCH <= 4 | ||
357 | ,[t5] "=&r" (t5) | ||
358 | #endif | ||
359 | : [ws] "r" (ws), | ||
360 | #if ARM_ARCH > 4 | ||
361 | [c1] "r" (FIX_0_541196100), | ||
362 | [c2c3] "r" (((-FIX_1_847759065)<<16)|FIX_0_765366865) | ||
363 | #else | ||
364 | [c2] "r" (-FIX_1_847759065), | ||
365 | [c3] "r" (FIX_0_765366865) | ||
366 | #endif | ||
367 | ); | ||
368 | #else | ||
306 | int tmp0, tmp2, tmp10, tmp12; | 369 | int tmp0, tmp2, tmp10, tmp12; |
307 | int z1, z2, z3; | 370 | int z1, z2, z3; |
308 | /* Even part */ | 371 | /* Even part */ |
@@ -332,17 +395,91 @@ static void idct4v(int16_t *ws, int cols) | |||
332 | ws[8*3] = (int) (tmp10 - tmp2); | 395 | ws[8*3] = (int) (tmp10 - tmp2); |
333 | ws[8*1] = (int) (tmp12 + tmp0); | 396 | ws[8*1] = (int) (tmp12 + tmp0); |
334 | ws[8*2] = (int) (tmp12 - tmp0); | 397 | ws[8*2] = (int) (tmp12 - tmp0); |
398 | #endif | ||
335 | } | 399 | } |
336 | } | 400 | } |
337 | 401 | ||
338 | /* horizontal-pass 4-point IDCT */ | 402 | /* horizontal-pass 4-point IDCT */ |
339 | static void idct4h(int16_t *ws, unsigned char *out, int rows, int rowstep) | 403 | static void idct4h(int16_t *ws, unsigned char *out, int rows, int rowstep) |
340 | { | 404 | { |
341 | int tmp0, tmp2, tmp10, tmp12; | ||
342 | int z1, z2, z3; | ||
343 | int row; | 405 | int row; |
344 | for (row = 0; row < rows; row++, out += rowstep, ws += 8) | 406 | for (row = 0; row < rows; row++, out += rowstep, ws += 8) |
345 | { | 407 | { |
408 | #if defined(CPU_ARM) | ||
409 | int t0, t1, t2, t3, t4; | ||
410 | #if ARM_ARCH <= 4 | ||
411 | int t5; | ||
412 | #endif | ||
413 | asm volatile( | ||
414 | "ldrsh %[t4], [%[ws]]\n\t" /* t4 = tmp0 (ws[0]) */ | ||
415 | "ldrsh %[t1], [%[ws], #4]\n\t" /* t1 = tmp2 (ws[2]) */ | ||
416 | "add %[t4], %[t4], #16\n\t" /* add rounding to DC */ | ||
417 | "add %[t4], %[t4], #4096\n\t" /* pre-add offset */ | ||
418 | "ldrsh %[t2], [%[ws], #2]\n\t" /* t2 = z2 (ws[1]) */ | ||
419 | "add %[t0], %[t4], %[t1]\n\t" /* t0 = tmp10 >> 13 | ||
420 | (tmp0 + tmp2) */ | ||
421 | "sub %[t1], %[t4], %[t1]\n\t" /* t1 = tmp12 >> 13 | ||
422 | (tmp0 - tmp2) */ | ||
423 | "ldrsh %[t3], [%[ws], #6]\n\t" /* t3 = z3 (ws[3]) */ | ||
424 | "add %[t4], %[t2], %[t3]\n\t" /* t4 = z2 + z3 */ | ||
425 | #if ARM_ARCH > 4 | ||
426 | "smulbb %[t4], %[c1], %[t4]\n\t" | ||
427 | "smlatb %[t3], %[c2c3], %[t3], %[t4]\n\t" | ||
428 | "smlabb %[t2], %[c2c3], %[t2], %[t4]\n\t" | ||
429 | #else | ||
430 | "add %[t5], %[t4], %[t4], lsl #3\n\t" | ||
431 | "rsb %[t4], %[t4], %[t5], lsl #4\n\t" | ||
432 | "rsb %[t4], %[t4], %[t4], lsl #5\n\t" /* t4 = z1 */ | ||
433 | "mla %[t3], %[c2], %[t3], %[t4]\n\t" | ||
434 | "mla %[t2], %[c3], %[t2], %[t4]\n\t" | ||
435 | #endif | ||
436 | "add %[t4], %[t2], %[t0], lsl #13\n\t" /* t4 = tmp10 + tmp2 */ | ||
437 | "rsb %[t0], %[t2], %[t0], lsl #13\n\t" /* t0 = tmp10 - tmp2 */ | ||
438 | "add %[t2], %[t3], %[t1], lsl #13\n\t" /* t2 = tmp12 + tmp0 */ | ||
439 | "rsb %[t3], %[t3], %[t1], lsl #13\n\t" /* t3 = tmp12 - tmp0 */ | ||
440 | "mov %[t4], %[t4], asr #18\n\t" /* descale results */ | ||
441 | "mov %[t0], %[t0], asr #18\n\t" | ||
442 | "mov %[t2], %[t2], asr #18\n\t" | ||
443 | "mov %[t3], %[t3], asr #18\n\t" | ||
444 | "cmp %[t4], #255\n\t" /* range limit results */ | ||
445 | "mvnhi %[t4], %[t4], asr #31\n\t" | ||
446 | "cmp %[t0], #255\n\t" | ||
447 | "mvnhi %[t0], %[t0], asr #31\n\t" | ||
448 | "cmp %[t2], #255\n\t" | ||
449 | "mvnhi %[t2], %[t2], asr #31\n\t" | ||
450 | "cmp %[t3], #255\n\t" | ||
451 | "mvnhi %[t3], %[t3], asr #31\n\t" | ||
452 | "cmp %[t4], #255\n\t" | ||
453 | "mvnhi %[t4], %[t4], asr #31\n\t" | ||
454 | "strb %[t4], [%[out]]\n\t" | ||
455 | "strb %[t0], [%[out], %[o3]]\n\t" | ||
456 | "strb %[t2], [%[out], %[o1]]\n\t" | ||
457 | "strb %[t3], [%[out], %[o2]]\n\t" | ||
458 | : [t0] "=&r" (t0), | ||
459 | [t1] "=&r" (t1), | ||
460 | [t2] "=&r" (t2), | ||
461 | [t3] "=&r" (t3), | ||
462 | [t4] "=&r" (t4) | ||
463 | #if ARM_ARCH <= 4 | ||
464 | |||
465 | ,[t5] "=&r" (t5) | ||
466 | #endif | ||
467 | : [ws] "r" (ws), | ||
468 | [out] "r" (out), | ||
469 | [o1] "i" (JPEG_PIX_SZ), | ||
470 | [o2] "i" (JPEG_PIX_SZ*2), | ||
471 | [o3] "i" (JPEG_PIX_SZ*3), | ||
472 | #if ARM_ARCH > 4 | ||
473 | [c1] "r" (FIX_0_541196100), | ||
474 | [c2c3] "r" (((-FIX_1_847759065)<<16)|FIX_0_765366865) | ||
475 | #else | ||
476 | [c2] "r" (-FIX_1_847759065), | ||
477 | [c3] "r" (FIX_0_765366865) | ||
478 | #endif | ||
479 | ); | ||
480 | #else | ||
481 | int tmp0, tmp2, tmp10, tmp12; | ||
482 | int z1, z2, z3; | ||
346 | /* Even part */ | 483 | /* Even part */ |
347 | 484 | ||
348 | tmp0 = (int) ws[0] + (ONE << (PASS1_BITS + 2) | 485 | tmp0 = (int) ws[0] + (ONE << (PASS1_BITS + 2) |
@@ -359,7 +496,7 @@ static void idct4h(int16_t *ws, unsigned char *out, int rows, int rowstep) | |||
359 | z3 = (int) ws[3]; | 496 | z3 = (int) ws[3]; |
360 | 497 | ||
361 | z1 = MULTIPLY16(z2 + z3, FIX_0_541196100); | 498 | z1 = MULTIPLY16(z2 + z3, FIX_0_541196100); |
362 | tmp0 = z1 + MULTIPLY16(z3, - FIX_1_847759065); | 499 | tmp0 = z1 - MULTIPLY16(z3, FIX_1_847759065); |
363 | tmp2 = z1 + MULTIPLY16(z2, FIX_0_765366865); | 500 | tmp2 = z1 + MULTIPLY16(z2, FIX_0_765366865); |
364 | 501 | ||
365 | /* Final output stage */ | 502 | /* Final output stage */ |
@@ -372,6 +509,7 @@ static void idct4h(int16_t *ws, unsigned char *out, int rows, int rowstep) | |||
372 | DS_OUT)); | 509 | DS_OUT)); |
373 | out[JPEG_PIX_SZ*2] = range_limit((int) RIGHT_SHIFT(tmp12 - tmp0, | 510 | out[JPEG_PIX_SZ*2] = range_limit((int) RIGHT_SHIFT(tmp12 - tmp0, |
374 | DS_OUT)); | 511 | DS_OUT)); |
512 | #endif | ||
375 | } | 513 | } |
376 | } | 514 | } |
377 | 515 | ||