Mandelbrot plugin: New fixed point routines, loosing a tiny bit of precision (3 bits), but way faster than before. Full precision routine uses the EMAC on coldfire. No more 64 bit arithmetics used except within the asm inlines.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@7636 a1c6a512-1295-4272-9138-f99709370657
author: Jens Arnold <amiconn@rockbox.org> 2005-10-16 22:24:00 +0000
committer: Jens Arnold <amiconn@rockbox.org> 2005-10-16 22:24:00 +0000
commit: 69ffcd5356cde94ec83417455081b701b7a6b633 (patch)
tree: 39f9c0f8d75784e6ee4aa18301cd9f7e50354b2a
parent: 0293dba5d28c336d2438660f194bd6b9a2f00a90 (diff)
download: rockbox-69ffcd5356cde94ec83417455081b701b7a6b633.tar.gz
rockbox-69ffcd5356cde94ec83417455081b701b7a6b633.zip
1 files changed, 182 insertions, 154 deletions
diff --git a/apps/plugins/mandelbrot.c b/apps/plugins/mandelbrot.c
index 8a3c14a194..1f80ff395f 100644
--- a/apps/plugins/mandelbrot.c
+++ b/apps/plugins/mandelbrot.c
@@ -55,15 +55,15 @@
 static struct plugin_api* rb;
 static char buff[32];
-/* Fixed point format: 6 bits integer part incl. sign, 58 bits fractional part */
+/* Fixed point format: 6 bits integer part incl. sign, 26 bits fractional part */
-static long long x_min;
+static long x_min;
-static long long x_max;
+static long x_max;
-static long long x_step;
+static long x_step;
-static long long x_delta;
+static long x_delta;
-static long long y_min;
+static long y_min;
-static long long y_max;
+static long y_max;
-static long long y_step;
+static long y_step;
-static long long y_delta;
+static long y_delta;
 static int px_min = 0;
 static int px_max = LCD_WIDTH;
@@ -78,117 +78,148 @@ static unsigned int gbuf_size = 0;
 static unsigned char graybuffer[LCD_HEIGHT];   
 #if CONFIG_CPU == SH7034
-long long mul64(long long f1, long long f2);
+#define MULS16_ASR10(a, b) muls16_asr10(a, b)
-asm (
+static inline short muls16_asr10(short a, short b)
-    /* 64bit * 64bit -> 64bit multiplication. Works for both signed and unsigned */
+{
-    ".global _mul64      \n"  /* Notation: abcd * efgh, where each letter */
+    short r;
-    ".type   _mul64,@function\n" /* represents 16 bits. Called with: */
+    asm (
-"_mul64:                 \n"  /* r4 = ab, r5 = cd, r6 = ef, r7 = gh */
+        "muls    %[a],%[b]   \n"
-    "swap.w  r4,r2       \n"  /* r2 = ba */
+        "sts     macl,%[r]   \n"
-    "mulu    r2,r7       \n"  /* a * h */
+        "shlr8   %[r]        \n"
-    "swap.w  r6,r3       \n"  /* r3 = fe */
+        "shlr2   %[r]        \n"
-    "sts     macl,r0     \n"  /* r0 = a * h */
+        : /* outputs */
-    "mulu    r5,r3       \n"  /* d * e */
+        [r]"=r"(r)
-    "swap.w  r7,r3       \n"  /* r3 = hg */
+        : /* inputs */
-    "sts     macl,r1     \n"  /* r1 = d * e */
+        [a]"r"(a),
-    "mulu    r4,r3       \n"  /* b * g */
+        [b]"r"(b)
-    "add     r1,r0       \n"  /* r0 += r1 */
+    );
-    "swap.w  r5,r2       \n"  /* r2 = dc */
+    return r;
-    "sts     macl,r1     \n"  /* r1 = b * g */
+}
-    "mulu    r2,r6       \n"  /* c * f */
-    "add     r1,r0       \n"  /* r0 += r1 */
+#define MULS32_ASR26(a, b) muls32_asr26(a, b)
-    "sts     macl,r1     \n"  /* r1 = c * f */
+static inline long muls32_asr26(long a, long b)
-    "mulu    r4,r7       \n"  /* b * h */
+{
-    "add     r1,r0       \n"  /* r0 += r1 */
+    long r, t1, t2, t3;
-    "shll16  r0          \n"  /* r0 <<= 16 */
+    asm (
-    "sts     macl,r1     \n"  /* r1 = b * h */
+        /* Signed 32bit * 32bit -> 64bit multiplication.
-    "mulu    r2,r3       \n"  /* c * g */
+           Notation: xxab * xxcd, where each letter represents 16 bits.
-    "add     r1,r0       \n"  /* r0 += r1 */
+           xx is the 64 bit sign extension.  */
-    "sts     macl,r1     \n"  /* r1 = c * g */
+        "swap.w  %[a],%[t1]  \n" /* t1 = ba */
-    "mulu    r5,r6       \n"  /* d * f */
+        "mulu    %[t1],%[b]  \n" /* a * d */
-    "add     r1,r0       \n"  /* r0 += r1 */
+        "swap.w  %[b],%[t3]  \n" /* t3 = dc */
-    "sts     macl,r1     \n"  /* r1 = d * f */
+        "sts     macl,%[t2]  \n" /* t2 = a * d */
-    "mulu    r2,r7       \n"  /* c * h */
+        "mulu    %[t1],%[t3] \n" /* a * c */
-    "add     r1,r0       \n"  /* r0 += r1 */
+        "sts     macl,%[r]   \n" /* hi = a * c */
-    "sts     macl,r2     \n"  /* r2 = c * h */
+        "mulu    %[a],%[t3]  \n" /* b * c */
-    "mulu    r5,r3       \n"  /* d * g */
+        "clrt                \n"
-    "clrt                \n"
+        "sts     macl,%[t3]  \n" /* t3 = b * c */
-    "sts     macl,r3     \n"  /* r3 = d * g */
+        "addc    %[t2],%[t3] \n" /* t3 += t2, carry -> t2 */
-    "addc    r2,r3       \n"  /* r3 += r2, carry->r2 */
+        "movt    %[t2]       \n"
-    "movt    r2          \n"
+        "mulu    %[a],%[b]   \n" /* b * d */
-    "mulu    r5,r7       \n"  /* d * h */
+        "mov     %[t3],%[t1] \n" /* t2t3 <<= 16 */
-    "mov     r3,r1       \n"  /* r2r3 <<= 16 */
+        "xtrct   %[t2],%[t1] \n"
-    "xtrct   r2,r1       \n"
+        "mov     %[t1],%[t2] \n"
-    "mov     r1,r2       \n"
+        "shll16  %[t3]       \n"
-    "shll16  r3          \n"
+        "sts     macl,%[t1]  \n" /* lo = b * d */
-    "sts     macl,r1     \n"  /* r1 = d * h */
+        "clrt                \n" /* hi.lo += t2t3 */
-    "clrt                \n"  /* r0r1 += r2r3 */
+        "addc    %[t3],%[t1] \n"
-    "addc    r3,r1       \n"
+        "addc    %[t2],%[r]  \n"
-    "rts                 \n"
+        "cmp/pz  %[a]        \n" /* ab >= 0 ? */
-    "addc    r2,r0       \n"
+        "bt      1f          \n"
-);
+        "sub     %[b],%[r]   \n" /* no: hi -= cd (sign extension of ab is -1) */
-#define MUL64(a, b) mul64(a, b)
+    "1:                      \n"
+        "cmp/pz  %[b]        \n" /* cd >= 0 ? */
+        "bt      2f          \n"
+        "sub     %[a],%[r]   \n" /* no: hi -= ab (sign extension of cd is -1) */
+    "2:                      \n"
+        /* Shift right by 26 and return low 32 bits */
+        "shll2   %[r]        \n" /* hi <<= 6 */
+        "shll2   %[r]        \n"
+        "shll2   %[r]        \n"
+        "shlr16  %[t1]       \n" /* (unsigned)lo >>= 26 */
+        "shlr8   %[t1]       \n"
+        "shlr2   %[t1]       \n"
+        "or      %[t1],%[r]  \n" /* combine result */
+        : /* outputs */
+        [r] "=&r"(r),
+        [t1]"=&r"(t1),
+        [t2]"=&r"(t2),
+        [t3]"=&r"(t3)
+        : /* inputs */
+        [a] "r"  (a),
+        [b] "r"  (b)
+    );
+    return r;
+}
 #elif defined CPU_COLDFIRE
-long long mul64(long long f1, long long f2);
+#define MULS16_ASR10(a, b) muls16_asr10(a, b)
-asm (
+static inline short muls16_asr10(short a, short b)
-    /* 64bit * 64bit -> 64bit multiplication. Works for both signed and unsigned */
+{
-    ".section .text,\"ax\",@progbits\n"
+    asm (
-    ".global mul64       \n"    /* Notation: abcd * efgh, where each letter */
+        "muls.w  %[a],%[b]   \n"
-    ".type   mul64,@function\n" /* represents 16 bits. */
+        "asr.l   #8,%[b]     \n"
-"mul64:                  \n"
+        "asr.l   #2,%[b]     \n"
-    "lea.l   (-16,%sp),%sp   \n"
+        : /* outputs */
-    "movem.l %d2-%d5,(%sp)   \n"
+        [b]"+d"(b)
+        : /* inputs */
-    "movem.l (20,%sp),%d0-%d3\n" /* %d0%d1 = abcd, %d2%d3 = efgh */
+        [a]"d" (a)
-    "mulu.l  %d3,%d0     \n"  /* %d0 = ab * gh */
+    );
-    "mulu.l  %d1,%d2     \n"  /* %d2 = cd * ef */
+    return b;
-    "add.l   %d2,%d0     \n"  /* %d0 += %d2 */
+}
-    "move.l  %d1,%d4     \n"  
-    "swap    %d4         \n"  /* %d4 = dc */
+/* Needs the EMAC initialised to fractional mode w/o rounding and saturation */
-    "move.l  %d3,%d5     \n"
+#define MULS32_INIT() coldfire_set_macsr(EMAC_FRACTIONAL)
-    "swap    %d5         \n"  /* %d5 = hg */
+#define MULS32_ASR26(a, b) muls32_asr26(a, b)
-    "move.l  %d4,%d2     \n"
+static inline long muls32_asr26(long a, long b)
-    "mulu.w  %d5,%d2     \n"  /* %d2 = c * g */
+{
-    "add.l   %d2,%d0     \n"  /* %d0 += %d2 */
+    long r, t1;
-    "mulu.w  %d3,%d4     \n"  /* %d4 = c * h */
+    asm (
-    "mulu.w  %d1,%d5     \n"  /* %d5 = d * h */
+        "mac.l   %[a],%[b],%%acc0\n" /* multiply */
-    "add.l   %d4,%d5     \n"  /* %d5 += %d4 */
+        "mulu.l  %[a],%[b]       \n" /* get lower half */
-    "subx.l  %d4,%d4     \n"
+        "movclr.l %%acc0,%[r]    \n" /* get higher half */
-    "neg.l   %d4         \n"  /* carry -> %d4 */
+        "asl.l   #5,%[r]         \n" /* hi <<= 5, plus one free */
-    "swap    %d4         \n"
+        "moveq.l #26,%[t1]       \n"
-    "swap    %d5         \n" 
+        "lsr.l   %[t1],%[b]      \n" /* (unsigned)lo >>= 26 */
-    "move.w  %d5,%d4     \n"
+        "or.l    %[b],%[r]       \n" /* combine result */
-    "clr.w   %d5         \n"  /* %d4%d5 <<= 16 */
+        : /* outputs */
-    "mulu.w  %d3,%d1     \n"  /* %d1 = d * h */
+        [r]"=&d"(r),
-    "add.l   %d5,%d1     \n"
+        [t1]"=&d"(t1),
-    "addx.l  %d4,%d0     \n"  /* %d0%d1 += %d4%d5 */
+        [b] "+d" (b)
+        : /* inputs */
-    "movem.l (%sp),%d2-%d5   \n"
+        [a] "d"  (a)
-    "lea.l   (16,%sp),%sp\n"
+    );
-    "rts                 \n"
+    return r;
-);
+}
-#define MUL64(a, b) mul64(a, b)
+#endif /* CPU */
-#else
-#define MUL64(a, b) ((a)*(b))
+/* default macros */
+#ifndef MULS16_ASR10 
+#define MULS16_ASR10(a, b) ((short)(((long)(a) * (long)(b)) >> 10))
+#endif
+#ifndef MULS32_ASR26
+#define MULS32_ASR26(a, b) ((long)(((long long)(a) * (long long)(b)) >> 26))
+#endif
+#ifndef MULS32_INIT
+#define MULS32_INIT()
 #endif
-int ilog2_fp(long long value) /* calculate integer log2(value_fp_6.58) */
+int ilog2_fp(long value) /* calculate integer log2(value_fp_6.26) */
 {
    int i = 0;
    if (value <= 0) {
        return -32767;
-    } else if (value > (1LL<<58)) {
+    } else if (value > (1L<<26)) {
-        while (value >= (2LL<<58)) {
+        while (value >= (2L<<26)) {
            value >>= 1;
            i++;
        }
    } else {
-        while (value < (1LL<<58)) {
+        while (value < (1L<<26)) {
            value <<= 1;
            i--;
        }
@@ -199,59 +230,57 @@ int ilog2_fp(long long value) /* calculate integer log2(value_fp_6.58) */
 void recalc_parameters(void)
 {
    x_step = (x_max - x_min) / LCD_WIDTH;
-    x_delta = MUL64(x_step, (LCD_WIDTH/8));
+    x_delta = x_step * (LCD_WIDTH/8);
    y_step = (y_max - y_min) / LCD_HEIGHT;
-    y_delta = MUL64(y_step, (LCD_HEIGHT/8));
+    y_delta = y_step * (LCD_HEIGHT/8);
-    step_log2 = MIN(ilog2_fp(x_step), ilog2_fp(y_step));
+    step_log2 = ilog2_fp(MIN(x_step, y_step));
-    max_iter = MAX(10, -15 * step_log2 - 50);
+    max_iter = MAX(15, -15 * step_log2 - 45);
 }
 void init_mandelbrot_set(void)
 {
 #if CONFIG_LCD == LCD_SSD1815 /* Recorder, Ondio. */
-    x_min = -38LL<<54;  // -2.375<<58
+    x_min = -38L<<22;  // -2.375<<26
-    x_max =  15LL<<54;  //  0.9375<<58
+    x_max =  15L<<22;  //  0.9375<<26
 #else  /* Iriver H1x0 */
-    x_min = -36LL<<54;  // -2.25<<58
+    x_min = -36L<<22;  // -2.25<<26
-    x_max =  12LL<<54;  //  0.75<<58
+    x_max =  12L<<22;  //  0.75<<26
 #endif
-    y_min = -19LL<<54;  // -1.1875<<58
+    y_min = -19L<<22;  // -1.1875<<26
-    y_max =  19LL<<54;  //  1.1875<<58
+    y_max =  19L<<22;  //  1.1875<<26
    recalc_parameters();
 }
-void calc_mandelbrot_32(void)
+void calc_mandelbrot_low_prec(void)
 {
    long start_tick, last_yield;
    unsigned n_iter;
-    long long a64, b64;
+    long a32, b32;
-    long x, x2, y, y2, a, b;
+    short x, x2, y, y2, a, b;
    int p_x, p_y;
    int brightness;
    start_tick = last_yield = *rb->current_tick;
    
-    for (p_x = 0, a64 = x_min; p_x <= px_max; p_x++, a64 += x_step) {
+    for (p_x = 0, a32 = x_min; p_x < px_max; p_x++, a32 += x_step) {
        if (p_x < px_min)
            continue;
-        a = a64 >> 32;
+        a = a32 >> 16;
-        for (p_y = LCD_HEIGHT-1, b64 = y_min; p_y >= py_min; p_y--, b64 += y_step) {
+        for (p_y = LCD_HEIGHT-1, b32 = y_min; p_y >= py_min; p_y--, b32 += y_step) {
            if (p_y >= py_max)
                continue;
-            b = b64 >> 32;
+            b = b32 >> 16;
-            x = 0;
+            x = a;
-            y = 0;
+            y = b;
            n_iter = 0;
            while (++n_iter <= max_iter) {
-                x >>= 13;
+                x2 = MULS16_ASR10(x, x);
-                y >>= 13;
+                y2 = MULS16_ASR10(y, y);
-                x2 = x * x;
-                y2 = y * y;
+                if (x2 + y2 > (4<<10)) break;
-                
-                if (x2 + y2 > (4<<26)) break;
+                y = 2 * MULS16_ASR10(x, y) + b;
-                
-                y = 2 * x * y + b;
                x = x2 - y2 + a;
            }
@@ -261,7 +290,7 @@ void calc_mandelbrot_32(void)
            } else {
                brightness = 255 - (32 * (n_iter & 7));
            }
-            graybuffer[p_y]=brightness;
+            graybuffer[p_y] = brightness;
            /* be nice to other threads:
             * if at least one tick has passed, yield */
            if  (*rb->current_tick > last_yield) {
@@ -274,35 +303,34 @@ void calc_mandelbrot_32(void)
    }
 }
-void calc_mandelbrot_64(void)
+void calc_mandelbrot_high_prec(void)
 {
    long start_tick, last_yield;
    unsigned n_iter;
-    long long x, x2, y, y2, a, b;
+    long x, x2, y, y2, a, b;
    int p_x, p_y;
    int brightness;
-    start_tick = last_yield = *rb->current_tick;
    
+    MULS32_INIT();
+    start_tick = last_yield = *rb->current_tick;
    for (p_x = 0, a = x_min; p_x < px_max; p_x++, a += x_step) {
        if (p_x < px_min)
            continue;
        for (p_y = LCD_HEIGHT-1, b = y_min; p_y >= py_min; p_y--, b += y_step) {
            if (p_y >= py_max)
                continue;
-            x = 0;
+            x = a;
-            y = 0;
+            y = b;
            n_iter = 0;
-            while (++n_iter<=max_iter) {
+            while (++n_iter <= max_iter) {
-                x >>= 29;
+                x2 = MULS32_ASR26(x, x);
-                y >>= 29;
+                y2 = MULS32_ASR26(y, y);
-                x2 = MUL64(x, x);
-                y2 = MUL64(y, y);
+                if (x2 + y2 > (4L<<26)) break;
-                
-                if (x2 + y2 > (4LL<<58)) break;
                
-                y = 2 * MUL64(x, y) + b;
+                y = 2 * MULS32_ASR26(x, y) + b;
                x = x2 - y2 + a;
            }
@@ -312,10 +340,10 @@ void calc_mandelbrot_64(void)
            } else {
                brightness = 255 - (32 * (n_iter & 7));
            }
-            graybuffer[p_y]=brightness;
+            graybuffer[p_y] = brightness;
            /* be nice to other threads:
             * if at least one tick has passed, yield */
-            if  (*rb->current_tick > last_yield){
+            if  (*rb->current_tick > last_yield) {
                rb->yield();
                last_yield = *rb->current_tick;
            }
@@ -375,10 +403,10 @@ enum plugin_status plugin_start(struct plugin_api* api, void* parameter)
            if (redraw == REDRAW_FULL)
                gray_ub_clear_display();
-            if (step_log2 <= -13) /* select precision */
+            if (step_log2 <= -10) /* select precision */
-                calc_mandelbrot_64();
+                calc_mandelbrot_high_prec();
            else
-                calc_mandelbrot_32();
+                calc_mandelbrot_low_prec();
 #if !defined(SIMULATOR) && defined(HAVE_ADJUSTABLE_CPU_FREQ)
            rb->cpu_boost(false);
author	Jens Arnold <amiconn@rockbox.org>	2005-10-16 22:24:00 +0000
committer	Jens Arnold <amiconn@rockbox.org>	2005-10-16 22:24:00 +0000
commit	69ffcd5356cde94ec83417455081b701b7a6b633 (patch)
tree	39f9c0f8d75784e6ee4aa18301cd9f7e50354b2a
parent	0293dba5d28c336d2438660f194bd6b9a2f00a90 (diff)
download	rockbox-69ffcd5356cde94ec83417455081b701b7a6b633.tar.gz rockbox-69ffcd5356cde94ec83417455081b701b7a6b633.zip

diff --git a/apps/plugins/mandelbrot.c b/apps/plugins/mandelbrot.c index 8a3c14a194..1f80ff395f 100644 --- a/apps/plugins/mandelbrot.c +++ b/apps/plugins/mandelbrot.c
@@ -55,15 +55,15 @@
55	static struct plugin_api* rb;	55	static struct plugin_api* rb;
56	static char buff[32];	56	static char buff[32];
57		57
58	/* Fixed point format: 6 bits integer part incl. sign, 58 bits fractional part */	58	/* Fixed point format: 6 bits integer part incl. sign, 26 bits fractional part */
59	static long long x_min;	59	static long x_min;
60	static long long x_max;	60	static long x_max;
61	static long long x_step;	61	static long x_step;
62	static long long x_delta;	62	static long x_delta;
63	static long long y_min;	63	static long y_min;
64	static long long y_max;	64	static long y_max;
65	static long long y_step;	65	static long y_step;
66	static long long y_delta;	66	static long y_delta;
67		67
68	static int px_min = 0;	68	static int px_min = 0;
69	static int px_max = LCD_WIDTH;	69	static int px_max = LCD_WIDTH;
@@ -78,117 +78,148 @@ static unsigned int gbuf_size = 0;
78	static unsigned char graybuffer[LCD_HEIGHT];	78	static unsigned char graybuffer[LCD_HEIGHT];
79		79
80	#if CONFIG_CPU == SH7034	80	#if CONFIG_CPU == SH7034
81	long long mul64(long long f1, long long f2);	81
82		82	#define MULS16_ASR10(a, b) muls16_asr10(a, b)
83	asm (	83	static inline short muls16_asr10(short a, short b)
84	/* 64bit * 64bit -> 64bit multiplication. Works for both signed and unsigned */	84	{
85	".global _mul64 \n" /* Notation: abcd * efgh, where each letter */	85	short r;
86	".type _mul64,@function\n" /* represents 16 bits. Called with: */	86	asm (
87	"_mul64: \n" /* r4 = ab, r5 = cd, r6 = ef, r7 = gh */	87	"muls %[a],%[b] \n"
88	"swap.w r4,r2 \n" /* r2 = ba */	88	"sts macl,%[r] \n"
89	"mulu r2,r7 \n" /* a * h */	89	"shlr8 %[r] \n"
90	"swap.w r6,r3 \n" /* r3 = fe */	90	"shlr2 %[r] \n"
91	"sts macl,r0 \n" /* r0 = a * h */	91	: /* outputs */
92	"mulu r5,r3 \n" /* d * e */	92	[r]"=r"(r)
93	"swap.w r7,r3 \n" /* r3 = hg */	93	: /* inputs */
94	"sts macl,r1 \n" /* r1 = d * e */	94	[a]"r"(a),
95	"mulu r4,r3 \n" /* b * g */	95	[b]"r"(b)
96	"add r1,r0 \n" /* r0 += r1 */	96	);
97	"swap.w r5,r2 \n" /* r2 = dc */	97	return r;
98	"sts macl,r1 \n" /* r1 = b * g */	98	}
99	"mulu r2,r6 \n" /* c * f */	99
100	"add r1,r0 \n" /* r0 += r1 */	100	#define MULS32_ASR26(a, b) muls32_asr26(a, b)
101	"sts macl,r1 \n" /* r1 = c * f */	101	static inline long muls32_asr26(long a, long b)
102	"mulu r4,r7 \n" /* b * h */	102	{
103	"add r1,r0 \n" /* r0 += r1 */	103	long r, t1, t2, t3;
104	"shll16 r0 \n" /* r0 <<= 16 */	104	asm (
105	"sts macl,r1 \n" /* r1 = b * h */	105	/* Signed 32bit * 32bit -> 64bit multiplication.
106	"mulu r2,r3 \n" /* c * g */	106	Notation: xxab * xxcd, where each letter represents 16 bits.
107	"add r1,r0 \n" /* r0 += r1 */	107	xx is the 64 bit sign extension. */
108	"sts macl,r1 \n" /* r1 = c * g */	108	"swap.w %[a],%[t1] \n" /* t1 = ba */
109	"mulu r5,r6 \n" /* d * f */	109	"mulu %[t1],%[b] \n" /* a * d */
110	"add r1,r0 \n" /* r0 += r1 */	110	"swap.w %[b],%[t3] \n" /* t3 = dc */
111	"sts macl,r1 \n" /* r1 = d * f */	111	"sts macl,%[t2] \n" /* t2 = a * d */
112	"mulu r2,r7 \n" /* c * h */	112	"mulu %[t1],%[t3] \n" /* a * c */
113	"add r1,r0 \n" /* r0 += r1 */	113	"sts macl,%[r] \n" /* hi = a * c */
114	"sts macl,r2 \n" /* r2 = c * h */	114	"mulu %[a],%[t3] \n" /* b * c */
115	"mulu r5,r3 \n" /* d * g */	115	"clrt \n"
116	"clrt \n"	116	"sts macl,%[t3] \n" /* t3 = b * c */
117	"sts macl,r3 \n" /* r3 = d * g */	117	"addc %[t2],%[t3] \n" /* t3 += t2, carry -> t2 */
118	"addc r2,r3 \n" /* r3 += r2, carry->r2 */	118	"movt %[t2] \n"
119	"movt r2 \n"	119	"mulu %[a],%[b] \n" /* b * d */
120	"mulu r5,r7 \n" /* d * h */	120	"mov %[t3],%[t1] \n" /* t2t3 <<= 16 */
121	"mov r3,r1 \n" /* r2r3 <<= 16 */	121	"xtrct %[t2],%[t1] \n"
122	"xtrct r2,r1 \n"	122	"mov %[t1],%[t2] \n"
123	"mov r1,r2 \n"	123	"shll16 %[t3] \n"
124	"shll16 r3 \n"	124	"sts macl,%[t1] \n" /* lo = b * d */
125	"sts macl,r1 \n" /* r1 = d * h */	125	"clrt \n" /* hi.lo += t2t3 */
126	"clrt \n" /* r0r1 += r2r3 */	126	"addc %[t3],%[t1] \n"
127	"addc r3,r1 \n"	127	"addc %[t2],%[r] \n"
128	"rts \n"	128	"cmp/pz %[a] \n" /* ab >= 0 ? */
129	"addc r2,r0 \n"	129	"bt 1f \n"
130	);	130	"sub %[b],%[r] \n" /* no: hi -= cd (sign extension of ab is -1) */
131	#define MUL64(a, b) mul64(a, b)	131	"1: \n"
		132	"cmp/pz %[b] \n" /* cd >= 0 ? */
		133	"bt 2f \n"
		134	"sub %[a],%[r] \n" /* no: hi -= ab (sign extension of cd is -1) */
		135	"2: \n"
		136	/* Shift right by 26 and return low 32 bits */
		137	"shll2 %[r] \n" /* hi <<= 6 */
		138	"shll2 %[r] \n"
		139	"shll2 %[r] \n"
		140	"shlr16 %[t1] \n" /* (unsigned)lo >>= 26 */
		141	"shlr8 %[t1] \n"
		142	"shlr2 %[t1] \n"
		143	"or %[t1],%[r] \n" /* combine result */
		144	: /* outputs */
		145	[r] "=&r"(r),
		146	[t1]"=&r"(t1),
		147	[t2]"=&r"(t2),
		148	[t3]"=&r"(t3)
		149	: /* inputs */
		150	[a] "r" (a),
		151	[b] "r" (b)
		152	);
		153	return r;
		154	}
132		155
133	#elif defined CPU_COLDFIRE	156	#elif defined CPU_COLDFIRE
134	long long mul64(long long f1, long long f2);	157
135		158	#define MULS16_ASR10(a, b) muls16_asr10(a, b)
136	asm (	159	static inline short muls16_asr10(short a, short b)
137	/* 64bit * 64bit -> 64bit multiplication. Works for both signed and unsigned */	160	{
138	".section .text,\"ax\",@progbits\n"	161	asm (
139	".global mul64 \n" /* Notation: abcd * efgh, where each letter */	162	"muls.w %[a],%[b] \n"
140	".type mul64,@function\n" /* represents 16 bits. */	163	"asr.l #8,%[b] \n"
141	"mul64: \n"	164	"asr.l #2,%[b] \n"
142	"lea.l (-16,%sp),%sp \n"	165	: /* outputs */
143	"movem.l %d2-%d5,(%sp) \n"	166	[b]"+d"(b)
144		167	: /* inputs */
145	"movem.l (20,%sp),%d0-%d3\n" /* %d0%d1 = abcd, %d2%d3 = efgh */	168	[a]"d" (a)
146	"mulu.l %d3,%d0 \n" /* %d0 = ab * gh */	169	);
147	"mulu.l %d1,%d2 \n" /* %d2 = cd * ef */	170	return b;
148	"add.l %d2,%d0 \n" /* %d0 += %d2 */	171	}
149	"move.l %d1,%d4 \n"	172
150	"swap %d4 \n" /* %d4 = dc */	173	/* Needs the EMAC initialised to fractional mode w/o rounding and saturation */
151	"move.l %d3,%d5 \n"	174	#define MULS32_INIT() coldfire_set_macsr(EMAC_FRACTIONAL)
152	"swap %d5 \n" /* %d5 = hg */	175	#define MULS32_ASR26(a, b) muls32_asr26(a, b)
153	"move.l %d4,%d2 \n"	176	static inline long muls32_asr26(long a, long b)
154	"mulu.w %d5,%d2 \n" /* %d2 = c * g */	177	{
155	"add.l %d2,%d0 \n" /* %d0 += %d2 */	178	long r, t1;
156	"mulu.w %d3,%d4 \n" /* %d4 = c * h */	179	asm (
157	"mulu.w %d1,%d5 \n" /* %d5 = d * h */	180	"mac.l %[a],%[b],%%acc0\n" /* multiply */
158	"add.l %d4,%d5 \n" /* %d5 += %d4 */	181	"mulu.l %[a],%[b] \n" /* get lower half */
159	"subx.l %d4,%d4 \n"	182	"movclr.l %%acc0,%[r] \n" /* get higher half */
160	"neg.l %d4 \n" /* carry -> %d4 */	183	"asl.l #5,%[r] \n" /* hi <<= 5, plus one free */
161	"swap %d4 \n"	184	"moveq.l #26,%[t1] \n"
162	"swap %d5 \n"	185	"lsr.l %[t1],%[b] \n" /* (unsigned)lo >>= 26 */
163	"move.w %d5,%d4 \n"	186	"or.l %[b],%[r] \n" /* combine result */
164	"clr.w %d5 \n" /* %d4%d5 <<= 16 */	187	: /* outputs */
165	"mulu.w %d3,%d1 \n" /* %d1 = d * h */	188	[r]"=&d"(r),
166	"add.l %d5,%d1 \n"	189	[t1]"=&d"(t1),
167	"addx.l %d4,%d0 \n" /* %d0%d1 += %d4%d5 */	190	[b] "+d" (b)
168		191	: /* inputs */
169	"movem.l (%sp),%d2-%d5 \n"	192	[a] "d" (a)
170	"lea.l (16,%sp),%sp\n"	193	);
171	"rts \n"	194	return r;
172	);	195	}
173	#define MUL64(a, b) mul64(a, b)	196
174		197	#endif /* CPU */
175	#else	198
176	#define MUL64(a, b) ((a)*(b))	199	/* default macros */
		200	#ifndef MULS16_ASR10
		201	#define MULS16_ASR10(a, b) ((short)(((long)(a) * (long)(b)) >> 10))
		202	#endif
		203	#ifndef MULS32_ASR26
		204	#define MULS32_ASR26(a, b) ((long)(((long long)(a) * (long long)(b)) >> 26))
		205	#endif
		206	#ifndef MULS32_INIT
		207	#define MULS32_INIT()
177	#endif	208	#endif
178		209
179	int ilog2_fp(long long value) /* calculate integer log2(value_fp_6.58) */	210	int ilog2_fp(long value) /* calculate integer log2(value_fp_6.26) */
180	{	211	{
181	int i = 0;	212	int i = 0;
182		213
183	if (value <= 0) {	214	if (value <= 0) {
184	return -32767;	215	return -32767;
185	} else if (value > (1LL<<58)) {	216	} else if (value > (1L<<26)) {
186	while (value >= (2LL<<58)) {	217	while (value >= (2L<<26)) {
187	value >>= 1;	218	value >>= 1;
188	i++;	219	i++;
189	}	220	}
190	} else {	221	} else {
191	while (value < (1LL<<58)) {	222	while (value < (1L<<26)) {
192	value <<= 1;	223	value <<= 1;
193	i--;	224	i--;
194	}	225	}
@@ -199,59 +230,57 @@ int ilog2_fp(long long value) /* calculate integer log2(value_fp_6.58) */
199	void recalc_parameters(void)	230	void recalc_parameters(void)
200	{	231	{
201	x_step = (x_max - x_min) / LCD_WIDTH;	232	x_step = (x_max - x_min) / LCD_WIDTH;
202	x_delta = MUL64(x_step, (LCD_WIDTH/8));	233	x_delta = x_step * (LCD_WIDTH/8);
203	y_step = (y_max - y_min) / LCD_HEIGHT;	234	y_step = (y_max - y_min) / LCD_HEIGHT;
204	y_delta = MUL64(y_step, (LCD_HEIGHT/8));	235	y_delta = y_step * (LCD_HEIGHT/8);
205	step_log2 = MIN(ilog2_fp(x_step), ilog2_fp(y_step));	236	step_log2 = ilog2_fp(MIN(x_step, y_step));
206	max_iter = MAX(10, -15 * step_log2 - 50);	237	max_iter = MAX(15, -15 * step_log2 - 45);
207	}	238	}
208		239
209	void init_mandelbrot_set(void)	240	void init_mandelbrot_set(void)
210	{	241	{
211	#if CONFIG_LCD == LCD_SSD1815 /* Recorder, Ondio. */	242	#if CONFIG_LCD == LCD_SSD1815 /* Recorder, Ondio. */
212	x_min = -38LL<<54; // -2.375<<58	243	x_min = -38L<<22; // -2.375<<26
213	x_max = 15LL<<54; // 0.9375<<58	244	x_max = 15L<<22; // 0.9375<<26
214	#else /* Iriver H1x0 */	245	#else /* Iriver H1x0 */
215	x_min = -36LL<<54; // -2.25<<58	246	x_min = -36L<<22; // -2.25<<26
216	x_max = 12LL<<54; // 0.75<<58	247	x_max = 12L<<22; // 0.75<<26
217	#endif	248	#endif
218	y_min = -19LL<<54; // -1.1875<<58	249	y_min = -19L<<22; // -1.1875<<26
219	y_max = 19LL<<54; // 1.1875<<58	250	y_max = 19L<<22; // 1.1875<<26
220	recalc_parameters();	251	recalc_parameters();
221	}	252	}
222		253
223	void calc_mandelbrot_32(void)	254	void calc_mandelbrot_low_prec(void)
224	{	255	{
225	long start_tick, last_yield;	256	long start_tick, last_yield;
226	unsigned n_iter;	257	unsigned n_iter;
227	long long a64, b64;	258	long a32, b32;
228	long x, x2, y, y2, a, b;	259	short x, x2, y, y2, a, b;
229	int p_x, p_y;	260	int p_x, p_y;
230	int brightness;	261	int brightness;
231		262
232	start_tick = last_yield = *rb->current_tick;	263	start_tick = last_yield = *rb->current_tick;
233		264
234	for (p_x = 0, a64 = x_min; p_x <= px_max; p_x++, a64 += x_step) {	265	for (p_x = 0, a32 = x_min; p_x < px_max; p_x++, a32 += x_step) {
235	if (p_x < px_min)	266	if (p_x < px_min)
236	continue;	267	continue;
237	a = a64 >> 32;	268	a = a32 >> 16;
238	for (p_y = LCD_HEIGHT-1, b64 = y_min; p_y >= py_min; p_y--, b64 += y_step) {	269	for (p_y = LCD_HEIGHT-1, b32 = y_min; p_y >= py_min; p_y--, b32 += y_step) {
239	if (p_y >= py_max)	270	if (p_y >= py_max)
240	continue;	271	continue;
241	b = b64 >> 32;	272	b = b32 >> 16;
242	x = 0;	273	x = a;
243	y = 0;	274	y = b;
244	n_iter = 0;	275	n_iter = 0;
245		276
246	while (++n_iter <= max_iter) {	277	while (++n_iter <= max_iter) {
247	x >>= 13;	278	x2 = MULS16_ASR10(x, x);
248	y >>= 13;	279	y2 = MULS16_ASR10(y, y);
249	x2 = x * x;	280
250	y2 = y * y;	281	if (x2 + y2 > (4<<10)) break;
251		282
252	if (x2 + y2 > (4<<26)) break;	283	y = 2 * MULS16_ASR10(x, y) + b;
253
254	y = 2 * x * y + b;
255	x = x2 - y2 + a;	284	x = x2 - y2 + a;
256	}	285	}
257		286
@@ -261,7 +290,7 @@ void calc_mandelbrot_32(void)
261	} else {	290	} else {
262	brightness = 255 - (32 * (n_iter & 7));	291	brightness = 255 - (32 * (n_iter & 7));
263	}	292	}
264	graybuffer[p_y]=brightness;	293	graybuffer[p_y] = brightness;
265	/* be nice to other threads:	294	/* be nice to other threads:
266	* if at least one tick has passed, yield */	295	* if at least one tick has passed, yield */
267	if (*rb->current_tick > last_yield) {	296	if (*rb->current_tick > last_yield) {
@@ -274,35 +303,34 @@ void calc_mandelbrot_32(void)
274	}	303	}
275	}	304	}
276		305
277	void calc_mandelbrot_64(void)	306	void calc_mandelbrot_high_prec(void)
278	{	307	{
279	long start_tick, last_yield;	308	long start_tick, last_yield;
280	unsigned n_iter;	309	unsigned n_iter;
281	long long x, x2, y, y2, a, b;	310	long x, x2, y, y2, a, b;
282	int p_x, p_y;	311	int p_x, p_y;
283	int brightness;	312	int brightness;
284
285	start_tick = last_yield = *rb->current_tick;
286		313
		314	MULS32_INIT();
		315	start_tick = last_yield = *rb->current_tick;
		316
287	for (p_x = 0, a = x_min; p_x < px_max; p_x++, a += x_step) {	317	for (p_x = 0, a = x_min; p_x < px_max; p_x++, a += x_step) {
288	if (p_x < px_min)	318	if (p_x < px_min)
289	continue;	319	continue;
290	for (p_y = LCD_HEIGHT-1, b = y_min; p_y >= py_min; p_y--, b += y_step) {	320	for (p_y = LCD_HEIGHT-1, b = y_min; p_y >= py_min; p_y--, b += y_step) {
291	if (p_y >= py_max)	321	if (p_y >= py_max)
292	continue;	322	continue;
293	x = 0;	323	x = a;
294	y = 0;	324	y = b;
295	n_iter = 0;	325	n_iter = 0;
296		326
297	while (++n_iter<=max_iter) {	327	while (++n_iter <= max_iter) {
298	x >>= 29;	328	x2 = MULS32_ASR26(x, x);
299	y >>= 29;	329	y2 = MULS32_ASR26(y, y);
300	x2 = MUL64(x, x);	330
301	y2 = MUL64(y, y);	331	if (x2 + y2 > (4L<<26)) break;
302
303	if (x2 + y2 > (4LL<<58)) break;
304		332
305	y = 2 * MUL64(x, y) + b;	333	y = 2 * MULS32_ASR26(x, y) + b;
306	x = x2 - y2 + a;	334	x = x2 - y2 + a;
307	}	335	}
308		336
@@ -312,10 +340,10 @@ void calc_mandelbrot_64(void)
312	} else {	340	} else {
313	brightness = 255 - (32 * (n_iter & 7));	341	brightness = 255 - (32 * (n_iter & 7));
314	}	342	}
315	graybuffer[p_y]=brightness;	343	graybuffer[p_y] = brightness;
316	/* be nice to other threads:	344	/* be nice to other threads:
317	* if at least one tick has passed, yield */	345	* if at least one tick has passed, yield */
318	if (*rb->current_tick > last_yield){	346	if (*rb->current_tick > last_yield) {
319	rb->yield();	347	rb->yield();
320	last_yield = *rb->current_tick;	348	last_yield = *rb->current_tick;
321	}	349	}
@@ -375,10 +403,10 @@ enum plugin_status plugin_start(struct plugin_api* api, void* parameter)
375	if (redraw == REDRAW_FULL)	403	if (redraw == REDRAW_FULL)
376	gray_ub_clear_display();	404	gray_ub_clear_display();
377		405
378	if (step_log2 <= -13) /* select precision */	406	if (step_log2 <= -10) /* select precision */
379	calc_mandelbrot_64();	407	calc_mandelbrot_high_prec();
380	else	408	else
381	calc_mandelbrot_32();	409	calc_mandelbrot_low_prec();
382		410
383	#if !defined(SIMULATOR) && defined(HAVE_ADJUSTABLE_CPU_FREQ)	411	#if !defined(SIMULATOR) && defined(HAVE_ADJUSTABLE_CPU_FREQ)
384	rb->cpu_boost(false);	412	rb->cpu_boost(false);