From 69ffcd5356cde94ec83417455081b701b7a6b633 Mon Sep 17 00:00:00 2001 From: Jens Arnold Date: Sun, 16 Oct 2005 22:24:00 +0000 Subject: Mandelbrot plugin: New fixed point routines, loosing a tiny bit of precision (3 bits), but way faster than before. Full precision routine uses the EMAC on coldfire. No more 64 bit arithmetics used except within the asm inlines. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@7636 a1c6a512-1295-4272-9138-f99709370657 --- apps/plugins/mandelbrot.c | 336 +++++++++++++++++++++++++--------------------- 1 file changed, 182 insertions(+), 154 deletions(-) (limited to 'apps/plugins/mandelbrot.c') diff --git a/apps/plugins/mandelbrot.c b/apps/plugins/mandelbrot.c index 8a3c14a194..1f80ff395f 100644 --- a/apps/plugins/mandelbrot.c +++ b/apps/plugins/mandelbrot.c @@ -55,15 +55,15 @@ static struct plugin_api* rb; static char buff[32]; -/* Fixed point format: 6 bits integer part incl. sign, 58 bits fractional part */ -static long long x_min; -static long long x_max; -static long long x_step; -static long long x_delta; -static long long y_min; -static long long y_max; -static long long y_step; -static long long y_delta; +/* Fixed point format: 6 bits integer part incl. sign, 26 bits fractional part */ +static long x_min; +static long x_max; +static long x_step; +static long x_delta; +static long y_min; +static long y_max; +static long y_step; +static long y_delta; static int px_min = 0; static int px_max = LCD_WIDTH; @@ -78,117 +78,148 @@ static unsigned int gbuf_size = 0; static unsigned char graybuffer[LCD_HEIGHT]; #if CONFIG_CPU == SH7034 -long long mul64(long long f1, long long f2); - -asm ( - /* 64bit * 64bit -> 64bit multiplication. Works for both signed and unsigned */ - ".global _mul64 \n" /* Notation: abcd * efgh, where each letter */ - ".type _mul64,@function\n" /* represents 16 bits. 
Called with: */ -"_mul64: \n" /* r4 = ab, r5 = cd, r6 = ef, r7 = gh */ - "swap.w r4,r2 \n" /* r2 = ba */ - "mulu r2,r7 \n" /* a * h */ - "swap.w r6,r3 \n" /* r3 = fe */ - "sts macl,r0 \n" /* r0 = a * h */ - "mulu r5,r3 \n" /* d * e */ - "swap.w r7,r3 \n" /* r3 = hg */ - "sts macl,r1 \n" /* r1 = d * e */ - "mulu r4,r3 \n" /* b * g */ - "add r1,r0 \n" /* r0 += r1 */ - "swap.w r5,r2 \n" /* r2 = dc */ - "sts macl,r1 \n" /* r1 = b * g */ - "mulu r2,r6 \n" /* c * f */ - "add r1,r0 \n" /* r0 += r1 */ - "sts macl,r1 \n" /* r1 = c * f */ - "mulu r4,r7 \n" /* b * h */ - "add r1,r0 \n" /* r0 += r1 */ - "shll16 r0 \n" /* r0 <<= 16 */ - "sts macl,r1 \n" /* r1 = b * h */ - "mulu r2,r3 \n" /* c * g */ - "add r1,r0 \n" /* r0 += r1 */ - "sts macl,r1 \n" /* r1 = c * g */ - "mulu r5,r6 \n" /* d * f */ - "add r1,r0 \n" /* r0 += r1 */ - "sts macl,r1 \n" /* r1 = d * f */ - "mulu r2,r7 \n" /* c * h */ - "add r1,r0 \n" /* r0 += r1 */ - "sts macl,r2 \n" /* r2 = c * h */ - "mulu r5,r3 \n" /* d * g */ - "clrt \n" - "sts macl,r3 \n" /* r3 = d * g */ - "addc r2,r3 \n" /* r3 += r2, carry->r2 */ - "movt r2 \n" - "mulu r5,r7 \n" /* d * h */ - "mov r3,r1 \n" /* r2r3 <<= 16 */ - "xtrct r2,r1 \n" - "mov r1,r2 \n" - "shll16 r3 \n" - "sts macl,r1 \n" /* r1 = d * h */ - "clrt \n" /* r0r1 += r2r3 */ - "addc r3,r1 \n" - "rts \n" - "addc r2,r0 \n" -); -#define MUL64(a, b) mul64(a, b) + +#define MULS16_ASR10(a, b) muls16_asr10(a, b) +static inline short muls16_asr10(short a, short b) +{ + short r; + asm ( + "muls %[a],%[b] \n" + "sts macl,%[r] \n" + "shlr8 %[r] \n" + "shlr2 %[r] \n" + : /* outputs */ + [r]"=r"(r) + : /* inputs */ + [a]"r"(a), + [b]"r"(b) + ); + return r; +} + +#define MULS32_ASR26(a, b) muls32_asr26(a, b) +static inline long muls32_asr26(long a, long b) +{ + long r, t1, t2, t3; + asm ( + /* Signed 32bit * 32bit -> 64bit multiplication. + Notation: xxab * xxcd, where each letter represents 16 bits. + xx is the 64 bit sign extension. */ + "swap.w %[a],%[t1] \n" /* t1 = ba */ + "mulu %[t1],%[b] \n" /* a * d */ + "swap.w %[b],%[t3] \n" /* t3 = dc */ + "sts macl,%[t2] \n" /* t2 = a * d */ + "mulu %[t1],%[t3] \n" /* a * c */ + "sts macl,%[r] \n" /* hi = a * c */ + "mulu %[a],%[t3] \n" /* b * c */ + "clrt \n" + "sts macl,%[t3] \n" /* t3 = b * c */ + "addc %[t2],%[t3] \n" /* t3 += t2, carry -> t2 */ + "movt %[t2] \n" + "mulu %[a],%[b] \n" /* b * d */ + "mov %[t3],%[t1] \n" /* t2t3 <<= 16 */ + "xtrct %[t2],%[t1] \n" + "mov %[t1],%[t2] \n" + "shll16 %[t3] \n" + "sts macl,%[t1] \n" /* lo = b * d */ + "clrt \n" /* hi.lo += t2t3 */ + "addc %[t3],%[t1] \n" + "addc %[t2],%[r] \n" + "cmp/pz %[a] \n" /* ab >= 0 ? */ + "bt 1f \n" + "sub %[b],%[r] \n" /* no: hi -= cd (sign extension of ab is -1) */ + "1: \n" + "cmp/pz %[b] \n" /* cd >= 0 ? */ + "bt 2f \n" + "sub %[a],%[r] \n" /* no: hi -= ab (sign extension of cd is -1) */ + "2: \n" + /* Shift right by 26 and return low 32 bits */ + "shll2 %[r] \n" /* hi <<= 6 */ + "shll2 %[r] \n" + "shll2 %[r] \n" + "shlr16 %[t1] \n" /* (unsigned)lo >>= 26 */ + "shlr8 %[t1] \n" + "shlr2 %[t1] \n" + "or %[t1],%[r] \n" /* combine result */ + : /* outputs */ + [r] "=&r"(r), + [t1]"=&r"(t1), + [t2]"=&r"(t2), + [t3]"=&r"(t3) + : /* inputs */ + [a] "r" (a), + [b] "r" (b) + ); + return r; +} #elif defined CPU_COLDFIRE -long long mul64(long long f1, long long f2); - -asm ( - /* 64bit * 64bit -> 64bit multiplication. 
Works for both signed and unsigned */ - ".section .text,\"ax\",@progbits\n" - ".global mul64 \n" /* Notation: abcd * efgh, where each letter */ - ".type mul64,@function\n" /* represents 16 bits. */ -"mul64: \n" - "lea.l (-16,%sp),%sp \n" - "movem.l %d2-%d5,(%sp) \n" - - "movem.l (20,%sp),%d0-%d3\n" /* %d0%d1 = abcd, %d2%d3 = efgh */ - "mulu.l %d3,%d0 \n" /* %d0 = ab * gh */ - "mulu.l %d1,%d2 \n" /* %d2 = cd * ef */ - "add.l %d2,%d0 \n" /* %d0 += %d2 */ - "move.l %d1,%d4 \n" - "swap %d4 \n" /* %d4 = dc */ - "move.l %d3,%d5 \n" - "swap %d5 \n" /* %d5 = hg */ - "move.l %d4,%d2 \n" - "mulu.w %d5,%d2 \n" /* %d2 = c * g */ - "add.l %d2,%d0 \n" /* %d0 += %d2 */ - "mulu.w %d3,%d4 \n" /* %d4 = c * h */ - "mulu.w %d1,%d5 \n" /* %d5 = d * h */ - "add.l %d4,%d5 \n" /* %d5 += %d4 */ - "subx.l %d4,%d4 \n" - "neg.l %d4 \n" /* carry -> %d4 */ - "swap %d4 \n" - "swap %d5 \n" - "move.w %d5,%d4 \n" - "clr.w %d5 \n" /* %d4%d5 <<= 16 */ - "mulu.w %d3,%d1 \n" /* %d1 = d * h */ - "add.l %d5,%d1 \n" - "addx.l %d4,%d0 \n" /* %d0%d1 += %d4%d5 */ - - "movem.l (%sp),%d2-%d5 \n" - "lea.l (16,%sp),%sp\n" - "rts \n" -); -#define MUL64(a, b) mul64(a, b) - -#else -#define MUL64(a, b) ((a)*(b)) + +#define MULS16_ASR10(a, b) muls16_asr10(a, b) +static inline short muls16_asr10(short a, short b) +{ + asm ( + "muls.w %[a],%[b] \n" + "asr.l #8,%[b] \n" + "asr.l #2,%[b] \n" + : /* outputs */ + [b]"+d"(b) + : /* inputs */ + [a]"d" (a) + ); + return b; +} + +/* Needs the EMAC initialised to fractional mode w/o rounding and saturation */ +#define MULS32_INIT() coldfire_set_macsr(EMAC_FRACTIONAL) +#define MULS32_ASR26(a, b) muls32_asr26(a, b) +static inline long muls32_asr26(long a, long b) +{ + long r, t1; + asm ( + "mac.l %[a],%[b],%%acc0\n" /* multiply */ + "mulu.l %[a],%[b] \n" /* get lower half */ + "movclr.l %%acc0,%[r] \n" /* get higher half */ + "asl.l #5,%[r] \n" /* hi <<= 5, plus one free */ + "moveq.l #26,%[t1] \n" + "lsr.l %[t1],%[b] \n" /* (unsigned)lo >>= 26 */ + "or.l %[b],%[r] \n" /* combine result */ + : /* outputs */ + [r]"=&d"(r), + [t1]"=&d"(t1), + [b] "+d" (b) + : /* inputs */ + [a] "d" (a) + ); + return r; +} + +#endif /* CPU */ + +/* default macros */ +#ifndef MULS16_ASR10 +#define MULS16_ASR10(a, b) ((short)(((long)(a) * (long)(b)) >> 10)) +#endif +#ifndef MULS32_ASR26 +#define MULS32_ASR26(a, b) ((long)(((long long)(a) * (long long)(b)) >> 26)) +#endif +#ifndef MULS32_INIT +#define MULS32_INIT() #endif -int ilog2_fp(long long value) /* calculate integer log2(value_fp_6.58) */ +int ilog2_fp(long value) /* calculate integer log2(value_fp_6.26) */ { int i = 0; if (value <= 0) { return -32767; - } else if (value > (1LL<<58)) { - while (value >= (2LL<<58)) { + } else if (value > (1L<<26)) { + while (value >= (2L<<26)) { value >>= 1; i++; } } else { - while (value < (1LL<<58)) { + while (value < (1L<<26)) { value <<= 1; i--; } @@ -199,59 +230,57 @@ int ilog2_fp(long long value) /* calculate integer log2(value_fp_6.58) */ void recalc_parameters(void) { x_step = (x_max - x_min) / LCD_WIDTH; - x_delta = MUL64(x_step, (LCD_WIDTH/8)); + x_delta = x_step * (LCD_WIDTH/8); y_step = (y_max - y_min) / LCD_HEIGHT; - y_delta = MUL64(y_step, (LCD_HEIGHT/8)); - step_log2 = MIN(ilog2_fp(x_step), ilog2_fp(y_step)); - max_iter = MAX(10, -15 * step_log2 - 50); + y_delta = y_step * (LCD_HEIGHT/8); + step_log2 = ilog2_fp(MIN(x_step, y_step)); + max_iter = MAX(15, -15 * step_log2 - 45); } void init_mandelbrot_set(void) { #if CONFIG_LCD == LCD_SSD1815 /* Recorder, Ondio. 
*/ - x_min = -38LL<<54; // -2.375<<58 - x_max = 15LL<<54; // 0.9375<<58 + x_min = -38L<<22; // -2.375<<26 + x_max = 15L<<22; // 0.9375<<26 #else /* Iriver H1x0 */ - x_min = -36LL<<54; // -2.25<<58 - x_max = 12LL<<54; // 0.75<<58 + x_min = -36L<<22; // -2.25<<26 + x_max = 12L<<22; // 0.75<<26 #endif - y_min = -19LL<<54; // -1.1875<<58 - y_max = 19LL<<54; // 1.1875<<58 + y_min = -19L<<22; // -1.1875<<26 + y_max = 19L<<22; // 1.1875<<26 recalc_parameters(); } -void calc_mandelbrot_32(void) +void calc_mandelbrot_low_prec(void) { long start_tick, last_yield; unsigned n_iter; - long long a64, b64; - long x, x2, y, y2, a, b; + long a32, b32; + short x, x2, y, y2, a, b; int p_x, p_y; int brightness; start_tick = last_yield = *rb->current_tick; - for (p_x = 0, a64 = x_min; p_x <= px_max; p_x++, a64 += x_step) { + for (p_x = 0, a32 = x_min; p_x < px_max; p_x++, a32 += x_step) { if (p_x < px_min) continue; - a = a64 >> 32; - for (p_y = LCD_HEIGHT-1, b64 = y_min; p_y >= py_min; p_y--, b64 += y_step) { + a = a32 >> 16; + for (p_y = LCD_HEIGHT-1, b32 = y_min; p_y >= py_min; p_y--, b32 += y_step) { if (p_y >= py_max) continue; - b = b64 >> 32; - x = 0; - y = 0; + b = b32 >> 16; + x = a; + y = b; n_iter = 0; while (++n_iter <= max_iter) { - x >>= 13; - y >>= 13; - x2 = x * x; - y2 = y * y; - - if (x2 + y2 > (4<<26)) break; - - y = 2 * x * y + b; + x2 = MULS16_ASR10(x, x); + y2 = MULS16_ASR10(y, y); + + if (x2 + y2 > (4<<10)) break; + + y = 2 * MULS16_ASR10(x, y) + b; x = x2 - y2 + a; } @@ -261,7 +290,7 @@ void calc_mandelbrot_32(void) } else { brightness = 255 - (32 * (n_iter & 7)); } - graybuffer[p_y]=brightness; + graybuffer[p_y] = brightness; /* be nice to other threads: * if at least one tick has passed, yield */ if (*rb->current_tick > last_yield) { @@ -274,35 +303,34 @@ void calc_mandelbrot_32(void) } } -void calc_mandelbrot_64(void) +void calc_mandelbrot_high_prec(void) { long start_tick, last_yield; unsigned n_iter; - long long x, x2, y, y2, a, b; + long x, x2, y, y2, a, b; int p_x, p_y; int brightness; - - start_tick = last_yield = *rb->current_tick; + MULS32_INIT(); + start_tick = last_yield = *rb->current_tick; + for (p_x = 0, a = x_min; p_x < px_max; p_x++, a += x_step) { if (p_x < px_min) continue; for (p_y = LCD_HEIGHT-1, b = y_min; p_y >= py_min; p_y--, b += y_step) { if (p_y >= py_max) continue; - x = 0; - y = 0; + x = a; + y = b; n_iter = 0; - while (++n_iter<=max_iter) { - x >>= 29; - y >>= 29; - x2 = MUL64(x, x); - y2 = MUL64(y, y); - - if (x2 + y2 > (4LL<<58)) break; + while (++n_iter <= max_iter) { + x2 = MULS32_ASR26(x, x); + y2 = MULS32_ASR26(y, y); + + if (x2 + y2 > (4L<<26)) break; - y = 2 * MUL64(x, y) + b; + y = 2 * MULS32_ASR26(x, y) + b; x = x2 - y2 + a; } @@ -312,10 +340,10 @@ void calc_mandelbrot_64(void) } else { brightness = 255 - (32 * (n_iter & 7)); } - graybuffer[p_y]=brightness; + graybuffer[p_y] = brightness; /* be nice to other threads: * if at least one tick has passed, yield */ - if (*rb->current_tick > last_yield){ + if (*rb->current_tick > last_yield) { rb->yield(); last_yield = *rb->current_tick; } @@ -375,10 +403,10 @@ enum plugin_status plugin_start(struct plugin_api* api, void* parameter) if (redraw == REDRAW_FULL) gray_ub_clear_display(); - if (step_log2 <= -13) /* select precision */ - calc_mandelbrot_64(); + if (step_log2 <= -10) /* select precision */ + calc_mandelbrot_high_prec(); else - calc_mandelbrot_32(); + calc_mandelbrot_low_prec(); #if !defined(SIMULATOR) && defined(HAVE_ADJUSTABLE_CPU_FREQ) rb->cpu_boost(false); -- cgit v1.2.3
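
The essence of the change: the old code did every Mandelbrot iteration in 6.58 fixed point with a hand-written 64x64->64 bit multiply (MUL64), shifting both operands down to 29 fractional bits before each multiply. The new code keeps values in 32-bit (6.26) or 16-bit (6.10) fixed point and folds the post-multiply shift into the multiply routine itself (MULS32_ASR26 / MULS16_ASR10), with asm versions for SH7034 and Coldfire (EMAC) plus portable C fallback macros, and picks the 16-bit path for shallow zoom levels (step_log2 > -10). The sketch below is a host-side illustration of those portable fallbacks and of the iteration loop from calc_mandelbrot_high_prec(); the function names and the main() test harness are illustrative only and are not part of the patch.

/* Host-side sketch of the fixed point arithmetic introduced by this patch.
 * It mirrors the portable fallback macros (MULS16_ASR10, MULS32_ASR26) and
 * the escape-time loop of calc_mandelbrot_high_prec(), without the SH7034
 * and Coldfire asm/EMAC paths or any LCD code.  Function names and the
 * main() harness are illustrative, not part of the plugin.
 */
#include <stdio.h>

#define FRACBITS_HIGH 26   /* 6.26: 6 bits integer part incl. sign */
#define FRACBITS_LOW  10   /* 6.10: used by the low precision path */

/* Portable fallbacks, as in the patch: widen, multiply, shift back down. */
static inline short muls16_asr10(short a, short b)
{
    return (short)(((long)a * (long)b) >> FRACBITS_LOW);
}

static inline long muls32_asr26(long a, long b)
{
    return (long)(((long long)a * (long long)b) >> FRACBITS_HIGH);
}

/* High precision escape-time count for one point c = (a, b) in 6.26.
 * Returns max_iter + 1 if the point never escapes (treated as inside). */
static unsigned mandel_iterations(long a, long b, unsigned max_iter)
{
    long x = a, y = b;   /* the new code starts the orbit at c rather than 0 */
    unsigned n_iter = 0;

    while (++n_iter <= max_iter) {
        long x2 = muls32_asr26(x, x);
        long y2 = muls32_asr26(y, y);

        if (x2 + y2 > (4L << FRACBITS_HIGH))   /* |z|^2 > 4: escaped */
            break;

        y = 2 * muls32_asr26(x, y) + b;
        x = x2 - y2 + a;
    }
    return n_iter;
}

int main(void)
{
    /* (-0.5, 0) lies inside the set; (1, 1) escapes after a few steps. */
    long a_in  = -(1L << (FRACBITS_HIGH - 1));   /* -0.5 in 6.26 */
    long a_out = 1L << FRACBITS_HIGH;            /*  1.0 in 6.26 */

    printf("inside : %u iterations\n", mandel_iterations(a_in, 0, 100));
    printf("outside: %u iterations\n", mandel_iterations(a_out, a_out, 100));

    /* Low precision example: 1.5 * 1.5 = 2.25 in 6.10 fixed point. */
    short one_and_half = 3 << (FRACBITS_LOW - 1);
    printf("1.5^2 in 6.10: %d (2.25 == %d)\n",
           muls16_asr10(one_and_half, one_and_half), 9 << (FRACBITS_LOW - 2));
    return 0;
}

Dropping from 29 to 26 fractional bits per multiply is where the commit message's "tiny bit of precision (3 bits)" loss comes from, but it lets each multiply fit a single 32x32->64 operation -- a single mac.l on Coldfire's EMAC -- instead of a full 64x64 software multiply, and the 16-bit path needs only one native multiply instruction per product.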