From f924e2bbf798a791dba3f60f03aa69658d0d3850 Mon Sep 17 00:00:00 2001
From: Tomas Frydrych <tf@openedhand.com>
Date: Fri, 19 Jan 2007 16:04:06 +0000
Subject: [PATCH] more fixed point work

---
 ChangeLog                        |  16 +++
 clutter/clutter-alpha.c          |   4 +-
 clutter/clutter-behaviour-path.c |   6 +-
 clutter/clutter-fixed.c          | 163 ++++++++++++++++++++-----------
 clutter/clutter-fixed.h          |  15 ++-
 5 files changed, 136 insertions(+), 68 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index f4b5fd115..a6fcc3f7b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,19 @@
+2007-01-19  Tomas Frydrych  <tf@openedhand.com>
+
+	* clutter/clutter-fixed.h:
+	* clutter/clutter-fixed.c:
+	Added fast double to int and double to fixed point conversion
+	routines; changed CLUTTER_FLOAT_TO_FIXED to use it.
+	Replaced clutter_sqrti with fixed point implementation of the QIII
+	algorithm.
+	
+
+	* clutter/clutter-behaviour-path.c: use clutter_sqrti always
+
+	* clutter/clutter-alpha.c: 
+	(sinc_func): replaced double -> int cast with CLUTTER_FLOAT_TO_INT
+	
+	
 2007-01-18  Emmanuele Bassi  <ebassi@openedhand.com>
 
 	* configure.ac: Post release bump to 0.3.0.
diff --git a/clutter/clutter-alpha.c b/clutter/clutter-alpha.c
index 70978309d..c4c272131 100644
--- a/clutter/clutter-alpha.c
+++ b/clutter/clutter-alpha.c
@@ -519,7 +519,6 @@ sincx1024_func (ClutterAlpha *alpha,
 
   return CLUTTER_FIXED_INT (sine * CLUTTER_ALPHA_MAX_ALPHA);
 }
-
 #if 0
 /*
  * The following two functions are left in place for reference
@@ -570,9 +569,10 @@ sinc_func (ClutterAlpha *alpha,
 
   CLUTTER_NOTE (ALPHA, "sine: %2f\n", sine);
 
-  return (guint32) (sine * (gdouble) CLUTTER_ALPHA_MAX_ALPHA);
+  return CLUTTER_FLOAT_TO_INT ((sine * (gdouble) CLUTTER_ALPHA_MAX_ALPHA));
 }
 #endif
+
 /**
  * clutter_sine_func:
  * @alpha: a #ClutterAlpha
diff --git a/clutter/clutter-behaviour-path.c b/clutter/clutter-behaviour-path.c
index ba7d9ddf4..ecee1c8c1 100644
--- a/clutter/clutter-behaviour-path.c
+++ b/clutter/clutter-behaviour-path.c
@@ -196,12 +196,12 @@ node_distance (const ClutterKnot *begin,
   if (clutter_knot_equal (begin, end))
         return 0;
 
-#ifdef CFX_NO_FPU
+#if 1
   return clutter_sqrti ((end->x - begin->x) * (end->x - begin->x) +
 			(end->y - begin->y) * (end->y - begin->y));
 #else
-  return (gint) sqrt ((end->x - begin->x) * (end->x - begin->x) +
-		      (end->y - begin->y) * (end->y - begin->y));
+  return CLUTTER_FLOAT_TO_INT(sqrt((end->x - begin->x) * (end->x - begin->x) +
+				   (end->y - begin->y) * (end->y - begin->y)));
 #endif
 }
 
diff --git a/clutter/clutter-fixed.c b/clutter/clutter-fixed.c
index b174795ac..92d37d7ae 100644
--- a/clutter/clutter-fixed.c
+++ b/clutter/clutter-fixed.c
@@ -353,14 +353,6 @@ clutter_sqrtx (ClutterFixed x)
      * on ARM this function is about 5 times faster than c-lib sqrt, whilst
      * producing errors < 1%.
      *
-     * (There are faster algorithm's available; the Carmack 'magic'
-     * algorithm, http://www.codemaestro.com/reviews/review00000105.html,
-     * is about five times faster than this one when implemented
-     * as fixed point, but it's error is much greater and grows with the
-     * size of the argument (reaches about 10% around x == 800).
-     *
-     * Note: on systems with FPU, the clib sqrt can be noticeably faster
-     *       than this function.
      */
     int t = 0;
     int sh = 0;
@@ -448,68 +440,121 @@ clutter_sqrtx (ClutterFixed x)
  * clutter_sqrti:
  * @x: integer value
  *
- * A fixed point implementation of square root for integers
+ * Very fast fixed point implementation of square root for integers.
+ * 
+ * This function is about 10x faster than clib sqrt() on x86, and (this is
+ * not a typo!) more than 800x faster on ARM without FPU. Its error is < 5%
+ * for arguments < 132 and < 10% for arguments < 5591.
  *
- * Return value: integer square root (truncated).
+ * Return value: integer square root.
  *
  *
  * Since: 0.2
  */
 gint
-clutter_sqrti (gint x)
+clutter_sqrti (gint number)
 {
-    int t = 0;
-    int sh = 0;
-    unsigned int mask = 0x40000000;
+    /* This is a fixed point implementation of the Quake III sqrt algorithm,
+     * described, for example, at
+     *   http://www.codemaestro.com/reviews/review00000105.html
+     *
+     * While the original QIII is extremely fast, the use of floating division
+     * and multiplication makes it perform very poorly on ARM processors without FPU.
+     *
+     * The key to successfully replacing the floating point operations with
+     * fixed point is in the choice of the fixed point format. The QIII
+     * algorithm does not calculate the square root, but its reciprocal ('y'
+     * below), which is only at the end turned to the inverse value. In order
+     * for the algorithm to produce satisfactory results, the reciprocal value
+     * must be represented with sufficient precision; the 16.16 we use
+     * elsewhere in clutter is not good enough, and 10.22 is used instead.
+     */
+    ClutterFixed x;
+    unsigned long y, y1;        /* 10.22 fixed point */
+    unsigned long f = 0x600000; /* '1.5' as 10.22 fixed */
+    float flt = number;
+    float flt2;
     
-    if (x <= 0)
-	return 0;
+    x = CLUTTER_INT_TO_FIXED (number) / 2;
 
-    if (x > (sizeof (sqrt_tbl)/sizeof(ClutterFixed) - 1))
-    {
-	/*
-	 * Find the highest bit set
-	 */
-#if __arm__
-	/* This actually requires at least arm v5, but gcc does not seem
-	 * to set the architecture defines correctly, and it is probably
-	 * very unlikely that anyone will want to use clutter on anything
-	 * less than v5.
-	 */
-	int bit;
-	__asm__ ("clz  %0, %1\n"
-		 "rsb  %0, %0, #31\n"
-		 :"=r"(bit)
-		 :"r" (x));
+    /* The QIII initial estimate */
+    y   = * ( unsigned long * ) &flt;
+    y   = 0x5f3759df - ( y >> 1 );
+    flt = * ( float * ) &y;
 
-	/* make even (2n) */
-	bit &= 0xfffffffe;
-#else
-	/* TODO -- add i386 branch using bshr */
-	int bit = 30;
-	while (bit >= 0)
-	{
-	    if (x & mask)
-		break;
+    /* Now, we convert the float to 10.22 fixed. We exploit the mechanism
+     * described at http://www.d6.com/users/checker/pdfs/gdmfp.pdf.
+     * 
+     * We want 22 bit fraction; a single precision float uses 23 bit
+     * mantissa, so we only need to add 2^(23-22) (no need for the 1.5
+     * multiplier as we are only dealing with positive numbers).
+     * 
+     * Note: we have to use two separate variables here -- for some reason,
+     * if we try to use just the flt variable, gcc on ARM optimises the whole
+     * addition out, and it all goes pear shape, since without it, the bits
+     * in the float will not be correctly aligned.
+     */
+    flt2 = flt + 2.0;
+    y   = * ( long * ) &flt2;
+    y &= 0x7FFFFF;
 
-	    mask = (mask >> 1 | mask >> 2);
-	    bit -= 2;
-	}
-#endif
-	sh = ((bit - 6) >> 1);
-	t = (x >> (bit - 6));
-    }
-    else
-    {
-	return (sqrt_tbl[x] >> CFX_Q);
-    }
+    /* Now we correct the estimate, only a single iteration is needed */
+    y1 = (y >> 11) * (y >> 11);
+    y1 = (y1 >> 8) * (x >> 8);
 
-    x = sqrt_tbl[t];
+    y1 = f - y1;
+    y  = (y >> 11) * (y1 >> 11);
 
-    if (sh > 0)
-	x = x << sh;
-    else if (sh < 0)
-	x = (x >> (1 + ~sh));
-    
-    return (x >> CFX_Q);
+    /* Invert, round and convert from 10.22 to an integer
+     * 0x1e3c68 is a magical rounding constant that produces slightly
+     * better results than 0x200000.
+     */
+    return (number * y + 0x1e3c68) >> 22;
 }
+
+/* <private> */
+const double _magic = 68719476736.0*1.5;
+
+/* Where in the 64 bits of double is the mantissa */
+#ifdef LITTLE_ENDIAN
+#define _CFX_MAN			0
+#else
+#define _CFX_MAN			1
+#endif
+
+/* 
+ * _clutter_double_to_fixed:
+ * @val: value to be converted
+ *
+ * A fast conversion from double precision floating to fixed point
+ *
+ * Return value: Fixed point representation of the value
+ *
+ * Since: 0.2
+ */
+ClutterFixed
+_clutter_double_to_fixed (double val)
+{
+    val = val + _magic;
+    return ((gint32*)&val)[_CFX_MAN]; 
+}
+
+/*
+ * _clutter_double_to_int:
+ * @val: value to be converted
+ *
+ * A fast conversion from double precision floating point to int;
+ * use this instead of casting double/float to int.
+ *
+ * Return value: Integer part of the double
+ *
+ * Since: 0.2
+ */
+ClutterFixed
+_clutter_double_to_int (double val)
+{
+    val = val + _magic;
+    return ((gint32*)&val)[_CFX_MAN] >> 16; 
+}
+
+#undef _CFX_MAN
diff --git a/clutter/clutter-fixed.h b/clutter/clutter-fixed.h
index 206ac62d3..2eca15689 100644
--- a/clutter/clutter-fixed.h
+++ b/clutter/clutter-fixed.h
@@ -118,10 +118,9 @@ typedef gint32 ClutterAngle;    /* angle such that 1024 == 2*PI */
 #define CLUTTER_FIXED_TO_FLOAT(x)       ((float) ((int)(x) / 65536.0))
 #define CLUTTER_FIXED_TO_DOUBLE(x)      ((double) ((int)(x) / 65536.0))
 
-#define CLUTTER_FLOAT_TO_FIXED(x)                                           \
-        ( (ABS(x) > 32767.0) ? (((x) / (x)) * 0x7fffffff)                   \
-                             : ((long)((x) * 65536.0  + ((x) < 0 ? -0.5     \
-                                                                 : 0.5))) )
+#define CLUTTER_FLOAT_TO_FIXED(x) _clutter_double_to_fixed((x))
+#define CLUTTER_FLOAT_TO_INT(x)   _clutter_double_to_int((x))
+
 #define CLUTTER_INT_TO_FIXED(x) ((x) << CFX_Q)
 
 #define CLUTTER_FIXED_INT(x)            ((x) >> CFX_Q)
@@ -180,6 +179,14 @@ ClutterFixed clutter_sini (ClutterAngle angle);
 ClutterFixed clutter_sqrtx (ClutterFixed x);
 gint         clutter_sqrti (gint         x);
 
+
+/* <private> */
+extern inline
+ClutterFixed _clutter_double_to_fixed (double value);
+
+extern inline
+ClutterFixed _clutter_double_to_int (double value);
+
 G_END_DECLS
 
 #endif