cogl: Use SSE2 when possible for premultiplying
This adds a fast path for premultiplying an RGBA image using SSE2 instructions. SSE registers are 128 bits wide and we need at least 16 bits per component for the intermediate result of the multiplication, so we can process two pixels in parallel with one register. The function interleaves two SSE registers to multiply four pixels in one function call, in the hope that this will pipeline better.

http://bugzilla.openedhand.com/show_bug.cgi?id=1939

Signed-off-by: Emmanuele Bassi <ebassi@linux.intel.com>
This commit is contained in:
parent
2ecb6f7b20
commit
f16fd0ddbb
1 changed files with 103 additions and 1 deletions
|
@ -215,6 +215,91 @@ _cogl_premult_alpha_first (guchar *dst)
|
|||
|
||||
#undef MULT
|
||||
|
||||
/* Enable the SSE2 fast path, which premultiplies four pixels per
   call, whenever the compiler is GCC-compatible, SSE2 code
   generation is turned on and the target is x86 or x86-64. A single
   copy of the assembler serves both architectures because it never
   names a non-SSE register explicitly. */
#if defined (__GNUC__) && defined (__SSE2__)
#if defined (__i386) || defined (__x86_64)
#define COGL_USE_PREMULT_SSE2
#endif /* x86 or x86-64 */
#endif /* __GNUC__ && __SSE2__ */
|
||||
|
||||
#ifdef COGL_USE_PREMULT_SSE2
|
||||
|
||||
inline static void
|
||||
_cogl_premult_alpha_last_four_pixels_sse2 (const guint8 *p)
|
||||
{
|
||||
/* 8 copies of 128 used below */
|
||||
static const gint16 eight_halves[8] __attribute__ ((aligned (16))) =
|
||||
{ 128, 128, 128, 128, 128, 128, 128, 128 };
|
||||
/* Mask of the rgb components of the four pixels */
|
||||
static const gint8 just_rgb[16] __attribute__ ((aligned (16))) =
|
||||
{ 0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00,
|
||||
0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00 };
|
||||
/* Each SSE register only holds two pixels because we need to work
|
||||
with 16-bit intermediate values. We still do four pixels by
|
||||
interleaving two registers in the hope that it will pipeline
|
||||
better */
|
||||
asm (/* Load eight_halves into xmm5 for later */
|
||||
"movdqa (%1), %%xmm5\n"
|
||||
/* Clear xmm3 */
|
||||
"pxor %%xmm3, %%xmm3\n"
|
||||
/* Load two pixels from p into the low half of xmm0 */
|
||||
"movlps (%0), %%xmm0\n"
|
||||
/* Load the next set of two pixels from p into the low half of xmm1 */
|
||||
"movlps 8(%0), %%xmm1\n"
|
||||
/* Unpack 8 bytes from the low quad-words in each register to 8
|
||||
16-bit values */
|
||||
"punpcklbw %%xmm3, %%xmm0\n"
|
||||
"punpcklbw %%xmm3, %%xmm1\n"
|
||||
/* Copy alpha values of the first pixel in xmm0 to all
|
||||
components of the first pixel in xmm2 */
|
||||
"pshuflw $255, %%xmm0, %%xmm2\n"
|
||||
/* same for xmm1 and xmm3 */
|
||||
"pshuflw $255, %%xmm1, %%xmm3\n"
|
||||
/* The above also copies the second pixel directly so we now
|
||||
want to replace the RGB components with copies of the alpha
|
||||
components */
|
||||
"pshufhw $255, %%xmm2, %%xmm2\n"
|
||||
"pshufhw $255, %%xmm3, %%xmm3\n"
|
||||
/* Multiply the rgb components by the alpha */
|
||||
"pmullw %%xmm2, %%xmm0\n"
|
||||
"pmullw %%xmm3, %%xmm1\n"
|
||||
/* Add 128 to each component */
|
||||
"paddw %%xmm5, %%xmm0\n"
|
||||
"paddw %%xmm5, %%xmm1\n"
|
||||
/* Copy the results to temporary registers xmm4 and xmm5 */
|
||||
"movdqa %%xmm0, %%xmm4\n"
|
||||
"movdqa %%xmm1, %%xmm5\n"
|
||||
/* Divide the results by 256 */
|
||||
"psrlw $8, %%xmm0\n"
|
||||
"psrlw $8, %%xmm1\n"
|
||||
/* Add the temporaries back in */
|
||||
"paddw %%xmm4, %%xmm0\n"
|
||||
"paddw %%xmm5, %%xmm1\n"
|
||||
/* Divide again */
|
||||
"psrlw $8, %%xmm0\n"
|
||||
"psrlw $8, %%xmm1\n"
|
||||
/* Pack the results back as bytes */
|
||||
"packuswb %%xmm1, %%xmm0\n"
|
||||
/* Load just_rgb into xmm3 for later */
|
||||
"movdqa (%2), %%xmm3\n"
|
||||
/* Reload all four pixels into xmm2 */
|
||||
"movups (%0), %%xmm2\n"
|
||||
/* Mask out the alpha from the results */
|
||||
"andps %%xmm3, %%xmm0\n"
|
||||
/* Mask out the RGB from the original four pixels */
|
||||
"andnps %%xmm2, %%xmm3\n"
|
||||
/* Combine the two to get the right alpha values */
|
||||
"orps %%xmm3, %%xmm0\n"
|
||||
/* Write to memory */
|
||||
"movdqu %%xmm0, (%0)\n"
|
||||
: /* no outputs */
|
||||
: "r" (p), "r" (eight_halves), "r" (just_rgb)
|
||||
: "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
|
||||
}
|
||||
|
||||
#endif /* COGL_USE_PREMULT_SSE2 */
|
||||
|
||||
gboolean
|
||||
_cogl_bitmap_fallback_can_convert (CoglPixelFormat src, CoglPixelFormat dst)
|
||||
{
|
||||
|
@ -408,7 +493,24 @@ _cogl_bitmap_fallback_premult (CoglBitmap *bmp)
|
|||
}
|
||||
else
|
||||
{
|
||||
for (x = 0; x < bmp->width; x++)
|
||||
x = bmp->width;
|
||||
|
||||
#ifdef COGL_USE_PREMULT_SSE2
|
||||
|
||||
/* Process 4 pixels at a time */
|
||||
while (x >= 4)
|
||||
{
|
||||
_cogl_premult_alpha_last_four_pixels_sse2 (p);
|
||||
p += 4 * 4;
|
||||
x -= 4;
|
||||
}
|
||||
|
||||
/* If there are any pixels left we will fall through and
|
||||
handle them below */
|
||||
|
||||
#endif /* COGL_USE_PREMULT_SSE2 */
|
||||
|
||||
while (x-- > 0)
|
||||
{
|
||||
_cogl_premult_alpha_last (p);
|
||||
p += 4;
|
||||
|
|
Loading…
Reference in a new issue