11 files changed, 4318 insertions, 0 deletions
diff --git a/apps/plugins/sdl/src/video/ps3/SDL_ps3events.c b/apps/plugins/sdl/src/video/ps3/SDL_ps3events.c
new file mode 100644
index 0000000000..e39efcc4f0
--- /dev/null
+++ b/apps/plugins/sdl/src/video/ps3/SDL_ps3events.c
@@ -0,0 +1,44 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
+ *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ *  SPE code based on research by:
+ *  Rene Becker
+ *  Thimo Emmerich
+ */
+#include "SDL_config.h"
+#include "../../events/SDL_sysevents.h"
+#include "../../events/SDL_events_c.h"
+#include "SDL_ps3video.h"
+#include "SDL_ps3events_c.h"
+/* PumpEvents hook of the PS3 driver; it performs no work. */
+void PS3_PumpEvents(_THIS)
+{
+        /* no-op */
+}
+/* InitOSKeymap hook of the PS3 driver; it performs no work. */
+void PS3_InitOSKeymap(_THIS)
+{
+        /* no-op */
+}
diff --git a/apps/plugins/sdl/src/video/ps3/SDL_ps3events_c.h b/apps/plugins/sdl/src/video/ps3/SDL_ps3events_c.h
new file mode 100644
index 0000000000..fd11209af1
--- /dev/null
+++ b/apps/plugins/sdl/src/video/ps3/SDL_ps3events_c.h
@@ -0,0 +1,41 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
+ *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ *  SPE code based on research by:
+ *  Rene Becker
+ *  Thimo Emmerich
+ */
+#include "SDL_config.h"
+#ifndef _SDL_ps3events_h
+#define _SDL_ps3events_h
+#include "SDL_ps3video.h"
+/* Event hooks of the PS3 driver (implemented in SDL_ps3events.c). */
+extern void PS3_InitOSKeymap(_THIS);
+extern void PS3_PumpEvents(_THIS);
+/* Switches /dev/console to text mode (enable != 0) or graphics mode
+ * (enable == 0); implemented in SDL_ps3video.c. */
+extern void enable_cursor(int enable);
+#endif /* _SDL_ps3events_h */
diff --git a/apps/plugins/sdl/src/video/ps3/SDL_ps3video.c b/apps/plugins/sdl/src/video/ps3/SDL_ps3video.c
new file mode 100644
index 0000000000..d5519e051e
--- /dev/null
+++ b/apps/plugins/sdl/src/video/ps3/SDL_ps3video.c
@@ -0,0 +1,621 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
+ *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ *  SPE code based on research by:
+ *  Rene Becker
+ *  Thimo Emmerich
+ */
+#include "SDL_config.h"
+#include "SDL_video.h"
+#include "../SDL_sysvideo.h"
+#include "SDL_ps3events_c.h"
+#include "SDL_ps3video.h"
+#include "SDL_ps3yuv_c.h"
+#include "spulibs/spu_common.h"
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <linux/kd.h>
+#include <sys/mman.h>
+#include <linux/fb.h>
+#include <asm/ps3fb.h>
+#include <libspe2.h>
+#include <malloc.h>
+/* SDL_VideoDevice functions */
+static int PS3_Available(void);
+static SDL_VideoDevice *PS3_CreateDevice(int devindex);
+static int PS3_VideoInit(_THIS, SDL_PixelFormat * vformat);
+static void PS3_VideoQuit(_THIS);
+static void PS3_DeleteDevice(SDL_VideoDevice * device);
+static SDL_Surface *PS3_SetVideoMode(_THIS, SDL_Surface * current, int width, int height, int bpp, Uint32 flags);
+static SDL_Rect **PS3_ListModes(_THIS, SDL_PixelFormat * format, Uint32 flags);
+/* Hardware surface functions */
+static int PS3_AllocHWSurface(_THIS, SDL_Surface * surface);
+static void PS3_FreeHWSurface(_THIS, SDL_Surface * surface);
+static int PS3_LockHWSurface(_THIS, SDL_Surface * surface);
+static void PS3_UnlockHWSurface(_THIS, SDL_Surface * surface);
+static int PS3_FlipDoubleBuffer(_THIS, SDL_Surface * surface);
+static void PS3_DoubleBufferUpdate(_THIS, int numrects, SDL_Rect * rects);
+/* SPU specific functions (external linkage; also declared in SDL_ps3yuv.c) */
+int SPE_Start(_THIS, spu_data_t * spe_data);
+int SPE_Stop(_THIS, spu_data_t * spe_data);
+int SPE_Boot(_THIS, spu_data_t * spe_data);
+int SPE_Shutdown(_THIS, spu_data_t * spe_data);
+int SPE_SendMsg(_THIS, spu_data_t * spe_data, unsigned int msg);
+int SPE_WaitForMsg(_THIS, spu_data_t * spe_data, unsigned int msg);
+void SPE_RunContext(void *thread_argp);
+/* Helpers */
+void enable_cursor(int enable);
+/* Program handle of the fb_writer_spu SPE program (defined elsewhere) */
+extern spe_program_handle_t fb_writer_spu;
+/* SDL PS3 bootstrap function for checking availability.
+ * Always reports the driver as available (returns 1); no probing is done.
+ */
+static int PS3_Available(void)
+{
+        return 1;
+}
+/* SDL PS3 bootstrap function for creating the device.
+ *
+ * Allocates a zero-filled SDL_VideoDevice plus its zero-filled private data
+ * and installs the driver entry points. devindex is not used.
+ * Returns the new device, or a null pointer (after SDL_OutOfMemory()) when
+ * either allocation fails.
+ */
+static SDL_VideoDevice *PS3_CreateDevice(int devindex)
+{
+        SDL_VideoDevice *this =
+            (SDL_VideoDevice *) SDL_malloc(sizeof(SDL_VideoDevice));
+        if (this == NULL) {
+                SDL_OutOfMemory();
+                return NULL;
+        }
+        memset(this, 0, sizeof *this);
+        this->hidden = (struct SDL_PrivateVideoData *)
+            SDL_malloc(sizeof(struct SDL_PrivateVideoData));
+        if (this->hidden == NULL) {
+                SDL_OutOfMemory();
+                SDL_free(this);
+                return NULL;
+        }
+        memset(this->hidden, 0, sizeof(struct SDL_PrivateVideoData));
+        /* Entry points implemented by this driver */
+        this->VideoInit = PS3_VideoInit;
+        this->ListModes = PS3_ListModes;
+        this->SetVideoMode = PS3_SetVideoMode;
+        this->CreateYUVOverlay = PS3_CreateYUVOverlay;
+        this->VideoQuit = PS3_VideoQuit;
+        this->AllocHWSurface = PS3_AllocHWSurface;
+        this->LockHWSurface = PS3_LockHWSurface;
+        this->UnlockHWSurface = PS3_UnlockHWSurface;
+        this->FlipHWSurface = PS3_FlipDoubleBuffer;
+        this->FreeHWSurface = PS3_FreeHWSurface;
+        this->InitOSKeymap = PS3_InitOSKeymap;
+        this->PumpEvents = PS3_PumpEvents;
+        this->free = PS3_DeleteDevice;
+        /* Entry points left unset; UpdateRects is installed later by
+         * PS3_SetVideoMode */
+        this->SetColors = NULL;
+        this->UpdateRects = NULL;
+        this->CheckHWBlit = NULL;
+        this->FillHWRect = NULL;
+        this->SetHWColorKey = NULL;
+        this->SetHWAlpha = NULL;
+        this->SetCaption = NULL;
+        this->SetIcon = NULL;
+        this->IconifyWindow = NULL;
+        this->GrabInput = NULL;
+        this->GetWMInfo = NULL;
+        return this;
+}
+/* Bootstrapping entry for this driver (see SDL_sysvideo.h): driver name,
+ * description, availability check and device factory. */
+VideoBootStrap PS3_bootstrap = {
+        "ps3", "PS3 Cell SPU Driver",
+        PS3_Available, PS3_CreateDevice
+};
+/* Delete the device.
+ *
+ * Releases the private data and the device structure themselves. Both were
+ * obtained with SDL_malloc() in PS3_CreateDevice(), so they are released
+ * with the matching SDL_free(). A null device is ignored.
+ */
+static void PS3_DeleteDevice(SDL_VideoDevice * device)
+{
+        if (device == NULL)
+                return;
+        SDL_free(device->hidden);
+        SDL_free(device);
+}
+/* Initialise the PS3 video device.
+ *
+ * Switches the console to graphics mode, allocates the framebuffer-writer
+ * parameter block and SPE thread data, starts the fb_writer SPE program,
+ * opens PS3_DEV_FB, queries/sets the variable screen info and takes control
+ * of the framebuffer from the kernel.
+ *
+ * vformat: receives the bits-per-pixel value reported by the framebuffer.
+ * Returns 0 on success, -1 on failure (an SDL error is set).
+ */
+static int PS3_VideoInit(_THIS, SDL_PixelFormat * vformat)
+{
+        /* Hide the cursor */
+        enable_cursor(0);
+        /* Create SPU fb_parms and thread structure */
+        fb_parms = (struct fb_writer_parms_t *)
+            memalign(16, sizeof(struct fb_writer_parms_t));
+        fb_thread_data = (spu_data_t *) malloc(sizeof(spu_data_t));
+        if (fb_parms == NULL || fb_thread_data == NULL) {
+                /* Release whichever allocation succeeded and clear both
+                 * pointers, so PS3_VideoQuit() never sees a half-initialised
+                 * fb_thread_data */
+                free((void *)fb_parms);
+                free(fb_thread_data);
+                fb_parms = NULL;
+                fb_thread_data = NULL;
+                SDL_OutOfMemory();
+                return -1;
+        }
+        /* Start from a fully defined state (ctx, thread, entry, ...) */
+        memset(fb_thread_data, 0, sizeof(spu_data_t));
+        fb_thread_data->program = fb_writer_spu;
+        fb_thread_data->program_name = "fb_writer_spu";
+        fb_thread_data->argp = (void *)fb_parms;
+        fb_thread_data->keepalive = 1;
+        fb_thread_data->booted = 0;
+        SPE_Start(this, fb_thread_data);
+        /* Open the device */
+        fb_dev_fd = open(PS3_DEV_FB, O_RDWR);
+        if (fb_dev_fd < 0) {
+                SDL_SetError("[PS3] Unable to open device %s", PS3_DEV_FB);
+                return -1;
+        }
+        /* Get vscreeninfo */
+        if (ioctl(fb_dev_fd, FBIOGET_VSCREENINFO, &fb_vinfo)) {
+                SDL_SetError("[PS3] Can't get VSCREENINFO");
+                close(fb_dev_fd);
+                fb_dev_fd = -1;
+                return -1;
+        }
+        /* Fill in our hardware acceleration capabilities */
+        this->info.current_w = fb_vinfo.xres;
+        this->info.current_h = fb_vinfo.yres;
+        this->info.wm_available = 0;
+        this->info.hw_available = 1;
+        /* Backup the original vinfo to restore later */
+        fb_orig_vinfo = fb_vinfo;
+        /* 16 and 15 bpp is reported as 16 bpp; tell them apart by summing
+         * the colour channel widths */
+        fb_bits_per_pixel = fb_vinfo.bits_per_pixel;
+        if (fb_bits_per_pixel == 16)
+                fb_bits_per_pixel =
+                    fb_vinfo.red.length + fb_vinfo.green.length +
+                    fb_vinfo.blue.length;
+        /* Set SDL_PixelFormat */
+        vformat->BitsPerPixel = fb_vinfo.bits_per_pixel;
+        fb_vinfo.xres_virtual = fb_vinfo.xres;
+        fb_vinfo.yres_virtual = fb_vinfo.yres;
+        /* Put vscreeninfo */
+        if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_vinfo)) {
+                SDL_SetError("[PS3] Can't put VSCREENINFO");
+                close(fb_dev_fd);
+                fb_dev_fd = -1;
+                return -1;
+        }
+        s_fb_pixel_size = fb_vinfo.bits_per_pixel / 8;
+        s_writeable_width = fb_vinfo.xres;
+        s_writeable_height = fb_vinfo.yres;
+        /* Get ps3 screeninfo. A failure only records an SDL error; res then
+         * keeps the zero values from device creation, which disables double
+         * buffering below. */
+        if (ioctl(fb_dev_fd, PS3FB_IOCTL_SCREENINFO, (unsigned long)&res) < 0) {
+                SDL_SetError("[PS3] PS3FB_IOCTL_SCREENINFO failed");
+        }
+        deprintf(1, "[PS3] xres:%d yres:%d xoff:%d yoff:%d\n", res.xres, res.yres, res.xoff, res.yoff);
+        /* Only use double buffering if enough fb memory is available */
+        if (res.num_frames < 2) {
+                double_buffering = 0;
+        } else {
+                double_buffering = 1;
+        }
+        real_width = res.xres;
+        real_height = res.yres;
+        /*
+         * Take control of frame buffer from kernel, for details see
+         * http://felter.org/wesley/files/ps3/linux-20061110-docs/ApplicationProgrammingEnvironment.html
+         * kernel will no longer flip the screen itself
+         */
+        ioctl(fb_dev_fd, PS3FB_IOCTL_ON, 0);
+        /* Unblank screen */
+        ioctl(fb_dev_fd, FBIOBLANK, 0);
+        return 0;
+}
+/* List available PS3 resolutions.
+ *
+ * Returns a fixed, NULL-terminated table of modes ordered from largest to
+ * smallest; the format and flags arguments are not consulted.
+ */
+static SDL_Rect **PS3_ListModes(_THIS, SDL_PixelFormat * format, Uint32 flags)
+{
+        static SDL_Rect PS3_resolutions[] = {
+                {0, 0, 1920, 1080},     /* 1080p 16:9 HD */
+                {0, 0, 1600, 1200},     /* UXGA */
+                {0, 0, 1280, 1024},     /* SXGA */
+                {0, 0, 1280, 720},      /* 720p 16:9 HD */
+                {0, 0, 1024, 768},      /* XGA */
+                {0, 0, 1024, 576},      /* 576p 16:9 */
+                {0, 0, 853, 480},       /* 480p 16:9 */
+                {0, 0, 720, 576},       /* 576p (PAL) */
+                {0, 0, 720, 480},       /* 480p (NTSC) */
+        };
+        /* Pointer view of the table above, terminated by NULL */
+        static SDL_Rect *PS3_modes[] = {
+                PS3_resolutions + 0,
+                PS3_resolutions + 1,
+                PS3_resolutions + 2,
+                PS3_resolutions + 3,
+                PS3_resolutions + 4,
+                PS3_resolutions + 5,
+                PS3_resolutions + 6,
+                PS3_resolutions + 7,
+                PS3_resolutions + 8,
+                NULL
+        };
+        return PS3_modes;
+}
+/* Set up the requested video mode.
+ *
+ * Clamps the requested size to the writeable screen area, maps the
+ * framebuffer, computes the centred write positions for both pages and
+ * allocates a 16-byte aligned pixel buffer that becomes current->pixels.
+ * Installs PS3_DoubleBufferUpdate as the UpdateRects hook.
+ *
+ * Returns current on success, NULL on failure (an SDL error is set).
+ * The bpp argument is not consulted; the depth detected in PS3_VideoInit
+ * is used instead.
+ */
+static SDL_Surface *PS3_SetVideoMode(_THIS, SDL_Surface * current, int width, int height, int bpp, Uint32 flags)
+{
+        s_bounded_input_width = width < s_writeable_width ? width : s_writeable_width;
+        s_bounded_input_height = height < s_writeable_height ? height : s_writeable_height;
+        s_bounded_input_width_offset = (s_writeable_width - s_bounded_input_width) >> 1;
+        s_bounded_input_height_offset = (s_writeable_height - s_bounded_input_height) >> 1;
+        s_input_line_length = width * s_fb_pixel_size;
+        current->flags |= flags;
+        if (ioctl(fb_dev_fd, FBIOGET_FSCREENINFO, &fb_finfo)) {
+                SDL_SetError("[PS3] Can't get fixed screeninfo");
+                return NULL;
+        }
+        if (fb_finfo.type != FB_TYPE_PACKED_PIXELS) {
+                /* fb_finfo.type is an integer code, so it must not be
+                 * formatted with %s */
+                SDL_SetError("[PS3] type %d not supported",
+                             (int)fb_finfo.type);
+                return NULL;
+        }
+        /* Note: on PS3, fb_finfo.smem_len is enough for double buffering */
+        frame_buffer = (uint8_t *) mmap(0, fb_finfo.smem_len,
+                                        PROT_READ | PROT_WRITE, MAP_SHARED,
+                                        fb_dev_fd, 0);
+        if (frame_buffer == (uint8_t *) MAP_FAILED) {
+                /* Do not leave MAP_FAILED behind: PS3_VideoQuit() unmaps any
+                 * non-null frame_buffer */
+                frame_buffer = NULL;
+                SDL_SetError("[PS3] Can't mmap for %s", PS3_DEV_FB);
+                return NULL;
+        }
+        current->flags |= SDL_DOUBLEBUF;
+        if (!SDL_ReallocFormat(current, fb_bits_per_pixel, 0, 0, 0, 0)) {
+                return (NULL);
+        }
+        /* Blank screen */
+        memset(frame_buffer, 0x00, fb_finfo.smem_len);
+        /* Centering: page 1 starts real_height lines after page 0 */
+        s_center[0] =
+            frame_buffer + s_bounded_input_width_offset * s_fb_pixel_size +
+            s_bounded_input_height_offset * fb_finfo.line_length;
+        s_center[1] = s_center[0] + real_height * fb_finfo.line_length;
+        s_center_index = 0;
+        current->flags |= SDL_FULLSCREEN;
+        current->w = width;
+        current->h = height;
+        current->pitch = SDL_CalculatePitch(current);
+        /* Alloc aligned mem for current->pixels */
+        s_pixels = memalign(16, current->h * current->pitch);
+        current->pixels = (void *)s_pixels;
+        if (!current->pixels) {
+                SDL_OutOfMemory();
+                return NULL;
+        }
+        /* Set the update rectangle function */
+        this->UpdateRects = PS3_DoubleBufferUpdate;
+        return current;
+}
+/* Copy screen to framebuffer and flip.
+ *
+ * Hands the software pixel buffer to the fb_writer SPE program, waits for
+ * it to finish, then toggles the page index (when double buffering) and
+ * calls PS3_FlipDoubleBuffer(). numrects and rects are not consulted: the
+ * whole bounded input area is copied on every call.
+ */
+void PS3_DoubleBufferUpdate(_THIS, int numrects, SDL_Rect * rects)
+{
+        /* If a converter SPE is booted, wait for its SPU_FIN message first */
+        if (converter_thread_data && converter_thread_data->booted)
+                SPE_WaitForMsg(this, converter_thread_data, SPU_FIN);
+        /* Adjust centering */
+        s_bounded_input_width_offset = (s_writeable_width - s_bounded_input_width) >> 1;
+        s_bounded_input_height_offset = (s_writeable_height - s_bounded_input_height) >> 1;
+        s_center[0] = frame_buffer + s_bounded_input_width_offset * s_fb_pixel_size +
+                s_bounded_input_height_offset * fb_finfo.line_length;
+        s_center[1] = s_center[0] + real_height * fb_finfo.line_length;
+        /* Set SPU parms for copying the surface to framebuffer */
+        fb_parms->data = (unsigned char *)s_pixels;
+        fb_parms->center = s_center[s_center_index];
+        fb_parms->out_line_stride = fb_finfo.line_length;
+        fb_parms->in_line_stride = s_input_line_length;
+        fb_parms->bounded_input_height = s_bounded_input_height;
+        fb_parms->bounded_input_width = s_bounded_input_width;
+        fb_parms->fb_pixel_size = s_fb_pixel_size;
+        deprintf(3, "[PS3->SPU] fb_thread_data->argp = 0x%x\n", fb_thread_data->argp);
+        /* Copying: send SPU_START, then the parameter block address as a
+         * second mailbox message, and block until SPU_FIN comes back.
+         * NOTE(review): the address is narrowed to unsigned int - confirm
+         * the driver is only built where pointers fit in 32 bits. */
+        SPE_SendMsg(this, fb_thread_data, SPU_START);
+        SPE_SendMsg(this, fb_thread_data, (unsigned int)fb_thread_data->argp);
+        SPE_WaitForMsg(this, fb_thread_data, SPU_FIN);
+        /* Flip the pages: the index is toggled before the flip call, so the
+         * flip is issued with the already-toggled index */
+        if (double_buffering)
+                s_center_index = s_center_index ^ 0x01;
+        PS3_FlipDoubleBuffer(this, this->screen);
+}
+/* Enable/Disable cursor.
+ *
+ * Opens /dev/console and switches it to text mode (enable != 0) or
+ * graphics mode (enable == 0). Silently does nothing when the console
+ * cannot be opened.
+ */
+void enable_cursor(int enable)
+{
+        int console_fd = open("/dev/console", O_RDWR | O_NONBLOCK);
+        if (console_fd < 0)
+                return;
+        if (enable)
+                ioctl(console_fd, KDSETMODE, KD_TEXT);
+        else
+                ioctl(console_fd, KDSETMODE, KD_GRAPHICS);
+        close(console_fd);
+}
+/* AllocHWSurface hook: always returns -1 without allocating anything. */
+static int PS3_AllocHWSurface(_THIS, SDL_Surface * surface)
+{
+        const int result = -1;
+        return result;
+}
+/* FreeHWSurface hook: performs no work. */
+static void PS3_FreeHWSurface(_THIS, SDL_Surface * surface)
+{
+        /* no-op */
+}
+/* LockHWSurface hook: performs no work and always returns 0. */
+static int PS3_LockHWSurface(_THIS, SDL_Surface * surface)
+{
+        const int result = 0;
+        return result;
+}
+/* UnlockHWSurface hook: performs no work. */
+static void PS3_UnlockHWSurface(_THIS, SDL_Surface * surface)
+{
+        /* no-op */
+}
+/* Blit/Flip buffer to the screen. Must be called after each frame!
+ *
+ * Waits for vertical sync, then asks the framebuffer driver to select the
+ * frame given by s_center_index. The surface argument is not consulted and
+ * the ioctl results are not checked; the function always returns 1.
+ */
+int PS3_FlipDoubleBuffer(_THIS, SDL_Surface * surface)
+{
+        unsigned long vsync_arg = 0;
+
+        deprintf(1, "[PS3] Wait for vsync\n");
+        ioctl(fb_dev_fd, FBIO_WAITFORVSYNC, &vsync_arg);
+
+        deprintf(1, "[PS3] Page flip to buffer #%u 0x%x\n", s_center_index, s_center[s_center_index]);
+        ioctl(fb_dev_fd, PS3FB_IOCTL_FSEL, (unsigned long)&s_center_index);
+
+        return 1;
+}
+/* Start the SPE thread.
+ *
+ * Boots the SPE context first if that has not happened yet, then creates a
+ * pthread that runs SPE_RunContext() on spe_data. For keepalive programs
+ * the call blocks until the program reports SPU_READY.
+ *
+ * Returns 0 on success and -1 when booting, thread creation or the
+ * SPU_READY handshake fails.
+ */
+int SPE_Start(_THIS, spu_data_t * spe_data)
+{
+        deprintf(2, "[PS3->SPU] Start SPE: %s\n", spe_data->program_name);
+        /* Without a booted context there is nothing to run */
+        if (!(spe_data->booted) && SPE_Boot(this, spe_data) < 0)
+                return -1;
+        /* To allow re-running of context, spe_ctx_entry has to be set before each call */
+        spe_data->entry = SPE_DEFAULT_ENTRY;
+        spe_data->error_code = 0;
+        /* Create SPE thread and run */
+        deprintf(2, "[PS3->SPU] Create Thread: %s\n", spe_data->program_name);
+        if (pthread_create
+            (&spe_data->thread, NULL, (void *)&SPE_RunContext, (void *)spe_data)) {
+                deprintf(2, "[PS3->SPU] Could not create pthread for spe: %s\n", spe_data->program_name);
+                SDL_SetError("[PS3->SPU] Could not create pthread for spe");
+                return -1;
+        }
+        if (spe_data->keepalive)
+                return SPE_WaitForMsg(this, spe_data, SPU_READY);
+        /* The original fell off the end here without returning a value */
+        return 0;
+}
+/* Stop the SPE thread.
+ *
+ * Joins the pthread stored in spe_data. Returns 0 once the thread has been
+ * joined, or -1 (with an SDL error set) when pthread_join() fails.
+ */
+int SPE_Stop(_THIS, spu_data_t * spe_data)
+{
+        int join_rc;
+
+        deprintf(2, "[PS3->SPU] Stop SPE: %s\n", spe_data->program_name);
+        deprintf(2, "[PS3->SPU] Wait for SPE thread to complete: %s\n", spe_data->program_name);
+        join_rc = pthread_join(spe_data->thread, NULL);
+        if (join_rc == 0)
+                return 0;
+        deprintf(2, "[PS3->SPU] Failed joining the thread: %s\n", spe_data->program_name);
+        SDL_SetError("[PS3->SPU] Failed joining the thread");
+        return -1;
+}
+/* Create SPE context and load program.
+ *
+ * On success spe_data->ctx holds the new context with spe_data->program
+ * loaded and spe_data->booted is set to 1. Returns 0 on success, -1 (with
+ * an SDL error set) when context creation or program loading fails; a
+ * context created before a failed load stays in spe_data->ctx.
+ */
+int SPE_Boot(_THIS, spu_data_t * spe_data)
+{
+        int load_rc;
+
+        deprintf(2, "[PS3->SPU] Create SPE Context: %s\n", spe_data->program_name);
+        spe_data->ctx = spe_context_create(0, NULL);
+        if (!spe_data->ctx) {
+                deprintf(2, "[PS3->SPU] Failed creating SPE context: %s\n", spe_data->program_name);
+                SDL_SetError("[PS3->SPU] Failed creating SPE context");
+                return -1;
+        }
+
+        deprintf(2, "[PS3->SPU] Load Program into SPE: %s\n", spe_data->program_name);
+        load_rc = spe_program_load(spe_data->ctx, &spe_data->program);
+        if (load_rc != 0) {
+                deprintf(2, "[PS3->SPU] Failed loading program into SPE context: %s\n", spe_data->program_name);
+                SDL_SetError
+                    ("[PS3->SPU] Failed loading program into SPE context");
+                return -1;
+        }
+
+        spe_data->booted = 1;
+        deprintf(2, "[PS3->SPU] SPE boot successful\n");
+        return 0;
+}
+/* (Stop and) shutdown the SPE.
+ *
+ * A booted keepalive program is first sent SPU_EXIT and its thread is
+ * joined; afterwards the SPE context is destroyed. Returns 0 on success,
+ * -1 (with an SDL error set) when the context cannot be destroyed.
+ */
+int SPE_Shutdown(_THIS, spu_data_t * spe_data)
+{
+        const int needs_stop = spe_data->keepalive && spe_data->booted;
+
+        if (needs_stop) {
+                SPE_SendMsg(this, spe_data, SPU_EXIT);
+                SPE_Stop(this, spe_data);
+        }
+
+        deprintf(2, "[PS3->SPU] Destroy SPE context: %s\n", spe_data->program_name);
+        if (spe_context_destroy(spe_data->ctx) != 0) {
+                deprintf(2, "[PS3->SPU] Failed destroying context: %s\n", spe_data->program_name);
+                SDL_SetError("[PS3->SPU] Failed destroying context");
+                return -1;
+        }
+
+        deprintf(2, "[PS3->SPU] SPE shutdown successful: %s\n", spe_data->program_name);
+        return 0;
+}
+/* Send message to the SPE via mailbox.
+ *
+ * Writes the single word msg to the inbound mailbox of spe_data->ctx using
+ * SPE_MBOX_ALL_BLOCKING. Returns 0 when the word was written, -1 (with an
+ * SDL error set) otherwise.
+ */
+int SPE_SendMsg(_THIS, spu_data_t * spe_data, unsigned int msg)
+{
+        unsigned int outgoing = msg;
+        int written;
+
+        deprintf(2, "[PS3->SPU] Sending message %u to %s\n", msg, spe_data->program_name);
+        written = spe_in_mbox_write(spe_data->ctx, &outgoing, 1, SPE_MBOX_ALL_BLOCKING);
+        if (written < 1) {
+                deprintf(2, "[PS3->SPU] No message could be written to %s\n", spe_data->program_name);
+                SDL_SetError("[PS3->SPU] No message could be written");
+                return -1;
+        }
+        return 0;
+}
+/* Read 1 message from SPE, block until at least 1 message was received.
+ *
+ * Busy-waits on the outbound mailbox status of spe_data->ctx, reads one
+ * word and compares it with msg.
+ *
+ * Returns 0 when the received word equals msg, and -1 when it differs or
+ * when the mailbox status/read call reports an error (an SDL error is set
+ * in the error case).
+ */
+int SPE_WaitForMsg(_THIS, spu_data_t * spe_data, unsigned int msg)
+{
+        unsigned int out_messages[1] = { 0 };
+        int pending;
+
+        deprintf(2, "[PS3->SPU] Waiting for message from %s\n", spe_data->program_name);
+        /* Spin while the mailbox is empty; any non-zero status (a pending
+         * message or an error) ends the loop, as in the original */
+        while ((pending = spe_out_mbox_status(spe_data->ctx)) == 0);
+        /* Never compare a word that was not actually read */
+        if (pending < 0 || spe_out_mbox_read(spe_data->ctx, out_messages, 1) < 1) {
+                deprintf(2, "[PS3->SPU] No message could be read from %s\n", spe_data->program_name);
+                SDL_SetError("[PS3->SPU] No message could be read");
+                return -1;
+        }
+        deprintf(2, "[PS3->SPU] Got message from %s, message was %u\n", spe_data->program_name, out_messages[0]);
+        return (out_messages[0] == msg) ? 0 : -1;
+}
+/* Re-runnable invocation of the spe_context_run call.
+ *
+ * Thread entry used by SPE_Start(): thread_argp is the spu_data_t of the
+ * program to run. If spe_context_run() reports failure an SDL error is set
+ * and the whole process exits with status 1; otherwise the thread ends via
+ * pthread_exit().
+ */
+void SPE_RunContext(void *thread_argp)
+{
+        spu_data_t *spe = (spu_data_t *) thread_argp;
+        int run_rc;
+
+        /* spe->argp is the argument handed to the SPE program */
+        deprintf(3, "[PS3->SPU] void* argp=0x%x\n", (unsigned int)spe->argp);
+        deprintf(2, "[PS3->SPU] Run SPE program: %s\n", spe->program_name);
+
+        run_rc = spe_context_run(spe->ctx, &spe->entry, 0, (void *)spe->argp,
+                                 NULL, NULL);
+        if (run_rc < 0) {
+                deprintf(2, "[PS3->SPU] Failed running SPE context: %s\n", spe->program_name);
+                SDL_SetError("[PS3->SPU] Failed running SPE context: %s", spe->program_name);
+                exit(1);
+        }
+        pthread_exit(NULL);
+}
+/* Quits the video driver.
+ *
+ * Restores the original video mode, returns framebuffer control to the
+ * kernel, unmaps the framebuffer, shuts down the fb_writer SPE, releases
+ * the driver's buffers and switches the console back to text mode. Every
+ * released pointer is cleared so a repeated call is harmless.
+ */
+static void PS3_VideoQuit(_THIS)
+{
+        if (fb_dev_fd > 0) {
+                /* Restore the original video mode */
+                if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_orig_vinfo))
+                        SDL_SetError("[PS3] Can't restore original fb_var_screeninfo");
+                /* Give control of frame buffer to kernel */
+                ioctl(fb_dev_fd, PS3FB_IOCTL_OFF, 0);
+                close(fb_dev_fd);
+                fb_dev_fd = -1;
+        }
+        /* A failed mmap in PS3_SetVideoMode may have left MAP_FAILED here */
+        if (frame_buffer && frame_buffer != (uint8_t *) MAP_FAILED)
+                munmap(frame_buffer, fb_finfo.smem_len);
+        frame_buffer = 0;
+        /* Shut the SPE down before releasing fb_parms: the SPE program was
+         * started with the address of fb_parms as its argument */
+        if (fb_thread_data) {
+                SPE_Shutdown(this, fb_thread_data);
+                free((void *)fb_thread_data);
+                fb_thread_data = NULL;
+        }
+        free((void *)fb_parms);
+        fb_parms = NULL;
+        if (this->screen) {
+                /* The pixel buffer is allocated by PS3_SetVideoMode whether
+                 * or not double buffering is active, so release it in both
+                 * cases, then detach it from the surface */
+                free(this->screen->pixels);
+                this->screen->pixels = NULL;
+                s_pixels = NULL;
+        }
+        enable_cursor(1);
+        deprintf(1, "[PS3] VideoQuit\n");
+}
diff --git a/apps/plugins/sdl/src/video/ps3/SDL_ps3video.h b/apps/plugins/sdl/src/video/ps3/SDL_ps3video.h
new file mode 100644
index 0000000000..4fe5a2b42b
--- /dev/null
+++ b/apps/plugins/sdl/src/video/ps3/SDL_ps3video.h
@@ -0,0 +1,165 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
+ *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ *  SPE code based on research by:
+ *  Rene Becker
+ *  Thimo Emmerich
+ */
+#include "SDL_config.h"
+#include "../SDL_sysvideo.h"
+#include "SDL_mouse.h"
+#include "SDL_mutex.h"
+#include "spulibs/spu_common.h"
+#include <libspe2.h>
+#include <pthread.h>
+#include <linux/types.h>
+#include <linux/fb.h>
+#include <asm/ps3fb.h>
+#include <linux/vt.h>
+#include <termios.h>
+#ifndef _SDL_ps3video_h
+#define _SDL_ps3video_h
+/* Debugging
+ * 0: No debug messages
+ * 1: Video debug messages
+ * 2: SPE debug messages
+ * 3: Memory addresses
+ *
+ * deprintf(level, fmt, ...) prints to stdout and flushes when
+ * level <= DEBUG_LEVEL. DEBUG_LEVEL is defined right here, so the #ifdef
+ * branch below is the one that is compiled; the arguments are still
+ * evaluated syntactically even at level 0.
+ * NOTE(review): the macro uses fprintf/fflush but this header does not
+ * include <stdio.h> itself - presumably it arrives via another include.
+ */
+#define DEBUG_LEVEL 0
+#ifdef DEBUG_LEVEL
+#define deprintf( level, fmt, args... ) \
+    do \
+{ \
+    if ( (unsigned)(level) <= DEBUG_LEVEL ) \
+    { \
+        fprintf( stdout, fmt, ##args ); \
+        fflush( stdout ); \
+    } \
+} while ( 0 )
+#else
+#define deprintf( level, fmt, args... )
+#endif
+/* Framebuffer device */
+#define PS3_DEV_FB "/dev/fb0"
+/* Hidden "this" pointer for the video functions */
+#define _THIS   SDL_VideoDevice * this
+/* SPU thread data: everything needed to boot, run and stop one SPE program */
+typedef struct spu_data {
+    spe_context_ptr_t ctx;          /* SPE context created by SPE_Boot() */
+    pthread_t thread;               /* thread running SPE_RunContext() */
+    spe_program_handle_t program;   /* program loaded into the context */
+    char * program_name;            /* name used in debug messages */
+    unsigned int booted;            /* set to 1 once SPE_Boot() succeeded */
+    unsigned int keepalive;         /* non-zero: SPE_Start() waits for SPU_READY
+                                       and SPE_Shutdown() sends SPU_EXIT */
+    unsigned int entry;             /* entry point, reset before each run */
+    int error_code;                 /* reset to 0 by SPE_Start() */
+    void * argp;                    /* argument passed to spe_context_run() */
+} spu_data_t;
+/* Private video driver data needed for Cell support.
+ * One instance hangs off SDL_VideoDevice::hidden; the fields are reached
+ * through the accessor macros that follow this struct.
+ */
+struct SDL_PrivateVideoData
+{
+    const char * const fb_dev_name; /* FB-device name */
+    int fb_dev_fd; /* Descriptor-handle for fb_dev_name */
+    uint8_t * frame_buffer; /* mmap'd access to fbdev */
+    /* SPE threading stuff */
+    spu_data_t * fb_thread_data;        /* fb_writer program (framebuffer copy) */
+    spu_data_t * scaler_thread_data;
+    spu_data_t * converter_thread_data;
+    /* screeninfo (from linux/fb.h) */
+    struct fb_fix_screeninfo fb_finfo;
+    struct fb_var_screeninfo fb_vinfo;
+    struct fb_var_screeninfo fb_orig_vinfo; /* saved at init, restored at quit */
+    /* screeninfo (from asm/ps3fb.h) */
+    struct ps3fb_ioctl_res res;
+    unsigned int double_buffering; /* 1 when the fb reports at least 2 frames */
+    uint32_t real_width;      // real width of screen
+    uint32_t real_height;     // real height of screen
+    uint32_t s_fb_pixel_size;   // 32:  4  24:  3  16:  2  15:  2
+    uint32_t fb_bits_per_pixel;   // 32: 32  24: 24  16: 16  15: 15
+    uint32_t config_count;
+    uint32_t s_input_line_length;   // precalculated: input_width * fb_pixel_size
+    uint32_t s_bounded_input_width; // width of input (bounded by writeable width)
+    uint32_t s_bounded_input_height;// height of input (bounded by writeable height)
+    uint32_t s_bounded_input_width_offset;  // offset from the left side (used for centering)
+    uint32_t s_bounded_input_height_offset; // offset from the upper side (used for centering)
+    uint32_t s_writeable_width; // width of screen which is writeable
+    uint32_t s_writeable_height;    // height of screen which is writeable
+    uint8_t * s_center[2]; // where to begin writing our image (centered?)
+    uint32_t s_center_index; // index into s_center of the page written next
+    /* Software pixel buffer handed to SDL as the screen surface's pixels */
+    volatile void * s_pixels __attribute__((aligned(128)));
+    /* Framebuffer data */
+    volatile struct fb_writer_parms_t * fb_parms __attribute__((aligned(128)));
+};
+/* Shorthand accessors for the private data. Each macro expands to a member
+ * of this->hidden, so they only work inside functions that have a
+ * `this` device pointer in scope (see _THIS).
+ * NOTE(review): SDL_nummodes, SDL_modelist and SDL_videomode name members
+ * that struct SDL_PrivateVideoData above does not declare; they would not
+ * compile if used.
+ */
+#define fb_dev_name     (this->hidden->fb_dev_name)
+#define fb_dev_fd       (this->hidden->fb_dev_fd)
+#define frame_buffer       (this->hidden->frame_buffer)
+#define fb_thread_data      (this->hidden->fb_thread_data)
+#define scaler_thread_data      (this->hidden->scaler_thread_data)
+#define converter_thread_data      (this->hidden->converter_thread_data)
+#define fb_parms           (this->hidden->fb_parms)
+#define SDL_nummodes            (this->hidden->SDL_nummodes)
+#define SDL_modelist            (this->hidden->SDL_modelist)
+#define SDL_videomode           (this->hidden->SDL_videomode)
+#define fb_finfo        (this->hidden->fb_finfo)
+#define fb_vinfo        (this->hidden->fb_vinfo)
+#define fb_orig_vinfo   (this->hidden->fb_orig_vinfo)
+#define res             (this->hidden->res)
+#define double_buffering (this->hidden->double_buffering)
+#define real_width      (this->hidden->real_width)
+#define real_height     (this->hidden->real_height)
+#define s_fb_pixel_size   (this->hidden->s_fb_pixel_size)
+#define fb_bits_per_pixel (this->hidden->fb_bits_per_pixel)
+#define config_count (this->hidden->config_count)
+#define s_input_line_length (this->hidden->s_input_line_length)
+#define s_bounded_input_width (this->hidden->s_bounded_input_width)
+#define s_bounded_input_height (this->hidden->s_bounded_input_height)
+#define s_bounded_input_width_offset (this->hidden->s_bounded_input_width_offset)
+#define s_bounded_input_height_offset (this->hidden->s_bounded_input_height_offset)
+#define s_writeable_width (this->hidden->s_writeable_width)
+#define s_writeable_height (this->hidden->s_writeable_height)
+#define s_center          (this->hidden->s_center)
+#define s_center_index    (this->hidden->s_center_index)
+#define s_pixels           (this->hidden->s_pixels)
+#endif /* _SDL_ps3video_h */
diff --git a/apps/plugins/sdl/src/video/ps3/SDL_ps3yuv.c b/apps/plugins/sdl/src/video/ps3/SDL_ps3yuv.c
new file mode 100644
index 0000000000..b1e17dae6d
--- /dev/null
+++ b/apps/plugins/sdl/src/video/ps3/SDL_ps3yuv.c
@@ -0,0 +1,340 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ *  Martin Lowinski  <lowinski [at] de [dot] ibm [ibm] com>
+ *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ *  SPE code based on research by:
+ *  Rene Becker
+ *  Thimo Emmerich
+ */
+#include "SDL_config.h"
+#include "SDL_video.h"
+#include "SDL_ps3video.h"
+#include "SDL_ps3yuv_c.h"
+#include "../SDL_yuvfuncs.h"
+#include "spulibs/spu_common.h"
+/* Handles of the SPE programs used for YUV->RGB conversion and bilinear scaling */
+extern spe_program_handle_t yuv2rgb_spu;
+extern spe_program_handle_t bilin_scaler_spu;
+int SPE_Start(_THIS, spu_data_t * spe_data); /* SPE helpers: only declared in this file, defined elsewhere */
+int SPE_Stop(_THIS, spu_data_t * spe_data);
+int SPE_Boot(_THIS, spu_data_t * spe_data);
+int SPE_Shutdown(_THIS, spu_data_t * spe_data);
+int SPE_SendMsg(_THIS, spu_data_t * spe_data, unsigned int msg);
+int SPE_WaitForMsg(_THIS, spu_data_t * spe_data, unsigned int msg);
+void SPE_RunContext(void *thread_argp);
+/* Overlay callbacks (lock, unlock, display, free) installed as overlay->hwfuncs by PS3_CreateYUVOverlay() */
+static struct private_yuvhwfuncs ps3_yuvfuncs = {
+  PS3_LockYUVOverlay,
+  PS3_UnlockYUVOverlay,
+  PS3_DisplayYUVOverlay,
+  PS3_FreeYUVOverlay
+};
+struct private_yuvhwdata { /* per-overlay private data, reachable through overlay->hwdata */
+        SDL_Surface *display; /* surface the overlay is shown on; passed to SDL_UpdateRects() */
+        SDL_Surface *stretch; /* only ever set to NULL in this file */
+    volatile void * pixels __attribute__((aligned(128))); /* one buffer holding the Y plane followed by both chroma planes */
+        /* These are just so we don't have to allocate them separately */
+        Uint16 pitches[3]; /* backing store for overlay->pitches */
+        Uint8 * planes[3]; /* backing store for overlay->pixels */
+        unsigned int scale; /* 1 when the last display request had differing src and dst sizes, else 0 */
+        /* Scaled YUV picture */
+        Uint8 * scaler_out __attribute__((aligned(128))); /* allocated and freed within PS3_DisplayYUVOverlay() */
+        /* YUV2RGB converter data */
+    volatile struct yuv2rgb_parms_t * converter_parms __attribute__((aligned(128))); /* NOTE: the attribute aligns the pointer member, not the pointee (which is memalign(16)'d) */
+        /* Scaler data */
+    volatile struct scale_parms_t * scaler_parms __attribute__((aligned(128)));
+        Uint8 locked; /* set by PS3_LockYUVOverlay(), cleared by PS3_UnlockYUVOverlay() */
+};
+/* Create a planar YV12/IYUV overlay for the display and start the converter SPE.
+ * Returns the overlay, or NULL with the SDL error set on bad arguments or OOM. */
+SDL_Overlay *PS3_CreateYUVOverlay(_THIS, int width, int height, Uint32 format, SDL_Surface *display) {
+        SDL_Overlay* overlay;
+        struct private_yuvhwdata* hwdata;
+        size_t luma_size;
+        /* Only RGB packed pixel conversion supported */
+        if ((display->format->BytesPerPixel != 2) &&
+                        (display->format->BytesPerPixel != 3) &&
+                        (display->format->BytesPerPixel != 4))
+        {
+                SDL_SetError ("Can't use YUV data on non 16/24/32 bit surfaces");
+                return NULL;
+        }
+        /* Double-check the requested format. Only planar YV12 and IYUV are supported */
+        if ((format != SDL_YV12_OVERLAY) && (format != SDL_IYUV_OVERLAY)) {
+                SDL_SetError("Unsupported YUV format");
+                return NULL;
+        }
+        /* The width must fit the Uint16 pitches and the plane sizes must not
+         * overflow size_t, so reject empty, negative or absurdly large overlays */
+        if ((width <= 0) || (height <= 0) || (width > 0xFFFF) ||
+                        ((size_t)height > (((size_t)-1) / 2) / (size_t)width)) {
+                SDL_SetError("Invalid YUV overlay size");
+                return NULL;
+        }
+        luma_size = (size_t)width * (size_t)height;
+        /* Create the overlay structure; SDL_calloc() hands it back zeroed */
+        overlay = (SDL_Overlay *) SDL_calloc(1, sizeof(*overlay));
+        if (overlay == NULL) {
+                SDL_OutOfMemory();
+                return NULL;
+        }
+        /* Set the basic attributes */
+        overlay->format = format;
+        overlay->w = width;
+        overlay->h = height;
+        /* Set up the PS3 YUV surface function structure */
+        overlay->hwfuncs = &ps3_yuvfuncs;
+        /* Create the private data; zeroed, so all of its pointers start out NULL */
+        hwdata = (struct private_yuvhwdata *) SDL_calloc(1, sizeof(*hwdata));
+        if (hwdata == NULL) {
+                SDL_OutOfMemory();
+                SDL_FreeYUVOverlay(overlay);
+                return NULL;
+        }
+        overlay->hwdata = hwdata;
+        hwdata->display = display;
+        /* Create SPU parms structure */
+        hwdata->converter_parms = (struct yuv2rgb_parms_t *) memalign(16, sizeof(struct yuv2rgb_parms_t));
+        hwdata->scaler_parms = (struct scale_parms_t *) memalign(16, sizeof(struct scale_parms_t));
+        if (hwdata->converter_parms == NULL || hwdata->scaler_parms == NULL) {
+                goto out_of_memory;
+        }
+        /* Set up the SPEs. The descriptors come zero-filled from SDL's allocator
+         * because PS3_FreeYUVOverlay() hands them to SPE_Shutdown() and SDL_free(),
+         * possibly before every field below has been assigned. */
+        scaler_thread_data = (spu_data_t *) SDL_calloc(1, sizeof(spu_data_t));
+        converter_thread_data = (spu_data_t *) SDL_calloc(1, sizeof(spu_data_t));
+        if (converter_thread_data == NULL || scaler_thread_data == NULL) {
+                goto out_of_memory;
+        }
+        scaler_thread_data->program = bilin_scaler_spu;
+        scaler_thread_data->program_name = "bilin_scaler_spu";
+        scaler_thread_data->keepalive = 0;
+        scaler_thread_data->booted = 0;
+        converter_thread_data->program = yuv2rgb_spu;
+        converter_thread_data->program_name = "yuv2rgb_spu";
+        converter_thread_data->keepalive = 1;
+        converter_thread_data->booted = 0;
+        SPE_Start(this, converter_thread_data);
+        /* One buffer holds all three planes: luma plus half as much again for chroma */
+        hwdata->pixels = (Uint8 *) memalign(16, luma_size + (luma_size >> 1));
+        if (hwdata->pixels == NULL) {
+                goto out_of_memory;
+        }
+        /* Find the pitch and offset values; the layout is the same for both formats */
+        overlay->pitches = hwdata->pitches;
+        overlay->pixels = hwdata->planes;
+        overlay->pitches[0] = overlay->w;
+        overlay->pitches[1] = overlay->pitches[0] / 2;
+        overlay->pitches[2] = overlay->pitches[0] / 2;
+        overlay->pixels[0] = (Uint8 *)hwdata->pixels;
+        overlay->pixels[1] = overlay->pixels[0] + luma_size;
+        overlay->pixels[2] = overlay->pixels[1] + ((size_t)overlay->pitches[1] * (size_t)overlay->h) / 2;
+        overlay->planes = 3;
+        /* We're all done.. */
+        return overlay;
+out_of_memory:
+        /* PS3_FreeYUVOverlay() is not guaranteed to release the parameter blocks,
+         * so free them here and clear the pointers to rule out a double free */
+        SDL_free((void *)hwdata->converter_parms);
+        SDL_free((void *)hwdata->scaler_parms);
+        hwdata->converter_parms = NULL;
+        hwdata->scaler_parms = NULL;
+        SDL_FreeYUVOverlay(overlay);
+        SDL_OutOfMemory();
+        return NULL;
+}
+int PS3_LockYUVOverlay(_THIS, SDL_Overlay *overlay) { /* returns 0, or -1 if there is nothing to lock */
+        if ((overlay == NULL) || (overlay->hwdata == NULL)) {
+                return -1;
+        }
+        overlay->hwdata->locked = 1; /* only a flag is set; nothing in this file reads it back */
+        return 0;
+}
+/* Clear the locked flag; a NULL overlay or missing private data is ignored. */
+void PS3_UnlockYUVOverlay(_THIS, SDL_Overlay *overlay) {
+        if ((overlay == NULL) || (overlay->hwdata == NULL)) {
+                return;
+        }
+        overlay->hwdata->locked = 0;
+}
+/* Scale (when src and dst sizes differ) and convert the overlay to RGB on the SPEs,
+ * then update dst on the display. Returns 0 on success and -1 on failure. */
+int PS3_DisplayYUVOverlay(_THIS, SDL_Overlay *overlay, SDL_Rect *src, SDL_Rect *dst) {
+        Uint8 *lum, *Cr, *Cb;
+        if ((overlay == NULL) || (overlay->hwdata == NULL) || (src == NULL) || (dst == NULL)) {
+                return -1;
+        }
+        struct private_yuvhwdata *hwdata = overlay->hwdata;
+        size_t dst_size = (size_t)dst->w * (size_t)dst->h;
+        /* Do we have to scale? */
+        if ((src->w != dst->w) || (src->h != dst->h) ) {
+                hwdata->scale = 1;
+                deprintf(1, "[PS3] We need to scale\n");
+        } else {
+                hwdata->scale = 0;
+                deprintf(1, "[PS3] No scaling\n");
+        }
+        /* Find out where the various portions of the image are */
+        lum = (Uint8 *)overlay->pixels[0];
+        switch (overlay->format) {
+                case SDL_YV12_OVERLAY:
+                        Cr =  (Uint8 *)overlay->pixels[1];
+                        Cb =  (Uint8 *)overlay->pixels[2];
+                        break;
+                case SDL_IYUV_OVERLAY:
+                        Cr =  (Uint8 *)overlay->pixels[2];
+                        Cb =  (Uint8 *)overlay->pixels[1];
+                        break;
+                default:
+                        SDL_SetError("Unsupported YUV format in blit");
+                        return -1;
+        }
+        if (hwdata->scale) {
+                /* Alloc mem for scaled YUV picture */
+                hwdata->scaler_out = (Uint8 *) memalign(16, dst_size + (dst_size >> 1));
+                if (hwdata->scaler_out == NULL) {
+                        SDL_OutOfMemory(); /* the overlay still belongs to the caller, so it is not freed here */
+                        return -1;
+                }
+                /* Set parms for scaling */
+                hwdata->scaler_parms->src_pixel_width = src->w;
+                hwdata->scaler_parms->src_pixel_height = src->h;
+                hwdata->scaler_parms->dst_pixel_width = dst->w;
+                hwdata->scaler_parms->dst_pixel_height = dst->h;
+                hwdata->scaler_parms->y_plane = lum;
+                hwdata->scaler_parms->v_plane = Cr;
+                hwdata->scaler_parms->u_plane = Cb;
+                hwdata->scaler_parms->dstBuffer = hwdata->scaler_out;
+                scaler_thread_data->argp = (void *)hwdata->scaler_parms;
+                /* Scale the YUV overlay to given size */
+                SPE_Start(this, scaler_thread_data);
+                SPE_Stop(this, scaler_thread_data);
+                /* Set parms for converting after scaling */
+                hwdata->converter_parms->y_plane = hwdata->scaler_out;
+                hwdata->converter_parms->v_plane = hwdata->scaler_out + dst_size;
+                hwdata->converter_parms->u_plane = hwdata->scaler_out + dst_size + (dst_size >> 2);
+        } else {
+                /* Set parms for converting */
+                hwdata->converter_parms->y_plane = lum;
+                hwdata->converter_parms->v_plane = Cr;
+                hwdata->converter_parms->u_plane = Cb;
+        }
+        hwdata->converter_parms->src_pixel_width = dst->w;
+        hwdata->converter_parms->src_pixel_height = dst->h;
+        hwdata->converter_parms->dstBuffer = (Uint8 *) s_pixels;
+        converter_thread_data->argp = (void *)hwdata->converter_parms;
+        /* Convert YUV overlay to RGB */
+        SPE_SendMsg(this, converter_thread_data, SPU_START);
+        SPE_SendMsg(this, converter_thread_data, (unsigned int)converter_thread_data->argp);
+        /* Centering */
+        s_bounded_input_width = dst->w;
+        s_bounded_input_height = dst->h;
+        /* UpdateRects() will do the rest.. */
+        SDL_UpdateRects(hwdata->display, 1, dst);
+        if (hwdata->scale) {
+                SDL_free((void *)hwdata->scaler_out);
+                hwdata->scaler_out = NULL; /* never leave a dangling pointer behind */
+        }
+        return 0;
+}
+/* Shut down the converter SPE and release everything the overlay's private
+ * data owns; the SDL_Overlay itself is not freed by this function. */
+void PS3_FreeYUVOverlay(_THIS, SDL_Overlay *overlay) {
+        if ((overlay == NULL) || (overlay->hwdata == NULL)) {
+                return;
+        }
+        struct private_yuvhwdata *hwdata = overlay->hwdata;
+        /* The SPE descriptors live in the device data; reset them after freeing so
+         * a later create/free cycle cannot shut down or free them a second time */
+        SDL_free(scaler_thread_data);
+        scaler_thread_data = NULL;
+        if (converter_thread_data) {
+                SPE_Shutdown(this, converter_thread_data);
+                SDL_free(converter_thread_data);
+                converter_thread_data = NULL;
+        }
+        SDL_free((void *)hwdata->converter_parms);
+        SDL_free((void *)hwdata->scaler_parms);
+        SDL_free((void *)hwdata->pixels);
+        SDL_free(hwdata);
+        overlay->hwdata = NULL;
+}
diff --git a/apps/plugins/sdl/src/video/ps3/SDL_ps3yuv_c.h b/apps/plugins/sdl/src/video/ps3/SDL_ps3yuv_c.h
new file mode 100644
index 0000000000..49f9d70953
--- /dev/null
+++ b/apps/plugins/sdl/src/video/ps3/SDL_ps3yuv_c.h
@@ -0,0 +1,44 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ *  Martin Lowinski  <lowinski [at] de [dot] ibm [ibm] com>
+ *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ *  SPE code based on research by:
+ *  Rene Becker
+ *  Thimo Emmerich
+ */
+#include "SDL_config.h"
+#ifndef _SDL_ps3yuv_h
+#define _SDL_ps3yuv_h
+/* PS3 implementation of YUV video overlays. NOTE(review): _THIS does not appear to come from the includes below; SDL_ps3yuv.c includes SDL_ps3video.h first - confirm */
+#include "SDL_video.h"
+extern SDL_Overlay *PS3_CreateYUVOverlay(_THIS, int width, int height, Uint32 format, SDL_Surface *display); /* NULL on failure */
+extern int PS3_DisplayYUVOverlay(_THIS, SDL_Overlay *overlay, SDL_Rect *src, SDL_Rect *dst); /* 0 on success, -1 on failure */
+extern int PS3_LockYUVOverlay(_THIS, SDL_Overlay *overlay); /* 0 on success, -1 on failure */
+extern void PS3_UnlockYUVOverlay(_THIS, SDL_Overlay *overlay);
+extern void PS3_FreeYUVOverlay(_THIS, SDL_Overlay *overlay);
+#endif /* _SDL_ps3yuv_h */
diff --git a/apps/plugins/sdl/src/video/ps3/spulibs/Makefile b/apps/plugins/sdl/src/video/ps3/spulibs/Makefile
new file mode 100644
index 0000000000..dc580d9436
--- /dev/null
+++ b/apps/plugins/sdl/src/video/ps3/spulibs/Makefile
@@ -0,0 +1,83 @@
+# This Makefile is for building the CELL BE SPU libs
+# libfb_writer_spu.so, libyuv2rgb_spu.so, libbilin_scaler_spu.so
+# Toolchain
+SPU_GCC=/usr/bin/spu-gcc
+PPU_GCC=/usr/bin/gcc
+PPU_EMBEDSPU=/usr/bin/embedspu
+PPU_AR=/usr/bin/ar
+PPU_LD=/usr/bin/ld
+INSTALL=/usr/bin/install
+SPU_CFLAGS=-W -Wall -Winline -Wno-main -I. -I /usr/spu/include -I /opt/cell/sdk/usr/spu/include -finline-limit=10000 -ftree-vectorize -funroll-loops -fmodulo-sched -ffast-math -fPIC -O2
+# Usually /usr/lib, depending on your distribution
+PREFIX=/usr/lib
+.PHONY: all install uninstall clean
+all: libfb_writer_spu.a libfb_writer_spu.so libyuv2rgb_spu.so libyuv2rgb_spu.a \
+                                libbilin_scaler_spu.so libbilin_scaler_spu.a
+# fb_writer (archives are built with ar -r so a rebuild replaces the member instead of appending a stale duplicate)
+fb_writer_spu-embed.o: fb_writer.c spu_common.h
+	$(SPU_GCC) $(SPU_CFLAGS) -o fb_writer_spu fb_writer.c -lm
+	$(PPU_EMBEDSPU) -m32 fb_writer_spu fb_writer_spu fb_writer_spu-embed.o
+libfb_writer_spu.so: fb_writer_spu-embed.o
+	$(PPU_LD) -o libfb_writer_spu.so -shared -soname=libfb_writer_spu.so fb_writer_spu-embed.o
+libfb_writer_spu.a: fb_writer_spu-embed.o
+	$(PPU_AR) -rcs libfb_writer_spu.a fb_writer_spu-embed.o
+# yuv2rgb_converter
+yuv2rgb_spu-embed.o: yuv2rgb_converter.c spu_common.h
+	$(SPU_GCC) $(SPU_CFLAGS) -o yuv2rgb_spu yuv2rgb_converter.c -lm
+	$(PPU_EMBEDSPU) -m32 yuv2rgb_spu yuv2rgb_spu yuv2rgb_spu-embed.o
+libyuv2rgb_spu.a: yuv2rgb_spu-embed.o
+	$(PPU_AR) -rcs libyuv2rgb_spu.a yuv2rgb_spu-embed.o
+libyuv2rgb_spu.so: yuv2rgb_spu-embed.o
+	$(PPU_LD) -o libyuv2rgb_spu.so -shared -soname=libyuv2rgb_spu.so yuv2rgb_spu-embed.o
+# bilin_scaler
+bilin_scaler_spu-embed.o: bilin_scaler.c spu_common.h
+	$(SPU_GCC) $(SPU_CFLAGS) -o bilin_scaler_spu bilin_scaler.c -lm
+	$(PPU_EMBEDSPU) -m32 bilin_scaler_spu bilin_scaler_spu bilin_scaler_spu-embed.o
+libbilin_scaler_spu.a: bilin_scaler_spu-embed.o
+	$(PPU_AR) -rcs libbilin_scaler_spu.a bilin_scaler_spu-embed.o
+libbilin_scaler_spu.so: bilin_scaler_spu-embed.o
+	$(PPU_LD) -o libbilin_scaler_spu.so -shared -soname=libbilin_scaler_spu.so bilin_scaler_spu-embed.o
+install: libfb_writer_spu.a libfb_writer_spu.so \
+                                libyuv2rgb_spu.so libyuv2rgb_spu.a \
+                                libbilin_scaler_spu.so libbilin_scaler_spu.a
+	$(INSTALL) -c -m 0755 libfb_writer_spu.so $(PREFIX)/.
+	$(INSTALL) -c -m 0644 libfb_writer_spu.a $(PREFIX)/.
+	$(INSTALL) -c -m 0755 libyuv2rgb_spu.so $(PREFIX)/.
+	$(INSTALL) -c -m 0644 libyuv2rgb_spu.a $(PREFIX)/.
+	$(INSTALL) -c -m 0755 libbilin_scaler_spu.so $(PREFIX)/.
+	$(INSTALL) -c -m 0644 libbilin_scaler_spu.a $(PREFIX)/.
+# No prerequisites on purpose: with the installed files listed, make aborted
+# ("No rule to make target") as soon as one was missing; rm -f copes with that.
+uninstall:
+	rm -f $(PREFIX)/libfb_writer_spu.a
+	rm -f $(PREFIX)/libfb_writer_spu.so
+	rm -f $(PREFIX)/libyuv2rgb_spu.so
+	rm -f $(PREFIX)/libyuv2rgb_spu.a
+	rm -f $(PREFIX)/libbilin_scaler_spu.so
+	rm -f $(PREFIX)/libbilin_scaler_spu.a
+clean:
+	rm -f bilin_scaler_spu-embed.o libbilin_scaler_spu.so libbilin_scaler_spu.a bilin_scaler_spu
+	rm -f yuv2rgb_spu-embed.o libyuv2rgb_spu.so libyuv2rgb_spu.a yuv2rgb_spu
+	rm -f fb_writer_spu-embed.o libfb_writer_spu.so libfb_writer_spu.a fb_writer_spu
diff --git a/apps/plugins/sdl/src/video/ps3/spulibs/bilin_scaler.c b/apps/plugins/sdl/src/video/ps3/spulibs/bilin_scaler.c
new file mode 100644
index 0000000000..be9b5c6e8d
--- /dev/null
+++ b/apps/plugins/sdl/src/video/ps3/spulibs/bilin_scaler.c
@@ -0,0 +1,2050 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ *  Martin Lowinski  <lowinski [at] de [dot] ibm [ibm] com>
+ *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ *  SPE code based on research by:
+ *  Rene Becker
+ *  Thimo Emmerich
+ */
+#include "spu_common.h"
+#include <spu_intrinsics.h>
+#include <spu_mfcio.h>
+// Debugging
+//#define DEBUG
+#ifdef DEBUG
+#define deprintf(fmt, args... ) \
+        fprintf( stdout, fmt, ##args ); \
+        fflush( stdout );
+#else
+#define deprintf( fmt, args... )
+#endif
+struct scale_parms_t parms __attribute__((aligned(128))); /* filled by the DMA transfer at the start of main() */
+/* A maximum of 8 lines Y, therefore 4 lines V, 4 lines U are stored
+ * there might be the need to retrieve misaligned data, adjust
+ * incoming v and u plane to be able to handle this (add 128)
+ */
+unsigned char y_plane[2][(MAX_HDTV_WIDTH+128)*4] __attribute__((aligned(128))); /* [2]: the two halves are used alternately (curr/next index) */
+unsigned char v_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));
+unsigned char u_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));
+/* temp-buffer for scaling: 4 lines Y, therefore 2 lines V, 2 lines U (totals over both halves) */
+unsigned char scaled_y_plane[2][MAX_HDTV_WIDTH*2] __attribute__((aligned(128)));
+unsigned char scaled_v_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));
+unsigned char scaled_u_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));
+/* some vectors needed by the float to int conversion */
+static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f }; /* upper clamp in vfloat_to_vuint() */
+static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f }; /* lower clamp in vfloat_to_vuint() */
+void bilinear_scale_line_w8(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);
+void bilinear_scale_line_w16(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);
+void scale_srcw16_dstw16(void); /* (void) makes these real prototypes, so stray arguments are diagnosed */
+void scale_srcw16_dstw32(void);
+void scale_srcw32_dstw16(void);
+void scale_srcw32_dstw32(void);
+int main( unsigned long long spe_id __attribute__((unused)), unsigned long long argp )
+{
+        unsigned int src_unaligned, dst_unaligned;
+        deprintf("[SPU] bilin_scaler_spu is up... (on SPE #%llu)\n", spe_id);
+        /* fetch the scaling parameters from main memory and wait for the DMA */
+        spu_mfcdma32(&parms, (unsigned int)argp, sizeof(struct scale_parms_t), TAG_INIT, MFC_GET_CMD);
+        DMA_WAIT_TAG(TAG_INIT);
+        deprintf("[SPU] Scale %ux%u to %ux%u\n", parms.src_pixel_width, parms.src_pixel_height,
+                        parms.dst_pixel_width, parms.dst_pixel_height);
+        /* a width that is not a multiple of 32 is handled by the *w16 variants */
+        src_unaligned = parms.src_pixel_width & 0x1f;
+        dst_unaligned = parms.dst_pixel_width & 0x1F;
+        if(src_unaligned && dst_unaligned) {
+                deprintf("[SPU] Using scale_srcw16_dstw16\n");
+                scale_srcw16_dstw16();
+        } else if(src_unaligned) {
+                deprintf("[SPU] Using scale_srcw16_dstw32\n");
+                scale_srcw16_dstw32();
+        } else if(dst_unaligned) {
+                deprintf("[SPU] Using scale_srcw32_dstw16\n");
+                scale_srcw32_dstw16();
+        } else {
+                deprintf("[SPU] Using scale_srcw32_dstw32\n");
+                scale_srcw32_dstw32();
+        }
+        deprintf("[SPU] bilin_scaler_spu... done!\n");
+        return 0;
+}
+/*
+ * vfloat_to_vuint()
+ *
+ * converts a float vector to an unsigned int vector; every lane is first
+ * clamped to the range [0.1, 255.0]
+ *
+ * @param vec_s float vector for conversion
+ * @returns converted unsigned int vector
+ */
+inline static vector unsigned int vfloat_to_vuint(vector float vec_s) {
+        /* lanes below 0.1 are replaced by 0.1 */
+        vector float vec_clamped = spu_sel(vec_s, vec_0_1, spu_cmpgt(vec_0_1, vec_s));
+        /* lanes above 255 are replaced by 255 */
+        vec_clamped = spu_sel(vec_clamped, vec_255, spu_cmpgt(vec_clamped, vec_255));
+        return spu_convtu(vec_clamped,0);
+}
+/*
+ * scale_srcw16_dstw16()
+ *
+ * processes an input image of width 16
+ * scaling is done to a width 16
+ * result stored in RAM
+ */
+void scale_srcw16_dstw16() {
+        // extract parameters
+        unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
+        unsigned int src_width = parms.src_pixel_width;
+        unsigned int src_height = parms.src_pixel_height;
+        unsigned int dst_width = parms.dst_pixel_width;
+        unsigned int dst_height = parms.dst_pixel_height;
+        // YVU
+        unsigned int src_linestride_y = src_width;
+        unsigned int src_dbl_linestride_y = src_width<<1;
+        unsigned int src_linestride_vu = src_width>>1;
+        unsigned int src_dbl_linestride_vu = src_width;
+        // scaled YVU
+        unsigned int scaled_src_linestride_y = dst_width;
+        // ram addresses
+        unsigned char* src_addr_y = parms.y_plane;
+        unsigned char* src_addr_v = parms.v_plane;
+        unsigned char* src_addr_u = parms.u_plane;
+        // for handling misalignment, addresses are precalculated
+        unsigned char* precalc_src_addr_v = src_addr_v;
+        unsigned char* precalc_src_addr_u = src_addr_u;
+        unsigned int dst_picture_size = dst_width*dst_height;
+        // Sizes for destination
+        unsigned int dst_dbl_linestride_y = dst_width<<1;
+        unsigned int dst_dbl_linestride_vu = dst_width>>1;
+        // Perform address calculation for Y, V and U in main memory with dst_addr as base
+        unsigned char* dst_addr_main_memory_y = dst_addr;
+        unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
+        unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
+        // calculate scale factors
+        vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
+        float y_scale = (float)src_height/(float)dst_height;
+        // double buffered processing
+        // buffer switching
+        unsigned int curr_src_idx = 0;
+        unsigned int curr_dst_idx = 0;
+        unsigned int next_src_idx, next_dst_idx;
+        // 2 lines y as output, upper and lowerline
+        unsigned int curr_interpl_y_upper = 0;
+        unsigned int next_interpl_y_upper;
+        unsigned int curr_interpl_y_lower, next_interpl_y_lower;
+        // only 1 line v/u output, both planes have the same dimension
+        unsigned int curr_interpl_vu = 0;
+        unsigned int next_interpl_vu;
+        // weights, calculated in every loop iteration
+        vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
+        vector float vf_next_NSweight_y_upper;
+        vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
+        vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
+        vector float vf_next_NSweight_vu;
+        // line indices for the src picture
+        float curr_src_y_upper = 0.0f, next_src_y_upper;
+        float curr_src_y_lower, next_src_y_lower;
+        float curr_src_vu = 0.0f, next_src_vu;
+        // line indices for the dst picture
+        unsigned int dst_y=0, dst_vu=0;
+        // offset for the v and u plane to handle misalignement
+        unsigned int curr_lsoff_v = 0, next_lsoff_v;
+        unsigned int curr_lsoff_u = 0, next_lsoff_u;
+        // calculate lower line indices
+        curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
+        curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
+        // lower line weight
+        vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
+        // start partially double buffered processing
+        // get initial data, 2 sets of y, 1 set v, 1 set u
+        mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
+        mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
+                        (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
+                        src_dbl_linestride_y,
+                        RETR_BUF,
+                        0, 0 );
+        mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
+        mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
+        /* iteration loop
+         * within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
+         * the scaled output is 2 lines y, 1 line v, 1 line u
+         * the yuv2rgb-converted output is stored to RAM
+         */
+        for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
+                dst_y = dst_vu<<1;
+                // calculate next indices
+                next_src_vu = ((float)dst_vu+1)*y_scale;
+                next_src_y_upper = ((float)dst_y+2)*y_scale;
+                next_src_y_lower = ((float)dst_y+3)*y_scale;
+                next_interpl_vu = (unsigned int) next_src_vu;
+                next_interpl_y_upper = (unsigned int) next_src_y_upper;
+                next_interpl_y_lower = (unsigned int) next_src_y_lower;
+                // calculate weight NORTH-SOUTH
+                vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
+                vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
+                vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
+                // get next lines
+                next_src_idx = curr_src_idx^1;
+                next_dst_idx = curr_dst_idx^1;
+                // 4 lines y
+                mfc_get( y_plane[next_src_idx],
+                                (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
+                                src_dbl_linestride_y,
+                                RETR_BUF+next_src_idx,
+                                0, 0 );
+                mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
+                                (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
+                                src_dbl_linestride_y,
+                                RETR_BUF+next_src_idx,
+                                0, 0 );
+                // 2 lines v
+                precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
+                next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
+                mfc_get( v_plane[next_src_idx],
+                                ((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
+                                src_dbl_linestride_vu+(next_lsoff_v<<1),
+                                RETR_BUF+next_src_idx,
+                                0, 0 );
+                // 2 lines u
+                precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
+                next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
+                mfc_get( u_plane[next_src_idx],
+                                ((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
+                                src_dbl_linestride_vu+(next_lsoff_v<<1),
+                                RETR_BUF+next_src_idx,
+                                0, 0 );
+                DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
+                // scaling
+                // work line y_upper
+                bilinear_scale_line_w16( y_plane[curr_src_idx],
+                                scaled_y_plane[curr_src_idx],
+                                dst_width,
+                                vf_x_scale,
+                                vf_curr_NSweight_y_upper,
+                                src_linestride_y );
+                // work line y_lower
+                bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
+                                scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
+                                dst_width,
+                                vf_x_scale,
+                                vf_curr_NSweight_y_lower,
+                                src_linestride_y );
+                // work line v
+                bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
+                                scaled_v_plane[curr_src_idx],
+                                dst_width>>1,
+                                vf_x_scale,
+                                vf_curr_NSweight_vu,
+                                src_linestride_vu );
+                // work line u
+                bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
+                                scaled_u_plane[curr_src_idx],
+                                dst_width>>1,
+                                vf_x_scale,
+                                vf_curr_NSweight_vu,
+                                src_linestride_vu );
+                // Store the result back to main memory into a destination buffer in YUV format
+                //---------------------------------------------------------------------------------------------
+                DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+                // Perform three DMA transfers to 3 different locations in the main memory!
+                // dst_width:   Pixel width of destination image
+                // dst_addr:    Destination address in main memory
+                // dst_vu:      Counter which is incremented one by one
+                // dst_y:       Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
+                mfc_put(        scaled_y_plane[curr_src_idx],                                   // What from local store (addr)
+                                (unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),   // Destination in main memory (addr)
+                                dst_dbl_linestride_y,                                           // Two Y lines (depending on the widht of the destination resolution)
+                                STR_BUF+curr_dst_idx,                                           // Tag
+                                0, 0 );
+                mfc_put(        scaled_v_plane[curr_src_idx],                                   // What from local store (addr)
+                                (unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),  // Destination in main memory (addr)
+                                dst_dbl_linestride_vu,                                          // Two V lines (depending on the widht of the destination resolution)
+                                STR_BUF+curr_dst_idx,                                           // Tag
+                                0, 0 );
+                mfc_put(        scaled_u_plane[curr_src_idx],                                   // What from local store (addr)
+                                (unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),  // Destination in main memory (addr)
+                                dst_dbl_linestride_vu,                                          // Two U lines (depending on the widht of the destination resolution)
+                                STR_BUF+curr_dst_idx,                                           // Tag
+                                0, 0 );
+                //---------------------------------------------------------------------------------------------
+                // update for next cycle
+                curr_src_idx = next_src_idx;
+                curr_dst_idx = next_dst_idx;
+                curr_interpl_y_upper = next_interpl_y_upper;
+                curr_interpl_y_lower = next_interpl_y_lower;
+                curr_interpl_vu = next_interpl_vu;
+                vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
+                vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
+                vf_curr_NSweight_vu = vf_next_NSweight_vu;
+                curr_src_y_upper = next_src_y_upper;
+                curr_src_y_lower = next_src_y_lower;
+                curr_src_vu = next_src_vu;
+                curr_lsoff_v = next_lsoff_v;
+                curr_lsoff_u = next_lsoff_u;
+        }
+        DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
+        // scaling
+        // work line y_upper
+        bilinear_scale_line_w16( y_plane[curr_src_idx],
+                        scaled_y_plane[curr_src_idx],
+                        dst_width,
+                        vf_x_scale,
+                        vf_curr_NSweight_y_upper,
+                        src_linestride_y );
+        // work line y_lower
+        bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
+                        scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
+                        dst_width,
+                        vf_x_scale,
+                        vf_curr_NSweight_y_lower,
+                        src_linestride_y );
+        // work line v
+        bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
+                        scaled_v_plane[curr_src_idx],
+                        dst_width>>1,
+                        vf_x_scale,
+                        vf_curr_NSweight_vu,
+                        src_linestride_vu );
+        // work line u
+        bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
+                        scaled_u_plane[curr_src_idx],
+                        dst_width>>1,
+                        vf_x_scale,
+                        vf_curr_NSweight_vu,
+                        src_linestride_vu );
+        // Store the result back to main memory into a destination buffer in YUV format
+        //---------------------------------------------------------------------------------------------
+        DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+        // Perform three DMA transfers to 3 different locations in the main memory!
+        // dst_width:   Pixel width of destination image
+        // dst_addr:    Destination address in main memory
+        // dst_vu:      Counter which is incremented one by one
+        // dst_y:       Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
+        mfc_put(        scaled_y_plane[curr_src_idx],                                   // What from local store (addr)
+                        (unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),   // Destination in main memory (addr)
+                        dst_dbl_linestride_y,                                           // Two Y lines (depending on the widht of the destination resolution)
+                        STR_BUF+curr_dst_idx,                                           // Tag
+                        0, 0 );
+        mfc_put(        scaled_v_plane[curr_src_idx],                                   // What from local store (addr)
+                        (unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),  // Destination in main memory (addr)
+                        dst_dbl_linestride_vu,                                          // Two V lines (depending on the widht of the destination resolution)
+                        STR_BUF+curr_dst_idx,                                           // Tag
+                        0, 0 );
+        mfc_put(        scaled_u_plane[curr_src_idx],                                   // What from local store (addr)
+                        (unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),  // Destination in main memory (addr)
+                        dst_dbl_linestride_vu,                                          // Two U lines (depending on the widht of the destination resolution)
+                        STR_BUF+curr_dst_idx,                                           // Tag
+                        0, 0 );
+        // wait for completion
+        DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+        //---------------------------------------------------------------------------------------------
+}
+/*
+ * scale_srcw16_dstw32()
+ *
+ * processes an input image of width 16
+ * scaling is done to a width 32
+ * no yuv2rgb conversion is done here: the scaled Y, V and U planes
+ * are stored in RAM in planar form
+ */
+void scale_srcw16_dstw32() {
+        // Bilinear scaling of the planar YVU picture described by the global 'parms'.
+        // Each step turns 4 source Y rows, 2 source V rows and 2 source U rows
+        // into 2 scaled Y lines, 1 scaled V line and 1 scaled U line, which are
+        // written to parms.dstBuffer as planar Y, V, U (no colour conversion
+        // happens in this function).  Source rows for the next step are fetched
+        // into the other entry of the two-entry local-store buffers while the
+        // current entry is being scaled.
+        // extract parameters
+        unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
+        unsigned int src_width = parms.src_pixel_width;
+        unsigned int src_height = parms.src_pixel_height;
+        unsigned int dst_width = parms.dst_pixel_width;
+        unsigned int dst_height = parms.dst_pixel_height;
+        // source strides in bytes, V and U rows are half as wide as Y rows
+        unsigned int src_linestride_y = src_width;
+        unsigned int src_dbl_linestride_y = src_width<<1;
+        unsigned int src_linestride_vu = src_width>>1;
+        unsigned int src_dbl_linestride_vu = src_width;
+        // the second scaled Y line is placed directly behind the first one
+        unsigned int scaled_src_linestride_y = dst_width;
+        // source plane addresses in main memory
+        unsigned char* src_addr_y = parms.y_plane;
+        unsigned char* src_addr_v = parms.v_plane;
+        unsigned char* src_addr_u = parms.u_plane;
+        unsigned int dst_picture_size = dst_width*dst_height;
+        // bytes stored per step: dst_width*2 for Y, dst_width/2 for V and for U
+        unsigned int dst_dbl_linestride_y = dst_width<<1;
+        unsigned int dst_dbl_linestride_vu = dst_width>>1;
+        // destination layout with dst_addr as base: Y plane, then V, then U
+        unsigned char* dst_addr_main_memory_y = dst_addr;
+        unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
+        unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
+        // calculate scale factors
+        vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
+        float y_scale = (float)src_height/(float)dst_height;
+        // buffer switching, index 0/1 selects the buffer entry and the DMA tag
+        unsigned int curr_src_idx = 0;
+        unsigned int curr_dst_idx = 0;
+        // NORTH-SOUTH weights of the line pair that is currently being scaled
+        vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
+        vector float vf_curr_NSweight_y_lower;
+        vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
+        // output chroma line index, also selects the output Y line pair
+        unsigned int dst_vu = 0;
+        // byte offsets into the v/u buffers to handle misaligned source rows
+        unsigned int curr_lsoff_v = 0;
+        unsigned int curr_lsoff_u = 0;
+        // source position of output Y line 1, i.e. (0+1)*y_scale
+        float first_src_y_lower = y_scale;
+        unsigned int first_interpl_y_lower = (unsigned int)first_src_y_lower;
+        vf_curr_NSweight_y_lower = spu_splats( first_src_y_lower-(float)first_interpl_y_lower );
+        // get initial data, 2 sets of y, 1 set v, 1 set u
+        mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
+        mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
+                        (unsigned int) src_addr_y+(first_interpl_y_lower*src_linestride_y),
+                        src_dbl_linestride_y,
+                        RETR_BUF,
+                        0, 0 );
+        mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
+        mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
+        // iteration loop
+        // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
+        // the scaled output is 2 lines y, 1 line v, 1 line u
+        // the scaled YUV output is stored to RAM
+        // the last line pair is handled after the loop because it has nothing to prefetch;
+        // "dst_vu+1 < n" is used instead of "dst_vu < n-1" so that a dst_height
+        // below 2 cannot wrap the unsigned bound around
+        for( dst_vu=0; dst_vu+1<(dst_height>>1); dst_vu++ ) {
+                unsigned int dst_y = dst_vu<<1;
+                // source positions of the next output lines
+                float next_src_vu = ((float)dst_vu+1)*y_scale;
+                float next_src_y_upper = ((float)dst_y+2)*y_scale;
+                float next_src_y_lower = ((float)dst_y+3)*y_scale;
+                // integer part selects the first source row to fetch
+                unsigned int next_interpl_vu = (unsigned int) next_src_vu;
+                unsigned int next_interpl_y_upper = (unsigned int) next_src_y_upper;
+                unsigned int next_interpl_y_lower = (unsigned int) next_src_y_lower;
+                // fractional part is the NORTH-SOUTH weight
+                vector float vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
+                vector float vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
+                vector float vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
+                // the next lines go into the other buffer entry
+                unsigned int next_src_idx = curr_src_idx^1;
+                unsigned int next_dst_idx = curr_dst_idx^1;
+                // v/u rows may start off a 16 byte boundary: the DMA starts at the
+                // address rounded down and the remainder is kept as a buffer offset
+                unsigned char* precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
+                unsigned char* precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
+                unsigned int next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
+                unsigned int next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
+                // 4 lines y
+                mfc_get( y_plane[next_src_idx],
+                                (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
+                                src_dbl_linestride_y,
+                                RETR_BUF+next_src_idx,
+                                0, 0 );
+                mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
+                                (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
+                                src_dbl_linestride_y,
+                                RETR_BUF+next_src_idx,
+                                0, 0 );
+                // 2 lines v
+                mfc_get( v_plane[next_src_idx],
+                                ((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
+                                src_dbl_linestride_vu+(next_lsoff_v<<1),
+                                RETR_BUF+next_src_idx,
+                                0, 0 );
+                // 2 lines u, sized with the u offset (the v offset was used here before)
+                mfc_get( u_plane[next_src_idx],
+                                ((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
+                                src_dbl_linestride_vu+(next_lsoff_u<<1),
+                                RETR_BUF+next_src_idx,
+                                0, 0 );
+                // the current source lines must have arrived before they are scaled
+                DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
+                // scaling
+                // work line y_upper
+                bilinear_scale_line_w16( y_plane[curr_src_idx],
+                                scaled_y_plane[curr_src_idx],
+                                dst_width,
+                                vf_x_scale,
+                                vf_curr_NSweight_y_upper,
+                                src_linestride_y );
+                // work line y_lower
+                bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
+                                scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
+                                dst_width,
+                                vf_x_scale,
+                                vf_curr_NSweight_y_lower,
+                                src_linestride_y );
+                // work line v
+                bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
+                                scaled_v_plane[curr_src_idx],
+                                dst_width>>1,
+                                vf_x_scale,
+                                vf_curr_NSweight_vu,
+                                src_linestride_vu );
+                // work line u
+                bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
+                                scaled_u_plane[curr_src_idx],
+                                dst_width>>1,
+                                vf_x_scale,
+                                vf_curr_NSweight_vu,
+                                src_linestride_vu );
+                //---------------------------------------------------------------------------------------------
+                // wait for the store tag that is reused below
+                DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+                // Perform three DMA transfers to 3 different locations in the main memory!
+                // dst_vu selects the output chroma line and the output Y line pair
+                mfc_put(        scaled_y_plane[curr_src_idx],                                           // What from local store (addr)
+                                (unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
+                                dst_dbl_linestride_y,                                                   // Two Y lines (dst_width<<1 bytes)
+                                STR_BUF+curr_dst_idx,                                                   // Tag
+                                0, 0 );
+                mfc_put(        scaled_v_plane[curr_src_idx],                                           // What from local store (addr)
+                                (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+                                dst_dbl_linestride_vu,                                                  // One V line (dst_width>>1 bytes)
+                                STR_BUF+curr_dst_idx,                                                   // Tag
+                                0, 0 );
+                mfc_put(        scaled_u_plane[curr_src_idx],                                           // What from local store (addr)
+                                (unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),        // Destination in main memory (addr)
+                                dst_dbl_linestride_vu,                                                  // One U line (dst_width>>1 bytes)
+                                STR_BUF+curr_dst_idx,                                                   // Tag
+                                0, 0 );
+                //---------------------------------------------------------------------------------------------
+                // update for next cycle
+                curr_src_idx = next_src_idx;
+                curr_dst_idx = next_dst_idx;
+                // take over the weights that belong to the lines just fetched
+                // (the two y weights were assigned to themselves before and never advanced)
+                vf_curr_NSweight_y_upper = vf_next_NSweight_y_upper;
+                vf_curr_NSweight_y_lower = vf_next_NSweight_y_lower;
+                vf_curr_NSweight_vu = vf_next_NSweight_vu;
+                curr_lsoff_v = next_lsoff_v;
+                curr_lsoff_u = next_lsoff_u;
+        }
+        // last line pair: same scaling and store as in the loop, without a prefetch
+        DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
+        // scaling
+        // work line y_upper
+        bilinear_scale_line_w16( y_plane[curr_src_idx],
+                        scaled_y_plane[curr_src_idx],
+                        dst_width,
+                        vf_x_scale,
+                        vf_curr_NSweight_y_upper,
+                        src_linestride_y );
+        // work line y_lower
+        bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
+                        scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
+                        dst_width,
+                        vf_x_scale,
+                        vf_curr_NSweight_y_lower,
+                        src_linestride_y );
+        // work line v
+        bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
+                        scaled_v_plane[curr_src_idx],
+                        dst_width>>1,
+                        vf_x_scale,
+                        vf_curr_NSweight_vu,
+                        src_linestride_vu );
+        // work line u
+        bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
+                        scaled_u_plane[curr_src_idx],
+                        dst_width>>1,
+                        vf_x_scale,
+                        vf_curr_NSweight_vu,
+                        src_linestride_vu );
+        //---------------------------------------------------------------------------------------------
+        DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+        // Perform three DMA transfers to 3 different locations in the main memory!
+        // dst_vu still holds the index of the last output chroma line here
+        mfc_put(        scaled_y_plane[curr_src_idx],                                           // What from local store (addr)
+                        (unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
+                        dst_dbl_linestride_y,                                                   // Two Y lines (dst_width<<1 bytes)
+                        STR_BUF+curr_dst_idx,                                                   // Tag
+                        0, 0 );
+        mfc_put(        scaled_v_plane[curr_src_idx],                                           // What from local store (addr)
+                        (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+                        dst_dbl_linestride_vu,                                                  // One V line (dst_width>>1 bytes)
+                        STR_BUF+curr_dst_idx,                                                   // Tag
+                        0, 0 );
+        mfc_put(        scaled_u_plane[curr_src_idx],                                           // What from local store (addr)
+                        (unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),        // Destination in main memory (addr)
+                        dst_dbl_linestride_vu,                                                  // One U line (dst_width>>1 bytes)
+                        STR_BUF+curr_dst_idx,                                                   // Tag
+                        0, 0 );
+        // wait for completion
+        DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+        //---------------------------------------------------------------------------------------------
+}
+/*
+ * scale_srcw32_dstw16()
+ *
+ * processes an input image of width 32
+ * scaling is done to a width 16
+ * no yuv2rgb conversion is done here: the scaled Y, V and U planes
+ * are stored in RAM in planar form
+ */
+void scale_srcw32_dstw16() {
+        // extract parameters
+        unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
+        unsigned int src_width = parms.src_pixel_width;
+        unsigned int src_height = parms.src_pixel_height;
+        unsigned int dst_width = parms.dst_pixel_width;
+        unsigned int dst_height = parms.dst_pixel_height;
+        // YVU
+        unsigned int src_linestride_y = src_width;
+        unsigned int src_dbl_linestride_y = src_width<<1;
+        unsigned int src_linestride_vu = src_width>>1;
+        unsigned int src_dbl_linestride_vu = src_width;
+        // scaled YVU
+        unsigned int scaled_src_linestride_y = dst_width;
+        // ram addresses
+        unsigned char* src_addr_y = parms.y_plane;
+        unsigned char* src_addr_v = parms.v_plane;
+        unsigned char* src_addr_u = parms.u_plane;
+        unsigned int dst_picture_size = dst_width*dst_height;
+        // Sizes for destination
+        unsigned int dst_dbl_linestride_y = dst_width<<1;
+        unsigned int dst_dbl_linestride_vu = dst_width>>1;
+        // Perform address calculation for Y, V and U in main memory with dst_addr as base
+        unsigned char* dst_addr_main_memory_y = dst_addr;
+        unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
+        unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
+        // calculate scale factors
+        vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
+        float y_scale = (float)src_height/(float)dst_height;
+        // double buffered processing
+        // buffer switching
+        unsigned int curr_src_idx = 0;
+        unsigned int curr_dst_idx = 0;
+        unsigned int next_src_idx, next_dst_idx;
+        // 2 lines y as output, upper and lowerline
+        unsigned int curr_interpl_y_upper = 0;
+        unsigned int next_interpl_y_upper;
+        unsigned int curr_interpl_y_lower, next_interpl_y_lower;
+        // only 1 line v/u output, both planes have the same dimension
+        unsigned int curr_interpl_vu = 0;
+        unsigned int next_interpl_vu;
+        // weights, calculated in every loop iteration
+        vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
+        vector float vf_next_NSweight_y_upper;
+        vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
+        vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
+        vector float vf_next_NSweight_vu;
+        // line indices for the src picture
+        float curr_src_y_upper = 0.0f, next_src_y_upper;
+        float curr_src_y_lower, next_src_y_lower;
+        float curr_src_vu = 0.0f, next_src_vu;
+        // line indices for the dst picture
+        unsigned int dst_y=0, dst_vu=0;
+        // calculate lower line idices
+        curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
+        curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
+        // lower line weight
+        vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
+        // start partially double buffered processing
+        // get initial data, 2 sets of y, 1 set v, 1 set u
+        mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
+        mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
+                        (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
+                        src_dbl_linestride_y,
+                        RETR_BUF,
+                        0, 0 );
+        mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
+        mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
+        // iteration loop
+        // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
+        // the scaled output is 2 lines y, 1 line v, 1 line u
+        // the yuv2rgb-converted output is stored to RAM
+        for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
+                dst_y = dst_vu<<1;
+                // calculate next indices
+                next_src_vu = ((float)dst_vu+1)*y_scale;
+                next_src_y_upper = ((float)dst_y+2)*y_scale;
+                next_src_y_lower = ((float)dst_y+3)*y_scale;
+                next_interpl_vu = (unsigned int) next_src_vu;
+                next_interpl_y_upper = (unsigned int) next_src_y_upper;
+                next_interpl_y_lower = (unsigned int) next_src_y_lower;
+                // calculate weight NORTH-SOUTH
+                vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
+                vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
+                vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
+                // get next lines
+                next_src_idx = curr_src_idx^1;
+                next_dst_idx = curr_dst_idx^1;
+                // 4 lines y
+                mfc_get( y_plane[next_src_idx],
+                                (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
+                                src_dbl_linestride_y,
+                                RETR_BUF+next_src_idx,
+                                0, 0 );
+                mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
+                                (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
+                                src_dbl_linestride_y,
+                                RETR_BUF+next_src_idx,
+                                0, 0 );
+                // 2 lines v
+                mfc_get( v_plane[next_src_idx],
+                                (unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
+                                src_dbl_linestride_vu,
+                                RETR_BUF+next_src_idx,
+                                0, 0 );
+                // 2 lines u
+                mfc_get( u_plane[next_src_idx],
+                                (unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
+                                src_dbl_linestride_vu,
+                                RETR_BUF+next_src_idx,
+                                0, 0 );
+                DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
+                // scaling
+                // work line y_upper
+                bilinear_scale_line_w16( y_plane[curr_src_idx],
+                                scaled_y_plane[curr_src_idx],
+                                dst_width,
+                                vf_x_scale,
+                                vf_curr_NSweight_y_upper,
+                                src_linestride_y );
+                // work line y_lower
+                bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
+                                scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
+                                dst_width,
+                                vf_x_scale,
+                                vf_curr_NSweight_y_lower,
+                                src_linestride_y );
+                // work line v
+                bilinear_scale_line_w16( v_plane[curr_src_idx],
+                                scaled_v_plane[curr_src_idx],
+                                dst_width>>1,
+                                vf_x_scale,
+                                vf_curr_NSweight_vu,
+                                src_linestride_vu );
+                // work line u
+                bilinear_scale_line_w16( u_plane[curr_src_idx],
+                                scaled_u_plane[curr_src_idx],
+                                dst_width>>1,
+                                vf_x_scale,
+                                vf_curr_NSweight_vu,
+                                src_linestride_vu );
+                //---------------------------------------------------------------------------------------------
+                DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+                // Perform three DMA transfers to 3 different locations in the main memory!
+                // dst_width:   Pixel width of destination image
+                // dst_addr:    Destination address in main memory
+                // dst_vu:      Counter which is incremented one by one
+                // dst_y:       Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
+                mfc_put(        scaled_y_plane[curr_src_idx],                                                   // What from local store (addr)
+                                (unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
+                                dst_dbl_linestride_y,                                                           // Two Y lines (depending on the widht of the destination resolution)
+                                STR_BUF+curr_dst_idx,                                                           // Tag
+                                0, 0 );
+                mfc_put(        scaled_v_plane[curr_src_idx],                                                   // What from local store (addr)
+                                (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+                                dst_dbl_linestride_vu,                                                          // Two V lines (depending on the widht of the destination resolution)
+                                STR_BUF+curr_dst_idx,                                                           // Tag
+                                0, 0 );
+                mfc_put(        scaled_u_plane[curr_src_idx],                                                   // What from local store (addr)
+                                (unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+                                dst_dbl_linestride_vu,                                                          // Two U lines (depending on the widht of the destination resolution)
+                                STR_BUF+curr_dst_idx,                                                           // Tag
+                                0, 0 );
+                //---------------------------------------------------------------------------------------------
+                // update for next cycle
+                curr_src_idx = next_src_idx;
+                curr_dst_idx = next_dst_idx;
+                curr_interpl_y_upper = next_interpl_y_upper;
+                curr_interpl_y_lower = next_interpl_y_lower;
+                curr_interpl_vu = next_interpl_vu;
+                vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
+                vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
+                vf_curr_NSweight_vu = vf_next_NSweight_vu;
+                curr_src_y_upper = next_src_y_upper;
+                curr_src_y_lower = next_src_y_lower;
+                curr_src_vu = next_src_vu;
+        }
+        DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
+        // scaling
+        // work line y_upper
+        bilinear_scale_line_w16( y_plane[curr_src_idx],
+                        scaled_y_plane[curr_src_idx],
+                        dst_width,
+                        vf_x_scale,
+                        vf_curr_NSweight_y_upper,
+                        src_linestride_y );
+        // work line y_lower
+        bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
+                        scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
+                        dst_width,
+                        vf_x_scale,
+                        vf_curr_NSweight_y_lower,
+                        src_linestride_y );
+        // work line v
+        bilinear_scale_line_w16( v_plane[curr_src_idx],
+                        scaled_v_plane[curr_src_idx],
+                        dst_width>>1,
+                        vf_x_scale,
+                        vf_curr_NSweight_vu,
+                        src_linestride_vu );
+        // work line u
+        bilinear_scale_line_w16( u_plane[curr_src_idx],
+                        scaled_u_plane[curr_src_idx],
+                        dst_width>>1,
+                        vf_x_scale,
+                        vf_curr_NSweight_vu,
+                        src_linestride_vu );
+        //---------------------------------------------------------------------------------------------
+        DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+        // Perform three DMA transfers to 3 different locations in the main memory!
+        // dst_width:   Pixel width of destination image
+        // dst_addr:    Destination address in main memory
+        // dst_vu:      Counter which is incremented one by one
+        // dst_y:       Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
+        mfc_put(        scaled_y_plane[curr_src_idx],                                                   // What from local store (addr)
+                        (unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
+                        dst_dbl_linestride_y,                                                           // Two Y lines (depending on the widht of the destination resolution)
+                        STR_BUF+curr_dst_idx,                                                           // Tag
+                        0, 0 );
+        mfc_put(        scaled_v_plane[curr_src_idx],                                                   // What from local store (addr)
+                        (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+                        dst_dbl_linestride_vu,                                                          // Two V lines (depending on the widht of the destination resolution)
+                        STR_BUF+curr_dst_idx,                                                           // Tag
+                        0, 0 );
+        mfc_put(        scaled_u_plane[curr_src_idx],                                                   // What from local store (addr)
+                        (unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+                        dst_dbl_linestride_vu,                                                          // Two U lines (depending on the widht of the destination resolution)
+                        STR_BUF+curr_dst_idx,                                                           // Tag
+                        0, 0 );
+        // wait for completion
+        DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+        //---------------------------------------------------------------------------------------------
+}
+/**
+ * scale_srcw32_dstw32()
+ *
+ * bilinear scaling of a planar YVU image (parms.y_plane/v_plane/u_plane)
+ * into parms.dstBuffer; the name suggests widths that are multiples of 32
+ * output stays planar YUV (Y plane, then V, then U), no yuv2rgb is done here
+ * each pass emits 2 y lines and 1 v/u line using double buffered DMA
+ */
+void scale_srcw32_dstw32() {
+        // extract parameters
+        unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
+        unsigned int src_width = parms.src_pixel_width;
+        unsigned int src_height = parms.src_pixel_height;
+        unsigned int dst_width = parms.dst_pixel_width;
+        unsigned int dst_height = parms.dst_pixel_height;
+        // YVU
+        unsigned int src_linestride_y = src_width;
+        unsigned int src_dbl_linestride_y = src_width<<1;
+        unsigned int src_linestride_vu = src_width>>1;
+        unsigned int src_dbl_linestride_vu = src_width;
+        // scaled YVU
+        unsigned int scaled_src_linestride_y = dst_width;
+        // ram addresses
+        unsigned char* src_addr_y = parms.y_plane;
+        unsigned char* src_addr_v = parms.v_plane;
+        unsigned char* src_addr_u = parms.u_plane;
+        unsigned int dst_picture_size = dst_width*dst_height;
+        // Sizes for destination: bytes stored per iteration (2 y lines, 1 v/u line)
+        unsigned int dst_dbl_linestride_y = dst_width<<1;
+        unsigned int dst_dbl_linestride_vu = dst_width>>1;
+        // Perform address calculation for Y, V and U in main memory with dst_addr as base
+        unsigned char* dst_addr_main_memory_y = dst_addr;
+        unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
+        unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
+        // calculate scale factors
+        vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
+        float y_scale = (float)src_height/(float)dst_height;
+        // double buffered processing
+        // buffer switching
+        unsigned int curr_src_idx = 0;
+        unsigned int curr_dst_idx = 0;
+        unsigned int next_src_idx, next_dst_idx;
+        // 2 lines y as output, upper and lowerline
+        unsigned int curr_interpl_y_upper = 0;
+        unsigned int next_interpl_y_upper;
+        unsigned int curr_interpl_y_lower, next_interpl_y_lower;
+        // only 1 line v/u output, both planes have the same dimension
+        unsigned int curr_interpl_vu = 0;
+        unsigned int next_interpl_vu;
+        // weights, calculated in every loop iteration
+        vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
+        vector float vf_next_NSweight_y_upper;
+        vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
+        vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
+        vector float vf_next_NSweight_vu;
+        // line indices for the src picture
+        float curr_src_y_upper = 0.0f, next_src_y_upper;
+        float curr_src_y_lower, next_src_y_lower;
+        float curr_src_vu = 0.0f, next_src_vu;
+        // line indices for the dst picture
+        unsigned int dst_y=0, dst_vu=0;
+        // calculate lower line indices
+        curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
+        curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
+        // lower line weight
+        vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
+        // start partially double buffered processing
+        // get initial data, 2 sets of y, 1 set v, 1 set u
+        mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
+        mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
+                        (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
+                        src_dbl_linestride_y,
+                        RETR_BUF,
+                        0, 0 );
+        mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
+        mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
+        // iteration loop
+        // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
+        // the scaled output is 2 lines y, 1 line v, 1 line u
+        // the scaled yuv output is stored to RAM
+        for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
+                dst_y = dst_vu<<1;
+                // calculate next indices
+                next_src_vu = ((float)dst_vu+1)*y_scale;
+                next_src_y_upper = ((float)dst_y+2)*y_scale;
+                next_src_y_lower = ((float)dst_y+3)*y_scale;
+                next_interpl_vu = (unsigned int) next_src_vu;
+                next_interpl_y_upper = (unsigned int) next_src_y_upper;
+                next_interpl_y_lower = (unsigned int) next_src_y_lower;
+                // calculate weight NORTH-SOUTH
+                vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
+                vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
+                vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
+                // get next lines
+                next_src_idx = curr_src_idx^1;
+                next_dst_idx = curr_dst_idx^1;
+                // 4 lines y
+                mfc_get( y_plane[next_src_idx],
+                                (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
+                                src_dbl_linestride_y,
+                                RETR_BUF+next_src_idx,
+                                0, 0 );
+                mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
+                                (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
+                                src_dbl_linestride_y,
+                                RETR_BUF+next_src_idx,
+                                0, 0 );
+                // 2 lines v
+                mfc_get( v_plane[next_src_idx],
+                                (unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
+                                src_dbl_linestride_vu,
+                                RETR_BUF+next_src_idx,
+                                0, 0 );
+                // 2 lines u
+                mfc_get( u_plane[next_src_idx],
+                                (unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
+                                src_dbl_linestride_vu,
+                                RETR_BUF+next_src_idx,
+                                0, 0 );
+                DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
+                // scaling
+                // work line y_upper
+                bilinear_scale_line_w16( y_plane[curr_src_idx],
+                                scaled_y_plane[curr_src_idx],
+                                dst_width,
+                                vf_x_scale,
+                                vf_curr_NSweight_y_upper,
+                                src_linestride_y );
+                // work line y_lower
+                bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
+                                scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
+                                dst_width,
+                                vf_x_scale,
+                                vf_curr_NSweight_y_lower,
+                                src_linestride_y );
+                // work line v
+                bilinear_scale_line_w16( v_plane[curr_src_idx],
+                                scaled_v_plane[curr_src_idx],
+                                dst_width>>1,
+                                vf_x_scale,
+                                vf_curr_NSweight_vu,
+                                src_linestride_vu );
+                // work line u
+                bilinear_scale_line_w16( u_plane[curr_src_idx],
+                                scaled_u_plane[curr_src_idx],
+                                dst_width>>1,
+                                vf_x_scale,
+                                vf_curr_NSweight_vu,
+                                src_linestride_vu );
+                // Store the result back to main memory into a destination buffer in YUV format
+                //---------------------------------------------------------------------------------------------
+                DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+                // Perform three DMA transfers to 3 different locations in the main memory!
+                // dst_width:   Pixel width of destination image
+                // dst_addr:    Destination address in main memory
+                // dst_vu:      Counter which is incremented one by one
+                // dst_y:       Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
+                mfc_put(        scaled_y_plane[curr_src_idx],                                                   // What from local store (addr)
+                                (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),  // Destination in main memory (addr)
+                                dst_dbl_linestride_y,                                                           // Two Y lines (depending on the width of the destination resolution)
+                                STR_BUF+curr_dst_idx,                                                           // Tag
+                                0, 0 );
+                mfc_put(        scaled_v_plane[curr_src_idx],                                                   // What from local store (addr)
+                                (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+                                dst_dbl_linestride_vu,                                                          // One V line (half the width of the destination resolution)
+                                STR_BUF+curr_dst_idx,                                                           // Tag
+                                0, 0 );
+                mfc_put(        scaled_u_plane[curr_src_idx],                                                   // What from local store (addr)
+                                (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+                                dst_dbl_linestride_vu,                                                          // One U line (half the width of the destination resolution)
+                                STR_BUF+curr_dst_idx,                                                           // Tag
+                                0, 0 );
+                //---------------------------------------------------------------------------------------------
+                // update for next cycle
+                curr_src_idx = next_src_idx;
+                curr_dst_idx = next_dst_idx;
+                curr_interpl_y_upper = next_interpl_y_upper;
+                curr_interpl_y_lower = next_interpl_y_lower;
+                curr_interpl_vu = next_interpl_vu;
+                vf_curr_NSweight_y_upper = vf_next_NSweight_y_upper; // weights belonging to the lines just prefetched
+                vf_curr_NSweight_y_lower = vf_next_NSweight_y_lower; // (previously self-assigned, so they never advanced)
+                vf_curr_NSweight_vu = vf_next_NSweight_vu;
+                curr_src_y_upper = next_src_y_upper;
+                curr_src_y_lower = next_src_y_lower;
+                curr_src_vu = next_src_vu;
+        }
+        DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
+        // scaling of the last prefetched set of lines (nothing left to prefetch)
+        // work line y_upper
+        bilinear_scale_line_w16( y_plane[curr_src_idx],
+                        scaled_y_plane[curr_src_idx],
+                        dst_width,
+                        vf_x_scale,
+                        vf_curr_NSweight_y_upper,
+                        src_linestride_y );
+        // work line y_lower
+        bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
+                        scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
+                        dst_width,
+                        vf_x_scale,
+                        vf_curr_NSweight_y_lower,
+                        src_linestride_y );
+        // work line v
+        bilinear_scale_line_w16( v_plane[curr_src_idx],
+                        scaled_v_plane[curr_src_idx],
+                        dst_width>>1,
+                        vf_x_scale,
+                        vf_curr_NSweight_vu,
+                        src_linestride_vu );
+        // work line u
+        bilinear_scale_line_w16( u_plane[curr_src_idx],
+                        scaled_u_plane[curr_src_idx],
+                        dst_width>>1,
+                        vf_x_scale,
+                        vf_curr_NSweight_vu,
+                        src_linestride_vu );
+        // Store the result back to main memory into a destination buffer in YUV format
+        //---------------------------------------------------------------------------------------------
+        DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+        // Perform three DMA transfers to 3 different locations in the main memory!
+        // dst_width:   Pixel width of destination image
+        // dst_addr:    Destination address in main memory
+        // dst_vu:      Counter which is incremented one by one
+        // dst_y:       Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
+        mfc_put(        scaled_y_plane[curr_src_idx],                                                   // What from local store (addr)
+                        (unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
+                        dst_dbl_linestride_y,                                                           // Two Y lines (depending on the width of the destination resolution)
+                        STR_BUF+curr_dst_idx,                                                           // Tag
+                        0, 0 );
+        mfc_put(        scaled_v_plane[curr_src_idx],                                                   // What from local store (addr)
+                        (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+                        dst_dbl_linestride_vu,                                                          // One V line (half the width of the destination resolution)
+                        STR_BUF+curr_dst_idx,                                                           // Tag
+                        0, 0 );
+        mfc_put(        scaled_u_plane[curr_src_idx],                                                   // What from local store (addr)
+                        (unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+                        dst_dbl_linestride_vu,                                                          // One U line (half the width of the destination resolution)
+                        STR_BUF+curr_dst_idx,                                                           // Tag
+                        0, 0 );
+        // wait for completion
+        DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+        //---------------------------------------------------------------------------------------------
+}
+/*
+ * bilinear_scale_line_w8()
+ *
+ * processes a line of yuv-input, width has to be a multiple of 8
+ * scaled yuv-output is written to local store buffer
+ *
+ * @param src buffer for 2 lines input
+ * @param dst_ buffer for 1 line output
+ * @param dst_width the width of the destination line
+ * @param vf_x_scale a float vector, at each entry is the x_scale-factor
+ * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
+ * @param src_linestride the stride of the srcline
+ */
+void bilinear_scale_line_w8( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {
+        unsigned char* dst = dst_;
+        unsigned int dst_x;
+        for( dst_x=0; dst_x<dst_width; dst_x+=8) {
+                // address calculation for loading the 4 surrounding pixel of each calculated
+                // destination pixel
+                vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
+                // lower range->first 4 pixel
+                // upper range->next 4 pixel
+                vector unsigned int vui_inc_dst_x_lower_range = { 0, 1, 2, 3 };
+                vector unsigned int vui_inc_dst_x_upper_range = { 4, 5, 6, 7 };
+                vector unsigned int vui_dst_x_lower_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_lower_range );
+                vector unsigned int vui_dst_x_upper_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_upper_range );
+                // calculate weight EAST-WEST
+                vector float vf_dst_x_lower_range = spu_convtf( vui_dst_x_lower_range, 0 );
+                vector float vf_dst_x_upper_range = spu_convtf( vui_dst_x_upper_range, 0 );
+                vector float vf_src_x_lower_range = spu_mul( vf_dst_x_lower_range, vf_x_scale );
+                vector float vf_src_x_upper_range = spu_mul( vf_dst_x_upper_range, vf_x_scale );
+                vector unsigned int vui_interpl_x_lower_range = spu_convtu( vf_src_x_lower_range, 0 );
+                vector unsigned int vui_interpl_x_upper_range = spu_convtu( vf_src_x_upper_range, 0 );
+                vector float vf_interpl_x_lower_range = spu_convtf( vui_interpl_x_lower_range, 0 );
+                vector float vf_interpl_x_upper_range = spu_convtf( vui_interpl_x_upper_range, 0 );
+                vector float vf_EWweight_lower_range = spu_sub( vf_src_x_lower_range, vf_interpl_x_lower_range );
+                vector float vf_EWweight_upper_range = spu_sub( vf_src_x_upper_range, vf_interpl_x_upper_range );
+                // calculate address offset
+                //
+                // pixel NORTH WEST
+                vector unsigned int vui_off_pixelNW_lower_range = vui_interpl_x_lower_range;
+                vector unsigned int vui_off_pixelNW_upper_range = vui_interpl_x_upper_range;
+                // pixel NORTH EAST-->(offpixelNW+1)
+                vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
+                vector unsigned int vui_off_pixelNE_lower_range = spu_add( vui_off_pixelNW_lower_range, vui_add_1 );
+                vector unsigned int vui_off_pixelNE_upper_range = spu_add( vui_off_pixelNW_upper_range, vui_add_1 );
+                // SOUTH-WEST-->(offpixelNW+src_linestride)
+                vector unsigned int vui_srclinestride = spu_splats( src_linestride );
+                vector unsigned int vui_off_pixelSW_lower_range = spu_add( vui_srclinestride, vui_off_pixelNW_lower_range );
+                vector unsigned int vui_off_pixelSW_upper_range = spu_add( vui_srclinestride, vui_off_pixelNW_upper_range );
+                // SOUTH-EAST-->(offpixelNW+src_linestride+1)
+                vector unsigned int vui_off_pixelSE_lower_range = spu_add( vui_srclinestride, vui_off_pixelNE_lower_range );
+                vector unsigned int vui_off_pixelSE_upper_range = spu_add( vui_srclinestride, vui_off_pixelNE_upper_range );
+                // calculate each address
+                vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
+                vector unsigned int vui_addr_pixelNW_lower_range = spu_add( vui_src_ls, vui_off_pixelNW_lower_range );
+                vector unsigned int vui_addr_pixelNW_upper_range = spu_add( vui_src_ls, vui_off_pixelNW_upper_range );
+                vector unsigned int vui_addr_pixelNE_lower_range = spu_add( vui_src_ls, vui_off_pixelNE_lower_range );
+                vector unsigned int vui_addr_pixelNE_upper_range = spu_add( vui_src_ls, vui_off_pixelNE_upper_range );
+                vector unsigned int vui_addr_pixelSW_lower_range = spu_add( vui_src_ls, vui_off_pixelSW_lower_range );
+                vector unsigned int vui_addr_pixelSW_upper_range = spu_add( vui_src_ls, vui_off_pixelSW_upper_range );
+                vector unsigned int vui_addr_pixelSE_lower_range = spu_add( vui_src_ls, vui_off_pixelSE_lower_range );
+                vector unsigned int vui_addr_pixelSE_upper_range = spu_add( vui_src_ls, vui_off_pixelSE_upper_range );
+                // get each pixel
+                //
+                // scalar load, afterwards insertion into the right position
+                // NORTH WEST
+                vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+                vector unsigned char vuc_pixel_NW_lower_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 0 )), null_vector, 3 );
+                vuc_pixel_NW_lower_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 1 )),
+                                vuc_pixel_NW_lower_range, 7 );
+                vuc_pixel_NW_lower_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 2 )),
+                                vuc_pixel_NW_lower_range, 11 );
+                vuc_pixel_NW_lower_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 3 )),
+                                vuc_pixel_NW_lower_range, 15 );
+                vector unsigned char vuc_pixel_NW_upper_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 0 )), null_vector, 3 );
+                vuc_pixel_NW_upper_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 1 )),
+                                vuc_pixel_NW_upper_range, 7 );
+                vuc_pixel_NW_upper_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 2 )),
+                                vuc_pixel_NW_upper_range, 11 );
+                vuc_pixel_NW_upper_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 3 )),
+                                vuc_pixel_NW_upper_range, 15 );
+                // NORTH EAST
+                vector unsigned char vuc_pixel_NE_lower_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 0 )), null_vector, 3 );
+                vuc_pixel_NE_lower_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 1 )),
+                                vuc_pixel_NE_lower_range, 7 );
+                vuc_pixel_NE_lower_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 2 )),
+                                vuc_pixel_NE_lower_range, 11 );
+                vuc_pixel_NE_lower_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 3 )),
+                                vuc_pixel_NE_lower_range, 15 );
+                vector unsigned char vuc_pixel_NE_upper_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 0 )), null_vector, 3 );
+                vuc_pixel_NE_upper_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 1 )),
+                                vuc_pixel_NE_upper_range, 7 );
+                vuc_pixel_NE_upper_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 2 )),
+                                vuc_pixel_NE_upper_range, 11 );
+                vuc_pixel_NE_upper_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 3 )),
+                                vuc_pixel_NE_upper_range, 15 );
+                // SOUTH WEST
+                vector unsigned char vuc_pixel_SW_lower_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 0 )), null_vector, 3 );
+                vuc_pixel_SW_lower_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 1 )),
+                                vuc_pixel_SW_lower_range, 7 );
+                vuc_pixel_SW_lower_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 2 )),
+                                vuc_pixel_SW_lower_range, 11 );
+                vuc_pixel_SW_lower_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 3 )),
+                                vuc_pixel_SW_lower_range, 15 );
+                vector unsigned char vuc_pixel_SW_upper_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 0 )), null_vector, 3 );
+                vuc_pixel_SW_upper_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 1 )),
+                                vuc_pixel_SW_upper_range, 7 );
+                vuc_pixel_SW_upper_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 2 )),
+                                vuc_pixel_SW_upper_range, 11 );
+                vuc_pixel_SW_upper_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 3 )),
+                                vuc_pixel_SW_upper_range, 15 );
+                // SOUTH EAST
+                vector unsigned char vuc_pixel_SE_lower_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 0 )), null_vector, 3 );
+                vuc_pixel_SE_lower_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 1 )),
+                                vuc_pixel_SE_lower_range, 7 );
+                vuc_pixel_SE_lower_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 2 )),
+                                vuc_pixel_SE_lower_range, 11 );
+                vuc_pixel_SE_lower_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 3 )),
+                                vuc_pixel_SE_lower_range, 15 );
+                vector unsigned char vuc_pixel_SE_upper_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 0 )), null_vector, 3 );
+                vuc_pixel_SE_upper_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 1 )),
+                                vuc_pixel_SE_upper_range, 7 );
+                vuc_pixel_SE_upper_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 2 )),
+                                vuc_pixel_SE_upper_range, 11 );
+                vuc_pixel_SE_upper_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 3 )),
+                                vuc_pixel_SE_upper_range, 15 );
+                // convert to float
+                vector float vf_pixel_NW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_lower_range, 0 );
+                vector float vf_pixel_NW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_upper_range, 0 );
+                vector float vf_pixel_SW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_lower_range, 0 );
+                vector float vf_pixel_SW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_upper_range, 0 );
+                vector float vf_pixel_NE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_lower_range, 0 );
+                vector float vf_pixel_NE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_upper_range, 0 );
+                vector float vf_pixel_SE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_lower_range, 0 );
+                vector float vf_pixel_SE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_upper_range, 0 );
+                // first linear interpolation: EWtop
+                // EWtop = NW + EWweight*(NE-NW)
+                //
+                // lower range
+                vector float vf_EWtop_lower_range_tmp = spu_sub( vf_pixel_NE_lower_range, vf_pixel_NW_lower_range );
+                vector float vf_EWtop_lower_range = spu_madd( vf_EWweight_lower_range,
+                                                                vf_EWtop_lower_range_tmp,
+                                                                vf_pixel_NW_lower_range );
+                // upper range
+                vector float vf_EWtop_upper_range_tmp = spu_sub( vf_pixel_NE_upper_range, vf_pixel_NW_upper_range );
+                vector float vf_EWtop_upper_range = spu_madd( vf_EWweight_upper_range,
+                                                                vf_EWtop_upper_range_tmp,
+                                                                vf_pixel_NW_upper_range );
+                // second linear interpolation: EWbottom
+                // EWbottom = SW + EWweight*(SE-SW)
+                //
+                // lower range
+                vector float vf_EWbottom_lower_range_tmp = spu_sub( vf_pixel_SE_lower_range, vf_pixel_SW_lower_range );
+                vector float vf_EWbottom_lower_range = spu_madd( vf_EWweight_lower_range,
+                                                                vf_EWbottom_lower_range_tmp,
+                                                                vf_pixel_SW_lower_range );
+                // upper range
+                vector float vf_EWbottom_upper_range_tmp = spu_sub( vf_pixel_SE_upper_range, vf_pixel_SW_upper_range );
+                vector float vf_EWbottom_upper_range = spu_madd( vf_EWweight_upper_range,
+                                                                vf_EWbottom_upper_range_tmp,
+                                                                vf_pixel_SW_upper_range );
+                // third linear interpolation: the bilinear interpolated value
+                // result = EWtop + NSweight*(EWbottom-EWtop);
+                //
+                // lower range
+                vector float vf_result_lower_range_tmp = spu_sub( vf_EWbottom_lower_range, vf_EWtop_lower_range );
+                vector float vf_result_lower_range = spu_madd( vf_NSweight,
+                                                                vf_result_lower_range_tmp,
+                                                                vf_EWtop_lower_range );
+                // upper range
+                vector float vf_result_upper_range_tmp = spu_sub( vf_EWbottom_upper_range, vf_EWtop_upper_range );
+                vector float vf_result_upper_range = spu_madd( vf_NSweight,
+                                                                vf_result_upper_range_tmp,
+                                                                vf_EWtop_upper_range );
+                // convert back: using saturated arithmetic
+                vector unsigned int vui_result_lower_range = vfloat_to_vuint( vf_result_lower_range );
+                vector unsigned int vui_result_upper_range = vfloat_to_vuint( vf_result_upper_range );
+                // merge results->lower,upper
+                vector unsigned char vuc_mask_merge_result = { 0x03, 0x07, 0x0B, 0x0F,
+                                                               0x13, 0x17, 0x1B, 0x1F,
+                                                               0x00, 0x00, 0x00, 0x00,
+                                                               0x00, 0x00, 0x00, 0x00 };
+                vector unsigned char vuc_result = spu_shuffle( (vector unsigned char) vui_result_lower_range,
+                                                                (vector unsigned char) vui_result_upper_range,
+                                                                vuc_mask_merge_result );
+                // partial storing
+                vector unsigned char vuc_mask_out = { 0x00, 0x00, 0x00, 0x00,
+                                                      0x00, 0x00, 0x00, 0x00,
+                                                      0xFF, 0xFF, 0xFF, 0xFF,
+                                                      0xFF, 0xFF, 0xFF, 0xFF };
+                // get currently stored data
+                vector unsigned char vuc_orig = *((vector unsigned char*)dst);
+                // clear currently stored data
+                vuc_orig = spu_and( vuc_orig,
+                                spu_rlqwbyte( vuc_mask_out, ((unsigned int)dst)&0x0F) );
+                // rotate result according to storing address
+                vuc_result = spu_rlqwbyte( vuc_result, ((unsigned int)dst)&0x0F );
+                // store result
+                *((vector unsigned char*)dst) = spu_or( vuc_result,
+                                                        vuc_orig );
+                dst += 8;
+        }
+}
+/*
+ * bilinear_scale_line_w16()
+ *
+ * Processes one line of YUV input; the width has to be a multiple of 16.
+ * The scaled YUV output is written to a local store buffer.
+ *
+ * @param src buffer holding 2 lines of input
+ * @param dst_ buffer for 1 line of output
+ * @param dst_width the width of the destination line
+ * @param vf_x_scale a float vector holding the x_scale factor in each entry
+ * @param vf_NSweight a float vector holding the NORTH/SOUTH weight for the current line at each position
+ * @param src_linestride the stride of the source line
+ */
+void bilinear_scale_line_w16( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {
+        unsigned char* dst = dst_;
+        unsigned int dst_x;
+        for( dst_x=0; dst_x<dst_width; dst_x+=16) {
+                // address calculation for loading the 4 surrounding pixel of each calculated
+                // destination pixel
+                vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
+                // parallelised processing
+                // first range->pixel 1 2 3 4
+                // second range->pixel 5 6 7 8
+                // third range->pixel 9 10 11 12
+                // fourth range->pixel 13 14 15 16
+                vector unsigned int vui_inc_dst_x_first_range = { 0, 1, 2, 3 };
+                vector unsigned int vui_inc_dst_x_second_range = { 4, 5, 6, 7 };
+                vector unsigned int vui_inc_dst_x_third_range = { 8, 9, 10, 11 };
+                vector unsigned int vui_inc_dst_x_fourth_range = { 12, 13, 14, 15 };
+                vector unsigned int vui_dst_x_first_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_first_range );
+                vector unsigned int vui_dst_x_second_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_second_range );
+                vector unsigned int vui_dst_x_third_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_third_range );
+                vector unsigned int vui_dst_x_fourth_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_fourth_range );
+                // calculate weight EAST-WEST
+                vector float vf_dst_x_first_range = spu_convtf( vui_dst_x_first_range, 0 );
+                vector float vf_dst_x_second_range = spu_convtf( vui_dst_x_second_range, 0 );
+                vector float vf_dst_x_third_range = spu_convtf( vui_dst_x_third_range, 0 );
+                vector float vf_dst_x_fourth_range = spu_convtf( vui_dst_x_fourth_range, 0 );
+                vector float vf_src_x_first_range = spu_mul( vf_dst_x_first_range, vf_x_scale );
+                vector float vf_src_x_second_range = spu_mul( vf_dst_x_second_range, vf_x_scale );
+                vector float vf_src_x_third_range = spu_mul( vf_dst_x_third_range, vf_x_scale );
+                vector float vf_src_x_fourth_range = spu_mul( vf_dst_x_fourth_range, vf_x_scale );
+                vector unsigned int vui_interpl_x_first_range = spu_convtu( vf_src_x_first_range, 0 );
+                vector unsigned int vui_interpl_x_second_range = spu_convtu( vf_src_x_second_range, 0 );
+                vector unsigned int vui_interpl_x_third_range = spu_convtu( vf_src_x_third_range, 0 );
+                vector unsigned int vui_interpl_x_fourth_range = spu_convtu( vf_src_x_fourth_range, 0 );
+                vector float vf_interpl_x_first_range = spu_convtf( vui_interpl_x_first_range, 0 );
+                vector float vf_interpl_x_second_range = spu_convtf( vui_interpl_x_second_range, 0 );
+                vector float vf_interpl_x_third_range = spu_convtf( vui_interpl_x_third_range, 0 );
+                vector float vf_interpl_x_fourth_range = spu_convtf( vui_interpl_x_fourth_range, 0 );
+                vector float vf_EWweight_first_range = spu_sub( vf_src_x_first_range, vf_interpl_x_first_range );
+                vector float vf_EWweight_second_range = spu_sub( vf_src_x_second_range, vf_interpl_x_second_range );
+                vector float vf_EWweight_third_range = spu_sub( vf_src_x_third_range, vf_interpl_x_third_range );
+                vector float vf_EWweight_fourth_range = spu_sub( vf_src_x_fourth_range, vf_interpl_x_fourth_range );
+                // calculate address offset
+                //
+                // pixel NORTH WEST
+                vector unsigned int vui_off_pixelNW_first_range = vui_interpl_x_first_range;
+                vector unsigned int vui_off_pixelNW_second_range = vui_interpl_x_second_range;
+                vector unsigned int vui_off_pixelNW_third_range = vui_interpl_x_third_range;
+                vector unsigned int vui_off_pixelNW_fourth_range = vui_interpl_x_fourth_range;
+                // pixel NORTH EAST-->(offpixelNW+1)
+                vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
+                vector unsigned int vui_off_pixelNE_first_range = spu_add( vui_off_pixelNW_first_range, vui_add_1 );
+                vector unsigned int vui_off_pixelNE_second_range = spu_add( vui_off_pixelNW_second_range, vui_add_1 );
+                vector unsigned int vui_off_pixelNE_third_range = spu_add( vui_off_pixelNW_third_range, vui_add_1 );
+                vector unsigned int vui_off_pixelNE_fourth_range = spu_add( vui_off_pixelNW_fourth_range, vui_add_1 );
+                // SOUTH-WEST-->(offpixelNW+src_linestride)
+                vector unsigned int vui_srclinestride = spu_splats( src_linestride );
+                vector unsigned int vui_off_pixelSW_first_range = spu_add( vui_srclinestride, vui_off_pixelNW_first_range );
+                vector unsigned int vui_off_pixelSW_second_range = spu_add( vui_srclinestride, vui_off_pixelNW_second_range );
+                vector unsigned int vui_off_pixelSW_third_range = spu_add( vui_srclinestride, vui_off_pixelNW_third_range );
+                vector unsigned int vui_off_pixelSW_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNW_fourth_range );
+                // SOUTH-EAST-->(offpixelNW+src_linestride+1)
+                vector unsigned int vui_off_pixelSE_first_range = spu_add( vui_srclinestride, vui_off_pixelNE_first_range );
+                vector unsigned int vui_off_pixelSE_second_range = spu_add( vui_srclinestride, vui_off_pixelNE_second_range );
+                vector unsigned int vui_off_pixelSE_third_range = spu_add( vui_srclinestride, vui_off_pixelNE_third_range );
+                vector unsigned int vui_off_pixelSE_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNE_fourth_range );
+                // calculate each address
+                vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
+                vector unsigned int vui_addr_pixelNW_first_range = spu_add( vui_src_ls, vui_off_pixelNW_first_range );
+                vector unsigned int vui_addr_pixelNW_second_range = spu_add( vui_src_ls, vui_off_pixelNW_second_range );
+                vector unsigned int vui_addr_pixelNW_third_range = spu_add( vui_src_ls, vui_off_pixelNW_third_range );
+                vector unsigned int vui_addr_pixelNW_fourth_range = spu_add( vui_src_ls, vui_off_pixelNW_fourth_range );
+                vector unsigned int vui_addr_pixelNE_first_range = spu_add( vui_src_ls, vui_off_pixelNE_first_range );
+                vector unsigned int vui_addr_pixelNE_second_range = spu_add( vui_src_ls, vui_off_pixelNE_second_range );
+                vector unsigned int vui_addr_pixelNE_third_range = spu_add( vui_src_ls, vui_off_pixelNE_third_range );
+                vector unsigned int vui_addr_pixelNE_fourth_range = spu_add( vui_src_ls, vui_off_pixelNE_fourth_range );
+                vector unsigned int vui_addr_pixelSW_first_range = spu_add( vui_src_ls, vui_off_pixelSW_first_range );
+                vector unsigned int vui_addr_pixelSW_second_range = spu_add( vui_src_ls, vui_off_pixelSW_second_range );
+                vector unsigned int vui_addr_pixelSW_third_range = spu_add( vui_src_ls, vui_off_pixelSW_third_range );
+                vector unsigned int vui_addr_pixelSW_fourth_range = spu_add( vui_src_ls, vui_off_pixelSW_fourth_range );
+                vector unsigned int vui_addr_pixelSE_first_range = spu_add( vui_src_ls, vui_off_pixelSE_first_range );
+                vector unsigned int vui_addr_pixelSE_second_range = spu_add( vui_src_ls, vui_off_pixelSE_second_range );
+                vector unsigned int vui_addr_pixelSE_third_range = spu_add( vui_src_ls, vui_off_pixelSE_third_range );
+                vector unsigned int vui_addr_pixelSE_fourth_range = spu_add( vui_src_ls, vui_off_pixelSE_fourth_range );
+                // get each pixel
+                //
+                // scalar load, afterwards insertion into the right position
+                // NORTH WEST
+                // first range
+                vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+                vector unsigned char vuc_pixel_NW_first_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 0 )), null_vector, 3 );
+                vuc_pixel_NW_first_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 1 )),
+                                vuc_pixel_NW_first_range, 7 );
+                vuc_pixel_NW_first_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 2 )),
+                                vuc_pixel_NW_first_range, 11 );
+                vuc_pixel_NW_first_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 3 )),
+                                vuc_pixel_NW_first_range, 15 );
+                // second range
+                vector unsigned char vuc_pixel_NW_second_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 0 )), null_vector, 3 );
+                vuc_pixel_NW_second_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 1 )),
+                                vuc_pixel_NW_second_range, 7 );
+                vuc_pixel_NW_second_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 2 )),
+                                vuc_pixel_NW_second_range, 11 );
+                vuc_pixel_NW_second_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 3 )),
+                                vuc_pixel_NW_second_range, 15 );
+                // third range
+                vector unsigned char vuc_pixel_NW_third_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 0 )), null_vector, 3 );
+                vuc_pixel_NW_third_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 1 )),
+                                vuc_pixel_NW_third_range, 7 );
+                vuc_pixel_NW_third_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 2 )),
+                                vuc_pixel_NW_third_range, 11 );
+                vuc_pixel_NW_third_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 3 )),
+                                vuc_pixel_NW_third_range, 15 );
+                // fourth range
+                vector unsigned char vuc_pixel_NW_fourth_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 0 )), null_vector, 3 );
+                vuc_pixel_NW_fourth_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 1 )),
+                                vuc_pixel_NW_fourth_range, 7 );
+                vuc_pixel_NW_fourth_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 2 )),
+                                vuc_pixel_NW_fourth_range, 11 );
+                vuc_pixel_NW_fourth_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 3 )),
+                                vuc_pixel_NW_fourth_range, 15 );
+                // NORTH EAST
+                // first range
+                vector unsigned char vuc_pixel_NE_first_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 0 )), null_vector, 3 );
+                vuc_pixel_NE_first_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 1 )),
+                                vuc_pixel_NE_first_range, 7 );
+                vuc_pixel_NE_first_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 2 )),
+                                vuc_pixel_NE_first_range, 11 );
+                vuc_pixel_NE_first_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 3 )),
+                                vuc_pixel_NE_first_range, 15 );
+                // second range
+                vector unsigned char vuc_pixel_NE_second_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 0 )), null_vector, 3 );
+                vuc_pixel_NE_second_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 1 )),
+                                vuc_pixel_NE_second_range, 7 );
+                vuc_pixel_NE_second_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 2 )),
+                                vuc_pixel_NE_second_range, 11 );
+                vuc_pixel_NE_second_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 3 )),
+                                vuc_pixel_NE_second_range, 15 );
+                // third range
+                vector unsigned char vuc_pixel_NE_third_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 0 )), null_vector, 3 );
+                vuc_pixel_NE_third_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 1 )),
+                                vuc_pixel_NE_third_range, 7 );
+                vuc_pixel_NE_third_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 2 )),
+                                vuc_pixel_NE_third_range, 11 );
+                vuc_pixel_NE_third_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 3 )),
+                                vuc_pixel_NE_third_range, 15 );
+                // fourth range
+                vector unsigned char vuc_pixel_NE_fourth_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 0 )), null_vector, 3 );
+                vuc_pixel_NE_fourth_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 1 )),
+                                vuc_pixel_NE_fourth_range, 7 );
+                vuc_pixel_NE_fourth_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 2 )),
+                                vuc_pixel_NE_fourth_range, 11 );
+                vuc_pixel_NE_fourth_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 3 )),
+                                vuc_pixel_NE_fourth_range, 15 );
+                // SOUTH WEST
+                // first range
+                vector unsigned char vuc_pixel_SW_first_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 0 )), null_vector, 3 );
+                vuc_pixel_SW_first_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 1 )),
+                                vuc_pixel_SW_first_range, 7 );
+                vuc_pixel_SW_first_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 2 )),
+                                vuc_pixel_SW_first_range, 11 );
+                vuc_pixel_SW_first_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 3 )),
+                                vuc_pixel_SW_first_range, 15 );
+                // second range
+                vector unsigned char vuc_pixel_SW_second_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 0 )), null_vector, 3 );
+                vuc_pixel_SW_second_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 1 )),
+                                vuc_pixel_SW_second_range, 7 );
+                vuc_pixel_SW_second_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 2 )),
+                                vuc_pixel_SW_second_range, 11 );
+                vuc_pixel_SW_second_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 3 )),
+                                vuc_pixel_SW_second_range, 15 );
+                // third range
+                vector unsigned char vuc_pixel_SW_third_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 0 )), null_vector, 3 );
+                vuc_pixel_SW_third_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 1 )),
+                                vuc_pixel_SW_third_range, 7 );
+                vuc_pixel_SW_third_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 2 )),
+                                vuc_pixel_SW_third_range, 11 );
+                vuc_pixel_SW_third_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 3 )),
+                                vuc_pixel_SW_third_range, 15 );
+                // fourth range
+                vector unsigned char vuc_pixel_SW_fourth_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 0 )), null_vector, 3 );
+                vuc_pixel_SW_fourth_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 1 )),
+                                vuc_pixel_SW_fourth_range, 7 );
+                vuc_pixel_SW_fourth_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 2 )),
+                                vuc_pixel_SW_fourth_range, 11 );
+                vuc_pixel_SW_fourth_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 3 )),
+                                vuc_pixel_SW_fourth_range, 15 );
+                // SOUTH EAST
+                // first range
+                vector unsigned char vuc_pixel_SE_first_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 0 )), null_vector, 3 );
+                vuc_pixel_SE_first_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 1 )),
+                                vuc_pixel_SE_first_range, 7 );
+                vuc_pixel_SE_first_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 2 )),
+                                vuc_pixel_SE_first_range, 11 );
+                vuc_pixel_SE_first_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 3 )),
+                                vuc_pixel_SE_first_range, 15 );
+                // second range
+                vector unsigned char vuc_pixel_SE_second_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 0 )), null_vector, 3 );
+                vuc_pixel_SE_second_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 1 )),
+                                vuc_pixel_SE_second_range, 7 );
+                vuc_pixel_SE_second_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 2 )),
+                                vuc_pixel_SE_second_range, 11 );
+                vuc_pixel_SE_second_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 3 )),
+                                vuc_pixel_SE_second_range, 15 );
+                // third range
+                vector unsigned char vuc_pixel_SE_third_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 0 )), null_vector, 3 );
+                vuc_pixel_SE_third_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 1 )),
+                                vuc_pixel_SE_third_range, 7 );
+                vuc_pixel_SE_third_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 2 )),
+                                vuc_pixel_SE_third_range, 11 );
+                vuc_pixel_SE_third_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 3 )),
+                                vuc_pixel_SE_third_range, 15 );
+                // fourth range
+                vector unsigned char vuc_pixel_SE_fourth_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 0 )), null_vector, 3 );
+                vuc_pixel_SE_fourth_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 1 )),
+                                vuc_pixel_SE_fourth_range, 7 );
+                vuc_pixel_SE_fourth_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 2 )),
+                                vuc_pixel_SE_fourth_range, 11 );
+                vuc_pixel_SE_fourth_range = spu_insert(
+                                *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 3 )),
+                                vuc_pixel_SE_fourth_range, 15 );
+                // convert to float
+                vector float vf_pixel_NW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_first_range, 0 );
+                vector float vf_pixel_NW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_second_range, 0 );
+                vector float vf_pixel_NW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_third_range, 0 );
+                vector float vf_pixel_NW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_fourth_range, 0 );
+                vector float vf_pixel_NE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_first_range, 0 );
+                vector float vf_pixel_NE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_second_range, 0 );
+                vector float vf_pixel_NE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_third_range, 0 );
+                vector float vf_pixel_NE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_fourth_range, 0 );
+                vector float vf_pixel_SW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_first_range, 0 );
+                vector float vf_pixel_SW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_second_range, 0 );
+                vector float vf_pixel_SW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_third_range, 0 );
+                vector float vf_pixel_SW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_fourth_range, 0 );
+                vector float vf_pixel_SE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_first_range, 0 );
+                vector float vf_pixel_SE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_second_range, 0 );
+                vector float vf_pixel_SE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_third_range, 0 );
+                vector float vf_pixel_SE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_fourth_range, 0 );
+                // first linear interpolation: EWtop
+                // EWtop = NW + EWweight*(NE-NW)
+                //
+                // first range
+                vector float vf_EWtop_first_range_tmp = spu_sub( vf_pixel_NE_first_range, vf_pixel_NW_first_range );
+                vector float vf_EWtop_first_range = spu_madd( vf_EWweight_first_range,
+                                                                vf_EWtop_first_range_tmp,
+                                                                vf_pixel_NW_first_range );
+                // second range
+                vector float vf_EWtop_second_range_tmp = spu_sub( vf_pixel_NE_second_range, vf_pixel_NW_second_range );
+                vector float vf_EWtop_second_range = spu_madd( vf_EWweight_second_range,
+                                                                vf_EWtop_second_range_tmp,
+                                                                vf_pixel_NW_second_range );
+                // third range
+                vector float vf_EWtop_third_range_tmp = spu_sub( vf_pixel_NE_third_range, vf_pixel_NW_third_range );
+                vector float vf_EWtop_third_range = spu_madd( vf_EWweight_third_range,
+                                                                vf_EWtop_third_range_tmp,
+                                                                vf_pixel_NW_third_range );
+                // fourth range
+                vector float vf_EWtop_fourth_range_tmp = spu_sub( vf_pixel_NE_fourth_range, vf_pixel_NW_fourth_range );
+                vector float vf_EWtop_fourth_range = spu_madd( vf_EWweight_fourth_range,
+                                                                vf_EWtop_fourth_range_tmp,
+                                                                vf_pixel_NW_fourth_range );
+                // second linear interpolation: EWbottom
+                // EWbottom = SW + EWweight*(SE-SW)
+                //
+                // first range
+                vector float vf_EWbottom_first_range_tmp = spu_sub( vf_pixel_SE_first_range, vf_pixel_SW_first_range );
+                vector float vf_EWbottom_first_range = spu_madd( vf_EWweight_first_range,
+                                                                vf_EWbottom_first_range_tmp,
+                                                                vf_pixel_SW_first_range );
+                // second range
+                vector float vf_EWbottom_second_range_tmp = spu_sub( vf_pixel_SE_second_range, vf_pixel_SW_second_range );
+                vector float vf_EWbottom_second_range = spu_madd( vf_EWweight_second_range,
+                                                                vf_EWbottom_second_range_tmp,
+                                                                vf_pixel_SW_second_range );
+                // third range
+                vector float vf_EWbottom_third_range_tmp = spu_sub( vf_pixel_SE_third_range, vf_pixel_SW_third_range );
+                vector float vf_EWbottom_third_range = spu_madd( vf_EWweight_third_range,
+                                                                vf_EWbottom_third_range_tmp,
+                                                                vf_pixel_SW_third_range );
+                // fourth range
+                vector float vf_EWbottom_fourth_range_tmp = spu_sub( vf_pixel_SE_fourth_range, vf_pixel_SW_fourth_range );
+                vector float vf_EWbottom_fourth_range = spu_madd( vf_EWweight_fourth_range,
+                                                                vf_EWbottom_fourth_range_tmp,
+                                                                vf_pixel_SW_fourth_range );
+                // third linear interpolation: the bilinear interpolated value
+                // result = EWtop + NSweight*(EWbottom-EWtop);
+                //
+                // first range
+                vector float vf_result_first_range_tmp = spu_sub( vf_EWbottom_first_range, vf_EWtop_first_range );
+                vector float vf_result_first_range = spu_madd( vf_NSweight,
+                                                                vf_result_first_range_tmp,
+                                                                vf_EWtop_first_range );
+                // second range
+                vector float vf_result_second_range_tmp = spu_sub( vf_EWbottom_second_range, vf_EWtop_second_range );
+                vector float vf_result_second_range = spu_madd( vf_NSweight,
+                                                                vf_result_second_range_tmp,
+                                                                vf_EWtop_second_range );
+                // third range
+                vector float vf_result_third_range_tmp = spu_sub( vf_EWbottom_third_range, vf_EWtop_third_range );
+                vector float vf_result_third_range = spu_madd( vf_NSweight,
+                                                                vf_result_third_range_tmp,
+                                                                vf_EWtop_third_range );
+                // fourth range
+                vector float vf_result_fourth_range_tmp = spu_sub( vf_EWbottom_fourth_range, vf_EWtop_fourth_range );
+                vector float vf_result_fourth_range = spu_madd( vf_NSweight,
+                                                                vf_result_fourth_range_tmp,
+                                                                vf_EWtop_fourth_range );
+                // convert back: using saturated arithmetic
+                vector unsigned int vui_result_first_range = vfloat_to_vuint( vf_result_first_range );
+                vector unsigned int vui_result_second_range = vfloat_to_vuint( vf_result_second_range );
+                vector unsigned int vui_result_third_range = vfloat_to_vuint( vf_result_third_range );
+                vector unsigned int vui_result_fourth_range = vfloat_to_vuint( vf_result_fourth_range );
+                // merge results->lower,upper
+                vector unsigned char vuc_mask_merge_result_first_second = { 0x03, 0x07, 0x0B, 0x0F,
+                                                                            0x13, 0x17, 0x1B, 0x1F,
+                                                                            0x00, 0x00, 0x00, 0x00,
+                                                                            0x00, 0x00, 0x00, 0x00 };
+                vector unsigned char vuc_mask_merge_result_third_fourth = { 0x00, 0x00, 0x00, 0x00,
+                                                                            0x00, 0x00, 0x00, 0x00,
+                                                                            0x03, 0x07, 0x0B, 0x0F,
+                                                                            0x13, 0x17, 0x1B, 0x1F };
+                vector unsigned char vuc_result_first_second =
+                                                spu_shuffle( (vector unsigned char) vui_result_first_range,
+                                                                 (vector unsigned char) vui_result_second_range,
+                                                                vuc_mask_merge_result_first_second );
+                vector unsigned char vuc_result_third_fourth =
+                                                spu_shuffle( (vector unsigned char) vui_result_third_range,
+                                                                 (vector unsigned char) vui_result_fourth_range,
+                                                                vuc_mask_merge_result_third_fourth );
+                // store result
+                *((vector unsigned char*)dst) = spu_or( vuc_result_first_second,
+                                                        vuc_result_third_fourth );
+                dst += 16;
+        }
+}
diff --git a/apps/plugins/sdl/src/video/ps3/spulibs/fb_writer.c b/apps/plugins/sdl/src/video/ps3/spulibs/fb_writer.c
new file mode 100644
index 0000000000..0eb51cc682
--- /dev/null
+++ b/apps/plugins/sdl/src/video/ps3/spulibs/fb_writer.c
@@ -0,0 +1,193 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
+ *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ *  SPE code based on research by:
+ *  Rene Becker
+ *  Thimo Emmerich
+ */
+#include "spu_common.h"
+#include <spu_intrinsics.h>
+#include <spu_mfcio.h>
+#include <stdio.h>
+#include <string.h>
+// Debugging
+// Remove the comment marker from the next line to enable the deprintf() trace output.
+//#define DEBUG
+#ifdef DEBUG
+/* Debug build: print the formatted message to stdout and flush it right away.
+ * NOTE(review): this expands to two statements, so it must not be used as the
+ * sole body of an unbraced if/else. */
+#define deprintf(fmt, args... ) \
+        fprintf( stdout, fmt, ##args ); \
+        fflush( stdout );
+#else
+/* Non-debug build: deprintf() expands to nothing, its arguments are not evaluated */
+#define deprintf( fmt, args... )
+#endif
+/* Copies the image described by parms to the framebuffer; the argument is the
+ * base MFC tag id, the copy itself uses the tags base+1 .. base+4 */
+void cpy_to_fb(unsigned int);
+/* fb_writer_spu parms: local-store copy of the parameter block, filled by DMA in main() */
+static volatile struct fb_writer_parms_t parms __attribute__ ((aligned(128)));
+/*
+ * Code running on SPU: entry point of the framebuffer writer.
+ *
+ * Posts SPU_READY once, then serves mailbox requests in an endless loop.
+ * SPU_START is followed by a second mailbox word carrying the effective
+ * address of a struct fb_writer_parms_t; that block is fetched by DMA into
+ * parms, cpy_to_fb() performs the copy, and SPU_FIN is posted afterwards.
+ * SPU_EXIT ends the program; every other message is skipped.
+ *
+ * Returns 0, both for SPU_EXIT and when the MFC tags cannot be reserved.
+ */
+int main(unsigned long long spe_id __attribute__ ((unused)), unsigned long long argp __attribute__ ((unused)))
+{
+        uint32_t request;
+        uint32_t parms_ea;
+        unsigned int tag_base;
+        deprintf("[SPU] fb_writer_spu is up... (on SPE #%llu)\n", spe_id);
+        /* Announce that this SPU program is up */
+        spu_write_out_mbox(SPU_READY);
+        for (;;) {
+                /* Fetch the next request from the inbound mailbox */
+                request = spu_read_in_mbox();
+                deprintf("[SPU] Message is %u\n", request);
+                if (request == SPU_EXIT) {
+                        deprintf("[SPU] fb_writer goes down...\n");
+                        return 0;
+                }
+                if (request != SPU_START) {
+                        deprintf("[SPU] Cannot handle message\n");
+                        continue;
+                }
+                /* Five tags: the first for the parameter fetch, the other four for the line buffers */
+                tag_base = mfc_multi_tag_reserve(5);
+                if (tag_base == MFC_TAG_INVALID) {
+                        deprintf("[SPU] Failed to reserve mfc tags on fb_writer\n");
+                        return 0;
+                }
+                /* The second mailbox word is the address of the parameter block */
+                parms_ea = spu_read_in_mbox();
+                deprintf("[SPU] Message on fb_writer is %u\n", parms_ea);
+                spu_mfcdma32(&parms, (unsigned int)parms_ea,
+                                sizeof(struct fb_writer_parms_t), tag_base,
+                                MFC_GET_CMD);
+                deprintf("[SPU] argp = %u\n", (unsigned int)argp);
+                /* parms must be complete before cpy_to_fb() reads it */
+                DMA_WAIT_TAG(tag_base);
+                deprintf("[SPU] Copying to framebuffer started\n");
+                cpy_to_fb(tag_base);
+                deprintf("[SPU] Copying to framebuffer done!\n");
+                mfc_multi_tag_release(tag_base, 5);
+                deprintf("[SPU] fb_writer_spu... done!\n");
+                /* Report completion of this request */
+                spu_write_out_mbox(SPU_FIN);
+        }
+        return 0;
+}
+/*
+ * cpy_to_fb()
+ *
+ * Copies the image described by the file-level parms block to the
+ * framebuffer, one line per DMA transfer, cycling through four local-store
+ * buffers. Every buffer has its own tag (tag_id_base + 1 .. + 4): the line
+ * is fetched into the buffer, the fetch is waited for, and the store to the
+ * framebuffer is issued without waiting, so it can overlap with the work on
+ * the following buffers.
+ *
+ * Lines are processed in groups of four; the loop runs
+ * bounded_input_height >> 2 times.
+ * NOTE(review): a height that is not a multiple of four leaves the last
+ * 1-3 lines uncopied - confirm that the PPU side rounds the height.
+ *
+ * @param tag_id_base first of the MFC tags reserved by the caller
+ */
+void cpy_to_fb(unsigned int tag_id_base)
+{
+        unsigned int i;
+        uint8_t *in = parms.data;
+        /* Align fb pointer which was centered before */
+        uint8_t *fb =
+            (unsigned char *)((unsigned int)parms.center & 0xFFFFFFF0);
+        uint32_t bounded_input_height = parms.bounded_input_height;
+        uint32_t bounded_input_width = parms.bounded_input_width;
+        uint32_t fb_pixel_size = parms.fb_pixel_size;
+        uint32_t out_line_stride = parms.out_line_stride;
+        uint32_t in_line_stride = parms.in_line_stride;
+        /* Bytes moved per line, for the fetch as well as for the store */
+        uint32_t in_line_size = bounded_input_width * fb_pixel_size;
+        /* Local store buffer */
+        static volatile uint8_t buf[4][BUFFER_SIZE]
+            __attribute__ ((aligned(128)));
+        /* A line has to fit into one buffer, otherwise the fetch would write
+         * past the end of buf[] and corrupt the local store */
+        if (in_line_size > BUFFER_SIZE) {
+                deprintf("[SPU] Line of %u bytes exceeds buffer of %u bytes, nothing copied\n",
+                       (unsigned int)in_line_size, (unsigned int)BUFFER_SIZE);
+                return;
+        }
+        /* do 4-times multibuffering using DMA list, process in two steps */
+        for (i = 0; i < bounded_input_height >> 2; i++) {
+                /* first buffer */
+                // the store issued from this buffer in the previous round must be finished
+                DMA_WAIT_TAG(tag_id_base + 1);
+                // retrieve buffer
+                spu_mfcdma32(buf[0], (unsigned int)in, in_line_size,
+                             tag_id_base + 1, MFC_GETB_CMD);
+                DMA_WAIT_TAG(tag_id_base + 1);
+                // store buffer
+                spu_mfcdma32(buf[0], (unsigned int)fb, in_line_size,
+                             tag_id_base + 1, MFC_PUTB_CMD);
+                in += in_line_stride;
+                fb += out_line_stride;
+                deprintf("[SPU] 1st buffer copied in=0x%x, fb=0x%x\n", in,
+                       fb);
+                /* second buffer */
+                DMA_WAIT_TAG(tag_id_base + 2);
+                // retrieve buffer
+                spu_mfcdma32(buf[1], (unsigned int)in, in_line_size,
+                             tag_id_base + 2, MFC_GETB_CMD);
+                DMA_WAIT_TAG(tag_id_base + 2);
+                // store buffer
+                spu_mfcdma32(buf[1], (unsigned int)fb, in_line_size,
+                             tag_id_base + 2, MFC_PUTB_CMD);
+                in += in_line_stride;
+                fb += out_line_stride;
+                deprintf("[SPU] 2nd buffer copied in=0x%x, fb=0x%x\n", in,
+                       fb);
+                /* third buffer */
+                DMA_WAIT_TAG(tag_id_base + 3);
+                // retrieve buffer
+                spu_mfcdma32(buf[2], (unsigned int)in, in_line_size,
+                             tag_id_base + 3, MFC_GETB_CMD);
+                DMA_WAIT_TAG(tag_id_base + 3);
+                // store buffer
+                spu_mfcdma32(buf[2], (unsigned int)fb, in_line_size,
+                             tag_id_base + 3, MFC_PUTB_CMD);
+                in += in_line_stride;
+                fb += out_line_stride;
+                deprintf("[SPU] 3rd buffer copied in=0x%x, fb=0x%x\n", in,
+                       fb);
+                /* fourth buffer */
+                DMA_WAIT_TAG(tag_id_base + 4);
+                // retrieve buffer
+                spu_mfcdma32(buf[3], (unsigned int)in, in_line_size,
+                             tag_id_base + 4, MFC_GETB_CMD);
+                DMA_WAIT_TAG(tag_id_base + 4);
+                // store buffer
+                spu_mfcdma32(buf[3], (unsigned int)fb, in_line_size,
+                             tag_id_base + 4, MFC_PUTB_CMD);
+                in += in_line_stride;
+                fb += out_line_stride;
+                deprintf("[SPU] 4th buffer copied in=0x%x, fb=0x%x\n", in,
+                       fb);
+                deprintf("[SPU] Loop #%i, bounded_input_height=%i\n", i,
+                       bounded_input_height >> 2);
+        }
+        /* Drain the outstanding stores of all four buffers before returning,
+         * so the caller does not report completion while data is in flight */
+        DMA_WAIT_TAG(tag_id_base + 1);
+        DMA_WAIT_TAG(tag_id_base + 2);
+        DMA_WAIT_TAG(tag_id_base + 3);
+        DMA_WAIT_TAG(tag_id_base + 4);
+}
diff --git a/apps/plugins/sdl/src/video/ps3/spulibs/spu_common.h b/apps/plugins/sdl/src/video/ps3/spulibs/spu_common.h
new file mode 100644
index 0000000000..42c328c83d
--- /dev/null
+++ b/apps/plugins/sdl/src/video/ps3/spulibs/spu_common.h
@@ -0,0 +1,108 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
+ *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ *  SPE code based on research by:
+ *  Rene Becker
+ *  Thimo Emmerich
+ */
+/* Common definitions/macros for SPUs */
+#ifndef _SPU_COMMON_H
+#define _SPU_COMMON_H
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+/* Tag management */
+/* Selects the single tag group _tag in the MFC tag mask and then reads the
+ * tag status, waiting until all transfers issued under that tag are done.
+ * NOTE(review): expands to two statements - always use it inside braces,
+ * never as the sole body of an unbraced if/else/for. */
+#define DMA_WAIT_TAG(_tag)     \
+    mfc_write_tag_mask(1<<(_tag)); \
+    mfc_read_tag_status_all();
+/* SPU mailbox messages */
+/* SPU -> PPU: the SPU program has started and is waiting for requests */
+#define SPU_READY       0
+/* PPU -> SPU: start a job; the next mailbox word is the address of the parameter block */
+#define SPU_START       1
+/* SPU -> PPU: the job is finished */
+#define SPU_FIN         2
+/* PPU -> SPU: terminate the SPU program */
+#define SPU_EXIT        3
+/* Tags */
+/* Fixed DMA tags used by the yuv2rgb converter: fetches of input buffer n use
+ * RETR_BUF + n, stores of converted data use STR_BUF.
+ * NOTE(review): RETR_BUF + 1 equals STR_BUF, so fetches into the second
+ * input buffer and the stores share one tag group. */
+#define RETR_BUF        0
+#define STR_BUF         1
+#define TAG_INIT        2
+/* Buffersizes */
+#define MAX_HDTV_WIDTH 1920
+#define MAX_HDTV_HEIGHT 1080
+/* One stride of HDTV (equals MAX_HDTV_WIDTH * 4, presumably 4 bytes per pixel) */
+#define BUFFER_SIZE 7680
+/*
+ * fb_writer ppu/spu exchange parms
+ *
+ * Parameter block that the fb_writer SPU program fetches by DMA and that
+ * describes one line-by-line copy into the framebuffer.
+ */
+struct fb_writer_parms_t {
+        /* address of the first source line */
+        uint8_t *data;
+        /* destination address in the framebuffer; the SPU rounds it down to a 16-byte boundary before use */
+        uint8_t *center;
+        /* bytes added to the destination address after every line */
+        uint32_t out_line_stride;
+        /* bytes added to the source address after every line */
+        uint32_t in_line_stride;
+        /* number of lines to copy; the SPU handles them in groups of four */
+        uint32_t bounded_input_height;
+        /* multiplied by fb_pixel_size to get the number of bytes transferred per line */
+        uint32_t bounded_input_width;
+        /* factor applied to bounded_input_width, presumably the bytes per framebuffer pixel */
+        uint32_t fb_pixel_size;
+        /* This padding is to fulfill the need for 16 byte alignment. On parm change, update! */
+        char padding[4];
+} __attribute__((aligned(128)));
+/*
+ * yuv2rgb ppu/spu exchange parms
+ *
+ * Parameter block that the yuv2rgb converter SPU program fetches by DMA.
+ */
+struct yuv2rgb_parms_t {
+        /* addresses of the three source planes, read by DMA */
+        uint8_t* y_plane;
+        uint8_t* v_plane;
+        uint8_t* u_plane;
+        /* address that receives the converted BGRA lines */
+        uint8_t* dstBuffer;
+        /* source picture size in pixels; the converter uses the width as the
+         * Y line stride and half of it as the V/U line stride */
+        unsigned int src_pixel_width;
+        unsigned int src_pixel_height;
+        /* Fills the members above up to a 128 byte boundary (the original note
+         * speaks of 16 byte alignment). On parm change, update! */
+        char padding[128 - ((4 * sizeof(uint8_t *) + 2 * sizeof(unsigned int)) & 0x7F)];
+} __attribute__((aligned(128)));
+/*
+ * bilin_scaler ppu/spu exchange parms
+ *
+ * NOTE(review): the consumer of this block is not part of the code reviewed
+ * here; the member notes below are derived from the names and from the
+ * parallel layout of yuv2rgb_parms_t - confirm against the scaler.
+ */
+struct scale_parms_t {
+        /* presumably the addresses of the three source planes */
+        uint8_t* y_plane;
+        uint8_t* v_plane;
+        uint8_t* u_plane;
+        /* presumably the address that receives the scaled output */
+        uint8_t* dstBuffer;
+        /* presumably the source picture size in pixels */
+        unsigned int src_pixel_width;
+        unsigned int src_pixel_height;
+        /* presumably the requested output size in pixels */
+        unsigned int dst_pixel_width;
+        unsigned int dst_pixel_height;
+        /* Fills the members above up to a 128 byte boundary (the original note
+         * speaks of 16 byte alignment). On parm change, update! */
+        char padding[128 - ((4 * sizeof(uint8_t *) + 4 * sizeof(unsigned int)) & 0x7F)];
+} __attribute__((aligned(128)));
+#endif /* _SPU_COMMON_H */
diff --git a/apps/plugins/sdl/src/video/ps3/spulibs/yuv2rgb_converter.c b/apps/plugins/sdl/src/video/ps3/spulibs/yuv2rgb_converter.c
new file mode 100644
index 0000000000..5e166914c5
--- /dev/null
+++ b/apps/plugins/sdl/src/video/ps3/spulibs/yuv2rgb_converter.c
@@ -0,0 +1,629 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
+ *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ *  SPE code based on research by:
+ *  Rene Becker
+ *  Thimo Emmerich
+ */
+#include "spu_common.h"
+#include <spu_intrinsics.h>
+#include <spu_mfcio.h>
+// Debugging
+// Remove the comment marker from the next line to enable the deprintf() trace output.
+//#define DEBUG
+#ifdef DEBUG
+/* Debug build: print the formatted message to stdout and flush it right away.
+ * NOTE(review): this expands to two statements, so it must not be used as the
+ * sole body of an unbraced if/else. */
+#define deprintf(fmt, args... ) \
+        fprintf( stdout, fmt, ##args ); \
+        fflush( stdout );
+#else
+/* Non-debug build: deprintf() expands to nothing, its arguments are not evaluated */
+#define deprintf( fmt, args... )
+#endif
+/* Local-store copy of the job parameters, filled by DMA in main() */
+struct yuv2rgb_parms_t parms_converter __attribute__((aligned(128)));
+/* A maximum of 8 lines Y, therefore 4 lines V, 4 lines U are stored
+ * (two buffers of 4 Y lines and 2 V/U lines each, selected by the first index).
+ * There might be the need to retrieve misaligned data, adjust
+ * incoming v and u plane to be able to handle this (add 128)
+ */
+unsigned char y_plane[2][(MAX_HDTV_WIDTH + 128) * 4] __attribute__((aligned(128)));
+unsigned char v_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128)));
+unsigned char u_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128)));
+/* A maximum of 4 lines BGRA are stored, 4 byte per pixel */
+unsigned char bgra[4 * MAX_HDTV_WIDTH * 4] __attribute__((aligned(128)));
+/* some vectors needed by the float to int conversion: upper and lower clamp limit */
+static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
+static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f };
+/* Whole-picture conversion; main() picks w32 when the width is a multiple of 32, w16 otherwise */
+void yuv_to_rgb_w16();
+void yuv_to_rgb_w32();
+/* Conversion of one group of lines held in local store; called twice per 4-line block, once per pair of lines */
+void yuv_to_rgb_w16_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr, unsigned int width);
+void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width);
+/*
+ * SPU entry point of the YUV to RGB converter.
+ *
+ * Posts SPU_READY once, then serves mailbox requests in an endless loop.
+ * SPU_START is followed by a second mailbox word carrying the effective
+ * address of a struct yuv2rgb_parms_t; that block is fetched by DMA into
+ * parms_converter, the picture is converted, and SPU_FIN is posted.
+ * SPU_EXIT ends the program; every other message is skipped.
+ *
+ * Returns 0, both for SPU_EXIT and when the MFC tag cannot be reserved.
+ */
+int main(unsigned long long spe_id __attribute__((unused)), unsigned long long argp __attribute__ ((unused)))
+{
+        deprintf("[SPU] yuv2rgb_spu is up... (on SPE #%llu)\n", spe_id);
+        uint32_t ea_mfc, mbox;
+        // send ready message
+        spu_write_out_mbox(SPU_READY);
+        while (1) {
+                /* Check mailbox */
+                mbox = spu_read_in_mbox();
+                deprintf("[SPU] Message is %u\n", mbox);
+                switch (mbox) {
+                        case SPU_EXIT:
+                                /* name this program, not fb_writer, in the trace */
+                                deprintf("[SPU] yuv2rgb_spu goes down...\n");
+                                return 0;
+                        case SPU_START:
+                                break;
+                        default:
+                                deprintf("[SPU] Cannot handle message\n");
+                                continue;
+                }
+                /* Tag Manager setup */
+                /* NOTE(review): this tag is only used for the parameter fetch below;
+                 * the conversion routines use the fixed tags RETR_BUF and STR_BUF. */
+                unsigned int tag_id;
+                tag_id = mfc_multi_tag_reserve(1);
+                if (tag_id == MFC_TAG_INVALID) {
+                        deprintf("[SPU] Failed to reserve mfc tags on yuv2rgb_converter\n");
+                        return 0;
+                }
+                /* DMA transfer for the input parameters */
+                ea_mfc = spu_read_in_mbox();
+                deprintf("[SPU] Message on yuv2rgb_converter is %u\n", ea_mfc);
+                spu_mfcdma32(&parms_converter, (unsigned int)ea_mfc, sizeof(struct yuv2rgb_parms_t), tag_id, MFC_GET_CMD);
+                /* parms_converter must be complete before it is read */
+                DMA_WAIT_TAG(tag_id);
+                /* There are alignment issues that involve handling of special cases
+                 * a width of 32 results in a width of 16 in the chrominance
+                 * --> choose the proper handling to optimize the performance
+                 */
+                deprintf("[SPU] Convert %ix%i from YUV to RGB\n", parms_converter.src_pixel_width, parms_converter.src_pixel_height);
+                /* widths that are not a multiple of 32 take the w16 path */
+                if (parms_converter.src_pixel_width & 0x1f) {
+                        deprintf("[SPU] Using yuv_to_rgb_w16\n");
+                        yuv_to_rgb_w16();
+                } else {
+                        deprintf("[SPU] Using yuv_to_rgb_w32\n");
+                        yuv_to_rgb_w32();
+                }
+                mfc_multi_tag_release(tag_id, 1);
+                deprintf("[SPU] yuv2rgb_spu... done!\n");
+                /* Send FIN message */
+                spu_write_out_mbox(SPU_FIN);
+        }
+        return 0;
+}
+/*
+ * float_to_char()
+ *
+ * Converts a single float to an unsigned char with saturation: the value
+ * is first limited to the range 0.1 .. 255.0 and then cast.
+ *
+ * @param s float for conversion
+ * @returns converted character
+ */
+inline static unsigned char float_to_char(float s) {
+        /* replicate the scalar so the vector compare/select can be used */
+        vector float clamped = spu_splats(s);
+        /* lower limit: take 0.1 wherever 0.1 is greater than the value */
+        clamped = spu_sel(clamped, vec_0_1, spu_cmpgt(vec_0_1, clamped));
+        /* upper limit: take 255.0 wherever the value is greater than 255.0 */
+        clamped = spu_sel(clamped, vec_255, spu_cmpgt(clamped, vec_255));
+        /* every element holds the same value, element 0 is as good as any */
+        return (unsigned char) spu_extract(clamped, 0);
+}
+/*
+ * vfloat_to_vuint()
+ *
+ * Converts a float vector to an unsigned int vector with saturation:
+ * every element is first limited to the range 0.1 .. 255.0 and then
+ * converted with spu_convtu() using a scale of 0.
+ *
+ * @param vec_s float vector for conversion
+ * @returns converted unsigned int vector
+ */
+inline static vector unsigned int vfloat_to_vuint(vector float vec_s) {
+        /* lower limit: elements for which 0.1 is greater are replaced by 0.1 */
+        const vector unsigned int too_small = spu_cmpgt(vec_0_1, vec_s);
+        const vector float floored = spu_sel(vec_s, vec_0_1, too_small);
+        /* upper limit: elements greater than 255.0 are replaced by 255.0 */
+        const vector unsigned int too_large = spu_cmpgt(floored, vec_255);
+        const vector float clamped = spu_sel(floored, vec_255, too_large);
+        return spu_convtu(clamped, 0);
+}
+/*
+ * yuv_to_rgb_w16()
+ *
+ * Converts the picture described by parms_converter from planar YUV to
+ * BGRA. main() selects this variant when the width is not a multiple of 32.
+ *
+ * The picture is processed in blocks of four lines: 4 lines of Y and
+ * 2 lines each of V and U are fetched into one of two local-store input
+ * buffers, converted into the single bgra staging buffer by two calls of
+ * yuv_to_rgb_w16_line(), and written back with two DMA puts.
+ * While one input buffer is converted, the other already holds the next
+ * block.
+ *
+ * Pictures with fewer than four lines are not processed.
+ * NOTE(review): height >> 2 blocks are converted, so with a height that is
+ * not a multiple of four the last 1-3 lines stay unconverted - confirm
+ * that callers only pass suitable heights.
+ */
+void yuv_to_rgb_w16() {
+        // Pixel dimensions of the picture
+        uint32_t width, height;
+        // Extract parameters
+        width = parms_converter.src_pixel_width;
+        height = parms_converter.src_pixel_height;
+        // Fewer than four lines: there is no complete block, and the unsigned
+        // loop bound (height>>2)-1 below would wrap around to a huge count
+        if ((height>>2) == 0) {
+                return;
+        }
+        // Plane data management
+        // Y
+        unsigned char* ram_addr_y = parms_converter.y_plane;
+        // V
+        unsigned char* ram_addr_v = parms_converter.v_plane;
+        // U
+        unsigned char* ram_addr_u = parms_converter.u_plane;
+        // BGRA
+        unsigned char* ram_addr_bgra = parms_converter.dstBuffer;
+        // Strides
+        unsigned int stride_y = width;
+        unsigned int stride_vu = width>>1;
+        // Buffer management
+        unsigned int buf_idx = 0;
+        unsigned int size_4lines_y = stride_y<<2;
+        unsigned int size_2lines_y = stride_y<<1;
+        unsigned int size_2lines_vu = stride_vu<<1;
+        // 2*width*4byte_per_pixel
+        unsigned int size_2lines_bgra = width<<3;
+        // start double-buffered processing: fetch the first block
+        // 4 lines y
+        spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD);
+        // 2 lines v
+        spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
+        // 2 lines u
+        spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
+        // Wait for these transfers to be completed
+        DMA_WAIT_TAG((RETR_BUF + buf_idx));
+        unsigned int i;
+        // Every pass fetches block i+1 and converts block i; the last block
+        // is converted after the loop
+        for(i=0; i<(height>>2)-1; i++) {
+                // Switch to the free input buffer and fetch the next block into it
+                buf_idx^=1;
+                // 4 lines y
+                spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD);
+                // 2 lines v
+                spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
+                // 2 lines u
+                spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
+                DMA_WAIT_TAG((RETR_BUF + buf_idx));
+                // Back to the buffer that holds the current block
+                buf_idx^=1;
+                // The previous store reads from bgra: it has to be finished
+                // before the conversion overwrites that buffer
+                DMA_WAIT_TAG(STR_BUF);
+                // Convert YUV to BGRA, store it back (first two lines)
+                yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
+                // Next two lines
+                yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y,
+                                v_plane[buf_idx] + stride_vu,
+                                u_plane[buf_idx] + stride_vu,
+                                bgra + size_2lines_bgra,
+                                width);
+                // Store converted lines in two steps->max transfer size 16384
+                spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+                ram_addr_bgra += size_2lines_bgra;
+                spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+                ram_addr_bgra += size_2lines_bgra;
+                // Move 4 lines
+                ram_addr_y += size_4lines_y;
+                ram_addr_v += size_2lines_vu;
+                ram_addr_u += size_2lines_vu;
+                // The block fetched at the top of this pass becomes the current one
+                buf_idx^=1;
+        }
+        // Last block: wait for the previous store before bgra is overwritten
+        DMA_WAIT_TAG(STR_BUF);
+        // Convert YUV to BGRA, store it back (first two lines)
+        yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
+        // Next two lines
+        yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y,
+                        v_plane[buf_idx] + stride_vu,
+                        u_plane[buf_idx] + stride_vu,
+                        bgra + size_2lines_bgra,
+                        width);
+        spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+        ram_addr_bgra += size_2lines_bgra;
+        spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+        // wait for the final storing transfer to be completed
+        DMA_WAIT_TAG(STR_BUF);
+}
+/*
+ * yuv_to_rgb_w32()
+ *
+ * Converts the picture described by parms_converter from planar YUV to
+ * BGRA, four picture lines per step, using yuv_to_rgb_w32_line().
+ *
+ * The y/v/u planes are fetched into the local y_plane[], v_plane[] and
+ * u_plane[] buffers (two sets, selected by buf_idx) with GET DMAs tagged
+ * RETR_BUF + buf_idx. Converted pixels are written back from the bgra
+ * buffer with PUT DMAs tagged STR_BUF.
+ *
+ * Every step converts into the same bgra buffer, so the PUTs of the
+ * previous step have to be complete before the next conversion starts
+ * overwriting it; the STR_BUF wait is therefore placed in front of the
+ * conversion rather than in front of the next PUT.
+ *
+ * Pictures with fewer than four lines are not processed at all; lines
+ * beyond the last complete group of four are ignored.
+ */
+void yuv_to_rgb_w32() {
+        // Pixel dimensions of the picture
+        const uint32_t width = parms_converter.src_pixel_width;
+        const uint32_t height = parms_converter.src_pixel_height;
+        // Number of complete 4-line groups. Bail out when there is none, which
+        // also keeps the unsigned loop bound below from wrapping around.
+        const unsigned int groups = height >> 2;
+        if (groups == 0) {
+                return;
+        }
+        // Main memory addresses of the planes, advanced while walking the picture
+        // Y
+        unsigned char* ram_addr_y = parms_converter.y_plane;
+        // V
+        unsigned char* ram_addr_v = parms_converter.v_plane;
+        // U
+        unsigned char* ram_addr_u = parms_converter.u_plane;
+        // BGRA
+        unsigned char* ram_addr_bgra = parms_converter.dstBuffer;
+        // Strides in bytes: a chroma line is half as wide as a luma line
+        unsigned int stride_y = width;
+        unsigned int stride_vu = width>>1;
+        // Transfer sizes
+        unsigned int size_4lines_y = stride_y<<2;
+        unsigned int size_2lines_y = stride_y<<1;
+        unsigned int size_2lines_vu = stride_vu<<1;
+        // 2*width*4byte_per_pixel
+        unsigned int size_2lines_bgra = width<<3;
+        // Buffer set holding the group that is converted next
+        unsigned int buf_idx = 0;
+        // Fetch the first group
+        // 4 lines y
+        spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD);
+        // 2 lines v
+        spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
+        // 2 lines u
+        spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
+        // Wait for these transfers to be completed
+        DMA_WAIT_TAG((RETR_BUF + buf_idx));
+        unsigned int i;
+        // One iteration per group except the last one, which is handled below
+        for(i = 1; i < groups; i++) {
+                // Fetch the following group into the other buffer set
+                const unsigned int next_idx = buf_idx ^ 1;
+                // 4 lines y
+                spu_mfcdma32(y_plane[next_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF + next_idx, MFC_GET_CMD);
+                deprintf("4lines = %d\n", size_4lines_y);
+                // 2 lines v
+                spu_mfcdma32(v_plane[next_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF + next_idx, MFC_GET_CMD);
+                deprintf("2lines = %d\n", size_2lines_vu);
+                // 2 lines u
+                spu_mfcdma32(u_plane[next_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF + next_idx, MFC_GET_CMD);
+                deprintf("2lines = %d\n", size_2lines_vu);
+                // NOTE(review): waiting here serializes the prefetch with the
+                // conversion; kept as in the original ordering
+                DMA_WAIT_TAG((RETR_BUF + next_idx));
+                // bgra is overwritten next: the stores of the previous group
+                // must have left the buffer first
+                DMA_WAIT_TAG(STR_BUF);
+                // Convert YUV to BGRA (first two lines)
+                yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
+                // Next two lines
+                yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y,
+                                v_plane[buf_idx] + stride_vu,
+                                u_plane[buf_idx] + stride_vu,
+                                bgra + size_2lines_bgra,
+                                width);
+                // Store converted lines in two steps->max transfer size 16384
+                spu_mfcdma32(bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+                ram_addr_bgra += size_2lines_bgra;
+                spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+                ram_addr_bgra += size_2lines_bgra;
+                // Move 4 lines
+                ram_addr_y += size_4lines_y;
+                ram_addr_v += size_2lines_vu;
+                ram_addr_u += size_2lines_vu;
+                // The prefetched group becomes the current one
+                buf_idx = next_idx;
+        }
+        // Last group: again make sure the previous stores no longer read bgra
+        DMA_WAIT_TAG(STR_BUF);
+        // Convert YUV to BGRA (first two lines)
+        yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
+        // Next two lines
+        yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y,
+                        v_plane[buf_idx] + stride_vu,
+                        u_plane[buf_idx] + stride_vu,
+                        bgra + size_2lines_bgra,
+                        width);
+        spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+        ram_addr_bgra += size_2lines_bgra;
+        spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+        // Do not return before the final stores have been completed
+        DMA_WAIT_TAG(STR_BUF);
+}
+/*
+ * Constant vectors needed by the vectorized yuv 2 rgb conversion
+ * (yuv_to_rgb_w32_line).
+ */
+/* Bias added to the widened U and V samples */
+const vector float vec_minus_128 = { -128.0f, -128.0f, -128.0f, -128.0f };
+/* All-zero vector; used as first shuffle operand to supply zero bytes */
+const vector unsigned char vec_null = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+/*
+ * Shuffle patterns that widen four consecutive bytes of the second shuffle
+ * operand (indices 0x10..0x1F) to four 32-bit words. The three leading bytes
+ * of every word use index 0x00, i.e. byte 0 of the first operand, which is
+ * zero when that operand is vec_null.
+ * first/second/third/fourth pick bytes 0-3, 4-7, 8-11 and 12-15.
+ */
+const vector unsigned char vec_char2int_first = { 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x13 };
+const vector unsigned char vec_char2int_second = { 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x17 };
+const vector unsigned char vec_char2int_third = { 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x00, 0x00, 0x1B };
+const vector unsigned char vec_char2int_fourth = { 0x00, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, 0x1F };
+/* Conversion coefficients, replicated into all four lanes */
+const vector float vec_R_precalc_coeff = {1.403f, 1.403f, 1.403f, 1.403f};
+const vector float vec_Gu_precalc_coeff = {-0.344f, -0.344f, -0.344f, -0.344f};
+const vector float vec_Gv_precalc_coeff = {-0.714f, -0.714f, -0.714f, -0.714f};
+const vector float vec_B_precalc_coeff = {1.773f, 1.773f, 1.773f, 1.773f};
+/*
+ * 255 in the top byte of every pixel word. The constant is unsigned because
+ * 255 << 24 does not fit into a signed int.
+ */
+const vector unsigned int vec_alpha =  { 255u << 24, 255u << 24, 255u << 24, 255u << 24 };
+/*
+ * Shuffle patterns that duplicate floats of the first operand:
+ * upper yields { f0, f0, f1, f1 }, lower yields { f2, f2, f3, f3 }.
+ */
+const vector unsigned char vec_select_floats_upper = { 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07 };
+const vector unsigned char vec_select_floats_lower = { 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F };
+/*
+ * yuv_to_rgb_w16_line()
+ *
+ * Scalar conversion of two adjacent lines of planar YUV input to BGRA.
+ * The second line of the Y input and of the BGRA output is addressed
+ * width elements after the first one; one line of U and one line of V
+ * (width/2 samples each) is consumed.
+ *
+ * Callers are expected to pass a width that is a multiple of 16; the loop
+ * itself handles two pixels per step and therefore needs an even width.
+ *
+ * @param y_addr address of the y plane in local store
+ * @param v_addr address of the v plane in local store
+ * @param u_addr address of the u plane in local store
+ * @param bgra_addr_ address of the bgra output buffer
+ * @param width the width in pixel
+ */
+void yuv_to_rgb_w16_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width) {
+        // each pixel is stored as an integer
+        unsigned int* bgra_addr = (unsigned int*) bgra_addr_;
+        // 255 in the top byte; unsigned, since 255 << 24 overflows a signed int
+        const unsigned int alpha = 255u << 24;
+        unsigned int x;
+        for(x = 0; x < width; x+=2) {
+                // Walk the line two pixels at a time, since each u and v value
+                // applies to 4 pixels (two high, two wide)
+                const unsigned char Y_1 = *(y_addr + x);
+                const unsigned char Y_2 = *(y_addr + x + 1);
+                const unsigned char Y_3 = *(y_addr + x + width);
+                const unsigned char Y_4 = *(y_addr + x + width + 1);
+                const unsigned char U = *(u_addr + (x >> 1));
+                const unsigned char V = *(v_addr + (x >> 1));
+                float V_minus_128 = (float)((float)V - 128.0f);
+                float U_minus_128 = (float)((float)U - 128.0f);
+                // Chroma-only part of each channel, shared by all four pixels
+                float R_precalculate = 1.403f * V_minus_128;
+                float G_precalculate = -(0.344f * U_minus_128 + 0.714f * V_minus_128);
+                float B_precalculate = 1.773f * U_minus_128;
+                const unsigned char R_1 = float_to_char((Y_1 + R_precalculate));
+                const unsigned char R_2 = float_to_char((Y_2 + R_precalculate));
+                const unsigned char R_3 = float_to_char((Y_3 + R_precalculate));
+                const unsigned char R_4 = float_to_char((Y_4 + R_precalculate));
+                const unsigned char G_1 = float_to_char((Y_1 + G_precalculate));
+                const unsigned char G_2 = float_to_char((Y_2 + G_precalculate));
+                const unsigned char G_3 = float_to_char((Y_3 + G_precalculate));
+                const unsigned char G_4 = float_to_char((Y_4 + G_precalculate));
+                const unsigned char B_1 = float_to_char((Y_1 + B_precalculate));
+                const unsigned char B_2 = float_to_char((Y_2 + B_precalculate));
+                const unsigned char B_3 = float_to_char((Y_3 + B_precalculate));
+                const unsigned char B_4 = float_to_char((Y_4 + B_precalculate));
+                // Pack the channels into one word per pixel using unsigned arithmetic
+                *(bgra_addr + x) = ((unsigned int)B_1 << 0) | ((unsigned int)G_1 << 8) | ((unsigned int)R_1 << 16) | alpha;
+                *(bgra_addr + x + 1) = ((unsigned int)B_2 << 0) | ((unsigned int)G_2 << 8) | ((unsigned int)R_2 << 16) | alpha;
+                *(bgra_addr + x + width) = ((unsigned int)B_3 << 0) | ((unsigned int)G_3 << 8) | ((unsigned int)R_3 << 16) | alpha;
+                *(bgra_addr + x + width + 1) = ((unsigned int)B_4 << 0) | ((unsigned int)G_4 << 8) | ((unsigned int)R_4 << 16) | alpha;
+        }
+}
+/*
+ * yuv_to_rgb_w32_line()
+ *
+ * Vectorized conversion of two adjacent lines of planar YUV input to BGRA.
+ * Every loop iteration handles 32 pixels of both lines, so width has to be
+ * a multiple of 32. The second line of the Y input and of the BGRA output
+ * is addressed width elements after the first one; one line of U and one
+ * line of V (width/2 samples each) is consumed.
+ *
+ * All accesses are whole-vector loads and stores, so the addresses passed
+ * in are assumed to be 16-byte aligned -- TODO confirm against the callers.
+ *
+ * @param y_addr address of the y plane in local store
+ * @param v_addr address of the v plane in local store
+ * @param u_addr address of the u plane in local store
+ * @param bgra_addr_ address of the bgra output buffer
+ * @param width the width in pixel
+ */
+void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width) {
+        // each output pixel is stored as one integer, four of them per vector
+        unsigned int* bgra_addr = (unsigned int*) bgra_addr_;
+        unsigned int x;
+        for(x = 0; x < width; x+=32) {
+                /*
+                 * One iteration converts 32 pixels of each of the two lines:
+                 *  1. Load 2 x 16 Y bytes per line plus 16 U and 16 V bytes;
+                 *     each u and v value applies to 4 pixels (two high, two wide).
+                 *  2. Widen the bytes to floats, four at a time, with the
+                 *     vec_char2int_* shuffles; U and V get -128 added.
+                 *  3. Compute the chroma-only part of R, G and B per chroma
+                 *     sample (R?a/G?a/B?a) and duplicate every value for the two
+                 *     horizontally adjacent pixels (vec_select_floats_upper/lower).
+                 *  4. Add the luma values and convert with vfloat_to_vuint().
+                 *     Indices 1..8 belong to the first line, 9..16 to the second
+                 *     line, which reuses the chroma terms of the first.
+                 *  5. Combine alpha, B, G shifted by one byte and R shifted by two
+                 *     bytes, and store four pixels per vector. The shifts act on
+                 *     the whole quadword, so this relies on vfloat_to_vuint()
+                 *     yielding values that fit into one byte -- presumably it
+                 *     clamps to 0..255, TODO confirm.
+                 */
+                const vector unsigned char vchar_Y_1 = *((vector unsigned char*)(y_addr + x));
+                const vector unsigned char vchar_Y_2 = *((vector unsigned char*)(y_addr + x + 16));
+                const vector unsigned char vchar_Y_3 = *((vector unsigned char*)(y_addr + x + width));
+                const vector unsigned char vchar_Y_4 = *((vector unsigned char*)(y_addr + x + width + 16));
+                const vector unsigned char vchar_U = *((vector unsigned char*)(u_addr + (x >> 1)));
+                const vector unsigned char vchar_V = *((vector unsigned char*)(v_addr + (x >> 1)));
+                const vector float vfloat_U_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_first), 0),vec_minus_128);
+                const vector float vfloat_U_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_second), 0),vec_minus_128);
+                const vector float vfloat_U_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_third), 0),vec_minus_128);
+                const vector float vfloat_U_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_fourth), 0),vec_minus_128);
+                const vector float vfloat_V_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_first), 0),vec_minus_128);
+                const vector float vfloat_V_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_second), 0),vec_minus_128);
+                const vector float vfloat_V_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_third), 0),vec_minus_128);
+                const vector float vfloat_V_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_fourth), 0),vec_minus_128);
+                vector float Y_1 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_first), 0);
+                vector float Y_2 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_second), 0);
+                vector float Y_3 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_third), 0);
+                vector float Y_4 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_fourth), 0);
+                vector float Y_5 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_first), 0);
+                vector float Y_6 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_second), 0);
+                vector float Y_7 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_third), 0);
+                vector float Y_8 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_fourth), 0);
+                vector float Y_9 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_first), 0);
+                vector float Y_10 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_second), 0);
+                vector float Y_11 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_third), 0);
+                vector float Y_12 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_fourth), 0);
+                vector float Y_13 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_first), 0);
+                vector float Y_14 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_second), 0);
+                vector float Y_15 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_third), 0);
+                vector float Y_16 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_fourth), 0);
+                const vector float R1a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_1);
+                const vector float R2a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_2);
+                const vector float R3a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_3);
+                const vector float R4a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_4);
+                const vector float R1_precalculate = spu_shuffle(R1a_precalculate,  R1a_precalculate, vec_select_floats_upper);
+                const vector float R2_precalculate = spu_shuffle(R1a_precalculate,  R1a_precalculate, vec_select_floats_lower);
+                const vector float R3_precalculate = spu_shuffle(R2a_precalculate,  R2a_precalculate, vec_select_floats_upper);
+                const vector float R4_precalculate = spu_shuffle(R2a_precalculate,  R2a_precalculate, vec_select_floats_lower);
+                const vector float R5_precalculate = spu_shuffle(R3a_precalculate,  R3a_precalculate, vec_select_floats_upper);
+                const vector float R6_precalculate = spu_shuffle(R3a_precalculate,  R3a_precalculate, vec_select_floats_lower);
+                const vector float R7_precalculate = spu_shuffle(R4a_precalculate,  R4a_precalculate, vec_select_floats_upper);
+                const vector float R8_precalculate = spu_shuffle(R4a_precalculate,  R4a_precalculate, vec_select_floats_lower);
+                const vector float G1a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_1, spu_mul(vfloat_V_1, vec_Gv_precalc_coeff));
+                const vector float G2a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_2, spu_mul(vfloat_V_2, vec_Gv_precalc_coeff));
+                const vector float G3a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_3, spu_mul(vfloat_V_3, vec_Gv_precalc_coeff));
+                const vector float G4a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_4, spu_mul(vfloat_V_4, vec_Gv_precalc_coeff));
+                const vector float G1_precalculate = spu_shuffle(G1a_precalculate,  G1a_precalculate, vec_select_floats_upper);
+                const vector float G2_precalculate = spu_shuffle(G1a_precalculate,  G1a_precalculate, vec_select_floats_lower);
+                const vector float G3_precalculate = spu_shuffle(G2a_precalculate,  G2a_precalculate, vec_select_floats_upper);
+                const vector float G4_precalculate = spu_shuffle(G2a_precalculate,  G2a_precalculate, vec_select_floats_lower);
+                const vector float G5_precalculate = spu_shuffle(G3a_precalculate,  G3a_precalculate, vec_select_floats_upper);
+                const vector float G6_precalculate = spu_shuffle(G3a_precalculate,  G3a_precalculate, vec_select_floats_lower);
+                const vector float G7_precalculate = spu_shuffle(G4a_precalculate,  G4a_precalculate, vec_select_floats_upper);
+                const vector float G8_precalculate = spu_shuffle(G4a_precalculate,  G4a_precalculate, vec_select_floats_lower);
+                const vector float B1a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_1);
+                const vector float B2a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_2);
+                const vector float B3a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_3);
+                const vector float B4a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_4);
+                const vector float B1_precalculate = spu_shuffle(B1a_precalculate,  B1a_precalculate, vec_select_floats_upper);
+                const vector float B2_precalculate = spu_shuffle(B1a_precalculate,  B1a_precalculate, vec_select_floats_lower);
+                const vector float B3_precalculate = spu_shuffle(B2a_precalculate,  B2a_precalculate, vec_select_floats_upper);
+                const vector float B4_precalculate = spu_shuffle(B2a_precalculate,  B2a_precalculate, vec_select_floats_lower);
+                const vector float B5_precalculate = spu_shuffle(B3a_precalculate,  B3a_precalculate, vec_select_floats_upper);
+                const vector float B6_precalculate = spu_shuffle(B3a_precalculate,  B3a_precalculate, vec_select_floats_lower);
+                const vector float B7_precalculate = spu_shuffle(B4a_precalculate,  B4a_precalculate, vec_select_floats_upper);
+                const vector float B8_precalculate = spu_shuffle(B4a_precalculate,  B4a_precalculate, vec_select_floats_lower);
+                const vector unsigned int  R_1 = vfloat_to_vuint(spu_add( Y_1, R1_precalculate));
+                const vector unsigned int  R_2 = vfloat_to_vuint(spu_add( Y_2, R2_precalculate));
+                const vector unsigned int  R_3 = vfloat_to_vuint(spu_add( Y_3, R3_precalculate));
+                const vector unsigned int  R_4 = vfloat_to_vuint(spu_add( Y_4, R4_precalculate));
+                const vector unsigned int  R_5 = vfloat_to_vuint(spu_add( Y_5, R5_precalculate));
+                const vector unsigned int  R_6 = vfloat_to_vuint(spu_add( Y_6, R6_precalculate));
+                const vector unsigned int  R_7 = vfloat_to_vuint(spu_add( Y_7, R7_precalculate));
+                const vector unsigned int  R_8 = vfloat_to_vuint(spu_add( Y_8, R8_precalculate));
+                const vector unsigned int  R_9 = vfloat_to_vuint(spu_add( Y_9, R1_precalculate));
+                const vector unsigned int R_10 = vfloat_to_vuint(spu_add(Y_10, R2_precalculate));
+                const vector unsigned int R_11 = vfloat_to_vuint(spu_add(Y_11, R3_precalculate));
+                const vector unsigned int R_12 = vfloat_to_vuint(spu_add(Y_12, R4_precalculate));
+                const vector unsigned int R_13 = vfloat_to_vuint(spu_add(Y_13, R5_precalculate));
+                const vector unsigned int R_14 = vfloat_to_vuint(spu_add(Y_14, R6_precalculate));
+                const vector unsigned int R_15 = vfloat_to_vuint(spu_add(Y_15, R7_precalculate));
+                const vector unsigned int R_16 = vfloat_to_vuint(spu_add(Y_16, R8_precalculate));
+                const vector unsigned int  G_1 = vfloat_to_vuint(spu_add( Y_1, G1_precalculate));
+                const vector unsigned int  G_2 = vfloat_to_vuint(spu_add( Y_2, G2_precalculate));
+                const vector unsigned int  G_3 = vfloat_to_vuint(spu_add( Y_3, G3_precalculate));
+                const vector unsigned int  G_4 = vfloat_to_vuint(spu_add( Y_4, G4_precalculate));
+                const vector unsigned int  G_5 = vfloat_to_vuint(spu_add( Y_5, G5_precalculate));
+                const vector unsigned int  G_6 = vfloat_to_vuint(spu_add( Y_6, G6_precalculate));
+                const vector unsigned int  G_7 = vfloat_to_vuint(spu_add( Y_7, G7_precalculate));
+                const vector unsigned int  G_8 = vfloat_to_vuint(spu_add( Y_8, G8_precalculate));
+                const vector unsigned int  G_9 = vfloat_to_vuint(spu_add( Y_9, G1_precalculate));
+                const vector unsigned int G_10 = vfloat_to_vuint(spu_add(Y_10, G2_precalculate));
+                const vector unsigned int G_11 = vfloat_to_vuint(spu_add(Y_11, G3_precalculate));
+                const vector unsigned int G_12 = vfloat_to_vuint(spu_add(Y_12, G4_precalculate));
+                const vector unsigned int G_13 = vfloat_to_vuint(spu_add(Y_13, G5_precalculate));
+                const vector unsigned int G_14 = vfloat_to_vuint(spu_add(Y_14, G6_precalculate));
+                const vector unsigned int G_15 = vfloat_to_vuint(spu_add(Y_15, G7_precalculate));
+                const vector unsigned int G_16 = vfloat_to_vuint(spu_add(Y_16, G8_precalculate));
+                const vector unsigned int  B_1 = vfloat_to_vuint(spu_add( Y_1, B1_precalculate));
+                const vector unsigned int  B_2 = vfloat_to_vuint(spu_add( Y_2, B2_precalculate));
+                const vector unsigned int  B_3 = vfloat_to_vuint(spu_add( Y_3, B3_precalculate));
+                const vector unsigned int  B_4 = vfloat_to_vuint(spu_add( Y_4, B4_precalculate));
+                const vector unsigned int  B_5 = vfloat_to_vuint(spu_add( Y_5, B5_precalculate));
+                const vector unsigned int  B_6 = vfloat_to_vuint(spu_add( Y_6, B6_precalculate));
+                const vector unsigned int  B_7 = vfloat_to_vuint(spu_add( Y_7, B7_precalculate));
+                const vector unsigned int  B_8 = vfloat_to_vuint(spu_add( Y_8, B8_precalculate));
+                const vector unsigned int  B_9 = vfloat_to_vuint(spu_add( Y_9, B1_precalculate));
+                const vector unsigned int B_10 = vfloat_to_vuint(spu_add(Y_10, B2_precalculate));
+                const vector unsigned int B_11 = vfloat_to_vuint(spu_add(Y_11, B3_precalculate));
+                const vector unsigned int B_12 = vfloat_to_vuint(spu_add(Y_12, B4_precalculate));
+                const vector unsigned int B_13 = vfloat_to_vuint(spu_add(Y_13, B5_precalculate));
+                const vector unsigned int B_14 = vfloat_to_vuint(spu_add(Y_14, B6_precalculate));
+                const vector unsigned int B_15 = vfloat_to_vuint(spu_add(Y_15, B7_precalculate));
+                const vector unsigned int B_16 = vfloat_to_vuint(spu_add(Y_16, B8_precalculate));
+                *((vector unsigned int*)(bgra_addr + x)) = spu_or(spu_or(vec_alpha,  B_1), spu_or(spu_slqwbyte( R_1, 2),spu_slqwbyte(G_1, 1)));
+                *((vector unsigned int*)(bgra_addr + x + 4)) = spu_or(spu_or(vec_alpha,  B_2), spu_or(spu_slqwbyte( R_2, 2),spu_slqwbyte(G_2, 1)));
+                *((vector unsigned int*)(bgra_addr + x + 8)) = spu_or(spu_or(vec_alpha,  B_3), spu_or(spu_slqwbyte( R_3, 2),spu_slqwbyte(G_3, 1)));
+                *((vector unsigned int*)(bgra_addr + x + 12)) = spu_or(spu_or(vec_alpha,  B_4), spu_or(spu_slqwbyte( R_4, 2),spu_slqwbyte(G_4, 1)));
+                *((vector unsigned int*)(bgra_addr + x + 16)) = spu_or(spu_or(vec_alpha,  B_5), spu_or(spu_slqwbyte( R_5, 2),spu_slqwbyte(G_5, 1)));
+                *((vector unsigned int*)(bgra_addr + x + 20)) = spu_or(spu_or(vec_alpha,  B_6), spu_or(spu_slqwbyte( R_6, 2),spu_slqwbyte(G_6, 1)));
+                *((vector unsigned int*)(bgra_addr + x + 24)) = spu_or(spu_or(vec_alpha,  B_7), spu_or(spu_slqwbyte( R_7, 2),spu_slqwbyte(G_7, 1)));
+                *((vector unsigned int*)(bgra_addr + x + 28)) = spu_or(spu_or(vec_alpha,  B_8), spu_or(spu_slqwbyte( R_8, 2),spu_slqwbyte(G_8, 1)));
+                *((vector unsigned int*)(bgra_addr + x + width)) = spu_or(spu_or(vec_alpha,  B_9), spu_or(spu_slqwbyte( R_9, 2),spu_slqwbyte(G_9, 1)));
+                *((vector unsigned int*)(bgra_addr + x + width + 4)) = spu_or(spu_or(vec_alpha, B_10), spu_or(spu_slqwbyte(R_10, 2),spu_slqwbyte(G_10, 1)));
+                *((vector unsigned int*)(bgra_addr + x + width + 8)) = spu_or(spu_or(vec_alpha, B_11), spu_or(spu_slqwbyte(R_11, 2),spu_slqwbyte(G_11, 1)));
+                *((vector unsigned int*)(bgra_addr + x + width + 12)) = spu_or(spu_or(vec_alpha, B_12), spu_or(spu_slqwbyte(R_12, 2),spu_slqwbyte(G_12, 1)));
+                *((vector unsigned int*)(bgra_addr + x + width + 16)) = spu_or(spu_or(vec_alpha, B_13), spu_or(spu_slqwbyte(R_13, 2),spu_slqwbyte(G_13, 1)));
+                *((vector unsigned int*)(bgra_addr + x + width + 20)) = spu_or(spu_or(vec_alpha, B_14), spu_or(spu_slqwbyte(R_14, 2),spu_slqwbyte(G_14, 1)));
+                *((vector unsigned int*)(bgra_addr + x + width + 24)) = spu_or(spu_or(vec_alpha, B_15), spu_or(spu_slqwbyte(R_15, 2),spu_slqwbyte(G_15, 1)));
+                *((vector unsigned int*)(bgra_addr + x + width + 28)) = spu_or(spu_or(vec_alpha, B_16), spu_or(spu_slqwbyte(R_16, 2),spu_slqwbyte(G_16, 1)));
+        }
+}