-rw-r--r--  apps/codecs/libwmavoice/Makefile  |    1 -
-rw-r--r--  apps/codecs/libwmavoice/dsputil.c | 4535 -----
-rw-r--r--  apps/codecs/libwmavoice/dsputil.h |  800 -----
3 files changed, 0 insertions, 5336 deletions
diff --git a/apps/codecs/libwmavoice/Makefile b/apps/codecs/libwmavoice/Makefile
index f1d987f40c..2bd7b94f9b 100644
--- a/apps/codecs/libwmavoice/Makefile
+++ b/apps/codecs/libwmavoice/Makefile
@@ -31,7 +31,6 @@ get_bits.h\
 wmavoice_data.h\
 avcodec.h\
 fft.h\
-dsputil.h\
 acelp_filters.h\
 celp_filters.h\
 put_bits.h\
diff --git a/apps/codecs/libwmavoice/dsputil.c b/apps/codecs/libwmavoice/dsputil.c
deleted file mode 100644
index 534f03f885..0000000000
--- a/apps/codecs/libwmavoice/dsputil.c
+++ /dev/null
@@ -1,4535 +0,0 @@
1/*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7 *
8 * This file is part of FFmpeg.
9 *
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25/**
26 * @file
27 * DSP utils
28 */
29
30#include "avcodec.h"
31#include "dsputil.h"
32#include "simple_idct.h"
33#include "faandct.h"
34#include "faanidct.h"
35#include "mathops.h"
36#include "mpegvideo.h"
37#include "config.h"
38#include "lpc.h"
39#include "ac3dec.h"
40#include "vorbis.h"
41#include "png.h"
42#include "vp8dsp.h"
43
44uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
45uint32_t ff_squareTbl[512] = {0, };
46
47// 0x7f7f7f7f on a 32-bit CPU or 0x7f7f7f7f7f7f7f7f on a 64-bit one, depending on the native word size
48#define pb_7f (~0UL/255 * 0x7f)
49#define pb_80 (~0UL/255 * 0x80)
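/* ~0UL/255 evaluates to 0x0101...01 with one 0x01 per byte of the native
 * word, so multiplying by 0x7f or 0x80 broadcasts that byte into every
 * lane: pb_7f is 0x7f7f7f7f on a 32-bit target, 0x7f7f7f7f7f7f7f7f on 64-bit. */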
50
51const uint8_t ff_zigzag_direct[64] = {
52 0, 1, 8, 16, 9, 2, 3, 10,
53 17, 24, 32, 25, 18, 11, 4, 5,
54 12, 19, 26, 33, 40, 48, 41, 34,
55 27, 20, 13, 6, 7, 14, 21, 28,
56 35, 42, 49, 56, 57, 50, 43, 36,
57 29, 22, 15, 23, 30, 37, 44, 51,
58 58, 59, 52, 45, 38, 31, 39, 46,
59 53, 60, 61, 54, 47, 55, 62, 63
60};
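/* Entry i of the scan table is the raster position (row*8 + column) of the
 * i-th coefficient in zigzag order, walking the 8x8 block from DC towards
 * the highest frequencies: 0 -> (0,0), 1 -> (0,1), 8 -> (1,0), ... */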
61
62/* Specific zigzag scan for 248 idct. NOTE that unlike the
63 specification, we interleave the fields */
64const uint8_t ff_zigzag248_direct[64] = {
65 0, 8, 1, 9, 16, 24, 2, 10,
66 17, 25, 32, 40, 48, 56, 33, 41,
67 18, 26, 3, 11, 4, 12, 19, 27,
68 34, 42, 49, 57, 50, 58, 35, 43,
69 20, 28, 5, 13, 6, 14, 21, 29,
70 36, 44, 51, 59, 52, 60, 37, 45,
71 22, 30, 7, 15, 23, 31, 38, 46,
72 53, 61, 54, 62, 39, 47, 55, 63,
73};
74
75/* non-permuted inverse of zigzag_direct, +1, for the MMX quantizer */
76DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
77
78const uint8_t ff_alternate_horizontal_scan[64] = {
79 0, 1, 2, 3, 8, 9, 16, 17,
80 10, 11, 4, 5, 6, 7, 15, 14,
81 13, 12, 19, 18, 24, 25, 32, 33,
82 26, 27, 20, 21, 22, 23, 28, 29,
83 30, 31, 34, 35, 40, 41, 48, 49,
84 42, 43, 36, 37, 38, 39, 44, 45,
85 46, 47, 50, 51, 56, 57, 58, 59,
86 52, 53, 54, 55, 60, 61, 62, 63,
87};
88
89const uint8_t ff_alternate_vertical_scan[64] = {
90 0, 8, 16, 24, 1, 9, 2, 10,
91 17, 25, 32, 40, 48, 56, 57, 49,
92 41, 33, 26, 18, 3, 11, 4, 12,
93 19, 27, 34, 42, 50, 58, 35, 43,
94 51, 59, 20, 28, 5, 13, 6, 14,
95 21, 29, 36, 44, 52, 60, 37, 45,
96 53, 61, 22, 30, 7, 15, 23, 31,
97 38, 46, 54, 62, 39, 47, 55, 63,
98};
99
100/* Input permutation for the simple_idct_mmx */
101static const uint8_t simple_mmx_permutation[64]={
102 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
103 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
104 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
105 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
106 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
107 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
108 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
109 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
110};
111
112static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
113
114void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
115 int i;
116 int end;
117
118 st->scantable= src_scantable;
119
120 for(i=0; i<64; i++){
121 int j;
122 j = src_scantable[i];
123 st->permutated[i] = permutation[j];
124#if ARCH_PPC
125 st->inverse[j] = i;
126#endif
127 }
128
129 end=-1;
130 for(i=0; i<64; i++){
131 int j;
132 j = st->permutated[i];
133 if(j>end) end=j;
134 st->raster_end[i]= end;
135 }
136}
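/* raster_end[] is, by construction, the running maximum of the permuted
 * positions: if decoding stops at scan index i, raster_end[i] is the last
 * raster-order position that can hold a nonzero coefficient, which lets
 * the IDCT skip the rows beyond it. */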
137
138static int pix_sum_c(uint8_t * pix, int line_size)
139{
140 int s, i, j;
141
142 s = 0;
143 for (i = 0; i < 16; i++) {
144 for (j = 0; j < 16; j += 8) {
145 s += pix[0];
146 s += pix[1];
147 s += pix[2];
148 s += pix[3];
149 s += pix[4];
150 s += pix[5];
151 s += pix[6];
152 s += pix[7];
153 pix += 8;
154 }
155 pix += line_size - 16;
156 }
157 return s;
158}
159
160static int pix_norm1_c(uint8_t * pix, int line_size)
161{
162 int s, i, j;
163 uint32_t *sq = ff_squareTbl + 256;
164
165 s = 0;
166 for (i = 0; i < 16; i++) {
167 for (j = 0; j < 16; j += 8) {
168#if 0
169 s += sq[pix[0]];
170 s += sq[pix[1]];
171 s += sq[pix[2]];
172 s += sq[pix[3]];
173 s += sq[pix[4]];
174 s += sq[pix[5]];
175 s += sq[pix[6]];
176 s += sq[pix[7]];
177#else
178#if LONG_MAX > 2147483647
179 register uint64_t x=*(uint64_t*)pix;
180 s += sq[x&0xff];
181 s += sq[(x>>8)&0xff];
182 s += sq[(x>>16)&0xff];
183 s += sq[(x>>24)&0xff];
184 s += sq[(x>>32)&0xff];
185 s += sq[(x>>40)&0xff];
186 s += sq[(x>>48)&0xff];
187 s += sq[(x>>56)&0xff];
188#else
189 register uint32_t x=*(uint32_t*)pix;
190 s += sq[x&0xff];
191 s += sq[(x>>8)&0xff];
192 s += sq[(x>>16)&0xff];
193 s += sq[(x>>24)&0xff];
194 x=*(uint32_t*)(pix+4);
195 s += sq[x&0xff];
196 s += sq[(x>>8)&0xff];
197 s += sq[(x>>16)&0xff];
198 s += sq[(x>>24)&0xff];
199#endif
200#endif
201 pix += 8;
202 }
203 pix += line_size - 16;
204 }
205 return s;
206}
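/* ff_squareTbl is filled at init time so that ff_squareTbl[256 + d] == d*d;
 * biasing the pointer by 256 (sq = ff_squareTbl + 256) makes sq[d] valid for
 * any byte difference d in [-255, 255], which the sse*_c functions below
 * rely on. */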
207
208static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
209 int i;
210
211 for(i=0; i+8<=w; i+=8){
212 dst[i+0]= av_bswap32(src[i+0]);
213 dst[i+1]= av_bswap32(src[i+1]);
214 dst[i+2]= av_bswap32(src[i+2]);
215 dst[i+3]= av_bswap32(src[i+3]);
216 dst[i+4]= av_bswap32(src[i+4]);
217 dst[i+5]= av_bswap32(src[i+5]);
218 dst[i+6]= av_bswap32(src[i+6]);
219 dst[i+7]= av_bswap32(src[i+7]);
220 }
221 for(;i<w; i++){
222 dst[i+0]= av_bswap32(src[i+0]);
223 }
224}
225
226static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
227{
228 int s, i;
229 uint32_t *sq = ff_squareTbl + 256;
230
231 s = 0;
232 for (i = 0; i < h; i++) {
233 s += sq[pix1[0] - pix2[0]];
234 s += sq[pix1[1] - pix2[1]];
235 s += sq[pix1[2] - pix2[2]];
236 s += sq[pix1[3] - pix2[3]];
237 pix1 += line_size;
238 pix2 += line_size;
239 }
240 return s;
241}
242
243static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
244{
245 int s, i;
246 uint32_t *sq = ff_squareTbl + 256;
247
248 s = 0;
249 for (i = 0; i < h; i++) {
250 s += sq[pix1[0] - pix2[0]];
251 s += sq[pix1[1] - pix2[1]];
252 s += sq[pix1[2] - pix2[2]];
253 s += sq[pix1[3] - pix2[3]];
254 s += sq[pix1[4] - pix2[4]];
255 s += sq[pix1[5] - pix2[5]];
256 s += sq[pix1[6] - pix2[6]];
257 s += sq[pix1[7] - pix2[7]];
258 pix1 += line_size;
259 pix2 += line_size;
260 }
261 return s;
262}
263
264static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
265{
266 int s, i;
267 uint32_t *sq = ff_squareTbl + 256;
268
269 s = 0;
270 for (i = 0; i < h; i++) {
271 s += sq[pix1[ 0] - pix2[ 0]];
272 s += sq[pix1[ 1] - pix2[ 1]];
273 s += sq[pix1[ 2] - pix2[ 2]];
274 s += sq[pix1[ 3] - pix2[ 3]];
275 s += sq[pix1[ 4] - pix2[ 4]];
276 s += sq[pix1[ 5] - pix2[ 5]];
277 s += sq[pix1[ 6] - pix2[ 6]];
278 s += sq[pix1[ 7] - pix2[ 7]];
279 s += sq[pix1[ 8] - pix2[ 8]];
280 s += sq[pix1[ 9] - pix2[ 9]];
281 s += sq[pix1[10] - pix2[10]];
282 s += sq[pix1[11] - pix2[11]];
283 s += sq[pix1[12] - pix2[12]];
284 s += sq[pix1[13] - pix2[13]];
285 s += sq[pix1[14] - pix2[14]];
286 s += sq[pix1[15] - pix2[15]];
287
288 pix1 += line_size;
289 pix2 += line_size;
290 }
291 return s;
292}
293
294/* draw the edges of width 'w' of an image of size width, height */
295//FIXME check that this is ok for mpeg4 interlaced
296static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
297{
298 uint8_t *ptr, *last_line;
299 int i;
300
301 last_line = buf + (height - 1) * wrap;
302 for(i=0;i<w;i++) {
303 /* top and bottom */
304 memcpy(buf - (i + 1) * wrap, buf, width);
305 memcpy(last_line + (i + 1) * wrap, last_line, width);
306 }
307 /* left and right */
308 ptr = buf;
309 for(i=0;i<height;i++) {
310 memset(ptr - w, ptr[0], w);
311 memset(ptr + width, ptr[width-1], w);
312 ptr += wrap;
313 }
314 /* corners */
315 for(i=0;i<w;i++) {
316 memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
317 memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
318 memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
319 memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
320 }
321}
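/* Replicating the border like this lets motion compensation with
 * unrestricted motion vectors read slightly outside the picture without
 * having to clip every single access. */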
322
323/**
324 * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
325 * @param buf destination buffer
326 * @param src source buffer
327 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
328 * @param block_w width of block
329 * @param block_h height of block
330 * @param src_x x coordinate of the top left sample of the block in the source buffer
331 * @param src_y y coordinate of the top left sample of the block in the source buffer
332 * @param w width of the source buffer
333 * @param h height of the source buffer
334 */
335void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
336 int src_x, int src_y, int w, int h){
337 int x, y;
338 int start_y, start_x, end_y, end_x;
339
340 if(src_y>= h){
341 src+= (h-1-src_y)*linesize;
342 src_y=h-1;
343 }else if(src_y<=-block_h){
344 src+= (1-block_h-src_y)*linesize;
345 src_y=1-block_h;
346 }
347 if(src_x>= w){
348 src+= (w-1-src_x);
349 src_x=w-1;
350 }else if(src_x<=-block_w){
351 src+= (1-block_w-src_x);
352 src_x=1-block_w;
353 }
354
355 start_y= FFMAX(0, -src_y);
356 start_x= FFMAX(0, -src_x);
357 end_y= FFMIN(block_h, h-src_y);
358 end_x= FFMIN(block_w, w-src_x);
359
360 // copy existing part
361 for(y=start_y; y<end_y; y++){
362 for(x=start_x; x<end_x; x++){
363 buf[x + y*linesize]= src[x + y*linesize];
364 }
365 }
366
367 //top
368 for(y=0; y<start_y; y++){
369 for(x=start_x; x<end_x; x++){
370 buf[x + y*linesize]= buf[x + start_y*linesize];
371 }
372 }
373
374 //bottom
375 for(y=end_y; y<block_h; y++){
376 for(x=start_x; x<end_x; x++){
377 buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
378 }
379 }
380
381 for(y=0; y<block_h; y++){
382 //left
383 for(x=0; x<start_x; x++){
384 buf[x + y*linesize]= buf[start_x + y*linesize];
385 }
386
387 //right
388 for(x=end_x; x<block_w; x++){
389 buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
390 }
391 }
392}
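/* Typical use (names here are illustrative, not from this file): when a
 * motion vector points partly outside the frame, the caller copies the
 * block into a scratch buffer with replicated borders and predicts from it:
 *
 *     ff_emulated_edge_mc(edge_buf, src + src_y*linesize + src_x, linesize,
 *                         9, 9, src_x, src_y, h_edge_pos, v_edge_pos);
 *     src = edge_buf;
 */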
393
394static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
395{
396 int i;
397
398 /* read the pixels */
399 for(i=0;i<8;i++) {
400 block[0] = pixels[0];
401 block[1] = pixels[1];
402 block[2] = pixels[2];
403 block[3] = pixels[3];
404 block[4] = pixels[4];
405 block[5] = pixels[5];
406 block[6] = pixels[6];
407 block[7] = pixels[7];
408 pixels += line_size;
409 block += 8;
410 }
411}
412
413static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
414 const uint8_t *s2, int stride){
415 int i;
416
417 /* read the pixels */
418 for(i=0;i<8;i++) {
419 block[0] = s1[0] - s2[0];
420 block[1] = s1[1] - s2[1];
421 block[2] = s1[2] - s2[2];
422 block[3] = s1[3] - s2[3];
423 block[4] = s1[4] - s2[4];
424 block[5] = s1[5] - s2[5];
425 block[6] = s1[6] - s2[6];
426 block[7] = s1[7] - s2[7];
427 s1 += stride;
428 s2 += stride;
429 block += 8;
430 }
431}
432
433
434static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
435 int line_size)
436{
437 int i;
438 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
439
440 /* read the pixels */
441 for(i=0;i<8;i++) {
442 pixels[0] = cm[block[0]];
443 pixels[1] = cm[block[1]];
444 pixels[2] = cm[block[2]];
445 pixels[3] = cm[block[3]];
446 pixels[4] = cm[block[4]];
447 pixels[5] = cm[block[5]];
448 pixels[6] = cm[block[6]];
449 pixels[7] = cm[block[7]];
450
451 pixels += line_size;
452 block += 8;
453 }
454}
455
456static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
457 int line_size)
458{
459 int i;
460 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
461
462 /* read the pixels */
463 for(i=0;i<4;i++) {
464 pixels[0] = cm[block[0]];
465 pixels[1] = cm[block[1]];
466 pixels[2] = cm[block[2]];
467 pixels[3] = cm[block[3]];
468
469 pixels += line_size;
470 block += 8;
471 }
472}
473
474static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
475 int line_size)
476{
477 int i;
478 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
479
480 /* read the pixels */
481 for(i=0;i<2;i++) {
482 pixels[0] = cm[block[0]];
483 pixels[1] = cm[block[1]];
484
485 pixels += line_size;
486 block += 8;
487 }
488}
489
490static void put_signed_pixels_clamped_c(const DCTELEM *block,
491 uint8_t *restrict pixels,
492 int line_size)
493{
494 int i, j;
495
496 for (i = 0; i < 8; i++) {
497 for (j = 0; j < 8; j++) {
498 if (*block < -128)
499 *pixels = 0;
500 else if (*block > 127)
501 *pixels = 255;
502 else
503 *pixels = (uint8_t)(*block + 128);
504 block++;
505 pixels++;
506 }
507 pixels += (line_size - 8);
508 }
509}
510
511static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
512 int line_size)
513{
514 int i;
515
516 /* read the pixels */
517 for(i=0;i<8;i++) {
518 pixels[0] = block[0];
519 pixels[1] = block[1];
520 pixels[2] = block[2];
521 pixels[3] = block[3];
522 pixels[4] = block[4];
523 pixels[5] = block[5];
524 pixels[6] = block[6];
525 pixels[7] = block[7];
526
527 pixels += line_size;
528 block += 8;
529 }
530}
531
532static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
533 int line_size)
534{
535 int i;
536 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
537
538 /* read the pixels */
539 for(i=0;i<8;i++) {
540 pixels[0] = cm[pixels[0] + block[0]];
541 pixels[1] = cm[pixels[1] + block[1]];
542 pixels[2] = cm[pixels[2] + block[2]];
543 pixels[3] = cm[pixels[3] + block[3]];
544 pixels[4] = cm[pixels[4] + block[4]];
545 pixels[5] = cm[pixels[5] + block[5]];
546 pixels[6] = cm[pixels[6] + block[6]];
547 pixels[7] = cm[pixels[7] + block[7]];
548 pixels += line_size;
549 block += 8;
550 }
551}
552
553static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
554 int line_size)
555{
556 int i;
557 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
558
559 /* read the pixels */
560 for(i=0;i<4;i++) {
561 pixels[0] = cm[pixels[0] + block[0]];
562 pixels[1] = cm[pixels[1] + block[1]];
563 pixels[2] = cm[pixels[2] + block[2]];
564 pixels[3] = cm[pixels[3] + block[3]];
565 pixels += line_size;
566 block += 8;
567 }
568}
569
570static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
571 int line_size)
572{
573 int i;
574 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
575
576 /* read the pixels */
577 for(i=0;i<2;i++) {
578 pixels[0] = cm[pixels[0] + block[0]];
579 pixels[1] = cm[pixels[1] + block[1]];
580 pixels += line_size;
581 block += 8;
582 }
583}
584
585static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
586{
587 int i;
588 for(i=0;i<8;i++) {
589 pixels[0] += block[0];
590 pixels[1] += block[1];
591 pixels[2] += block[2];
592 pixels[3] += block[3];
593 pixels[4] += block[4];
594 pixels[5] += block[5];
595 pixels[6] += block[6];
596 pixels[7] += block[7];
597 pixels += line_size;
598 block += 8;
599 }
600}
601
602static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
603{
604 int i;
605 for(i=0;i<4;i++) {
606 pixels[0] += block[0];
607 pixels[1] += block[1];
608 pixels[2] += block[2];
609 pixels[3] += block[3];
610 pixels += line_size;
611 block += 4;
612 }
613}
614
615static int sum_abs_dctelem_c(DCTELEM *block)
616{
617 int sum=0, i;
618 for(i=0; i<64; i++)
619 sum+= FFABS(block[i]);
620 return sum;
621}
622
623static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
624{
625 int i;
626
627 for (i = 0; i < h; i++) {
628 memset(block, value, 16);
629 block += line_size;
630 }
631}
632
633static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
634{
635 int i;
636
637 for (i = 0; i < h; i++) {
638 memset(block, value, 8);
639 block += line_size;
640 }
641}
642
643static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
644{
645 int i, j;
646 uint16_t *dst1 = (uint16_t *) dst;
647 uint16_t *dst2 = (uint16_t *)(dst + linesize);
648
649 for (j = 0; j < 8; j++) {
650 for (i = 0; i < 8; i++) {
651 dst1[i] = dst2[i] = src[i] * 0x0101;
652 }
653 src += 8;
654 dst1 += linesize;
655 dst2 += linesize;
656 }
657}
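/* Multiplying a byte by 0x0101 duplicates it into both bytes of a uint16_t,
 * so each source pixel becomes a 2x2 block: dst1/dst2 cover the two output
 * lines, and since they are uint16_t pointers, "+= linesize" advances them
 * by 2*linesize bytes, i.e. to the next output line pair. The net effect is
 * a 2x upscale of the 8x8 source block. */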
658
659#if 0
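/* The block below is a disabled variant of the pixel ops that works on
 * 64-bit words; the active #else branch further down implements the same
 * operations on 32-bit words. */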
660
661#define PIXOP2(OPNAME, OP) \
662static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
663{\
664 int i;\
665 for(i=0; i<h; i++){\
666 OP(*((uint64_t*)block), AV_RN64(pixels));\
667 pixels+=line_size;\
668 block +=line_size;\
669 }\
670}\
671\
672static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
673{\
674 int i;\
675 for(i=0; i<h; i++){\
676 const uint64_t a= AV_RN64(pixels );\
677 const uint64_t b= AV_RN64(pixels+1);\
678 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
679 pixels+=line_size;\
680 block +=line_size;\
681 }\
682}\
683\
684static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
685{\
686 int i;\
687 for(i=0; i<h; i++){\
688 const uint64_t a= AV_RN64(pixels );\
689 const uint64_t b= AV_RN64(pixels+1);\
690 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
691 pixels+=line_size;\
692 block +=line_size;\
693 }\
694}\
695\
696static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
697{\
698 int i;\
699 for(i=0; i<h; i++){\
700 const uint64_t a= AV_RN64(pixels );\
701 const uint64_t b= AV_RN64(pixels+line_size);\
702 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
703 pixels+=line_size;\
704 block +=line_size;\
705 }\
706}\
707\
708static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
709{\
710 int i;\
711 for(i=0; i<h; i++){\
712 const uint64_t a= AV_RN64(pixels );\
713 const uint64_t b= AV_RN64(pixels+line_size);\
714 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
715 pixels+=line_size;\
716 block +=line_size;\
717 }\
718}\
719\
720static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
721{\
722 int i;\
723 const uint64_t a= AV_RN64(pixels );\
724 const uint64_t b= AV_RN64(pixels+1);\
725 uint64_t l0= (a&0x0303030303030303ULL)\
726 + (b&0x0303030303030303ULL)\
727 + 0x0202020202020202ULL;\
728 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
729 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
730 uint64_t l1,h1;\
731\
732 pixels+=line_size;\
733 for(i=0; i<h; i+=2){\
734 uint64_t a= AV_RN64(pixels );\
735 uint64_t b= AV_RN64(pixels+1);\
736 l1= (a&0x0303030303030303ULL)\
737 + (b&0x0303030303030303ULL);\
738 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
739 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
740 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
741 pixels+=line_size;\
742 block +=line_size;\
743 a= AV_RN64(pixels );\
744 b= AV_RN64(pixels+1);\
745 l0= (a&0x0303030303030303ULL)\
746 + (b&0x0303030303030303ULL)\
747 + 0x0202020202020202ULL;\
748 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
749 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
750 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
751 pixels+=line_size;\
752 block +=line_size;\
753 }\
754}\
755\
756static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
757{\
758 int i;\
759 const uint64_t a= AV_RN64(pixels );\
760 const uint64_t b= AV_RN64(pixels+1);\
761 uint64_t l0= (a&0x0303030303030303ULL)\
762 + (b&0x0303030303030303ULL)\
763 + 0x0101010101010101ULL;\
764 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
765 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
766 uint64_t l1,h1;\
767\
768 pixels+=line_size;\
769 for(i=0; i<h; i+=2){\
770 uint64_t a= AV_RN64(pixels );\
771 uint64_t b= AV_RN64(pixels+1);\
772 l1= (a&0x0303030303030303ULL)\
773 + (b&0x0303030303030303ULL);\
774 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
775 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
776 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
777 pixels+=line_size;\
778 block +=line_size;\
779 a= AV_RN64(pixels );\
780 b= AV_RN64(pixels+1);\
781 l0= (a&0x0303030303030303ULL)\
782 + (b&0x0303030303030303ULL)\
783 + 0x0101010101010101ULL;\
784 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
785 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
786 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
787 pixels+=line_size;\
788 block +=line_size;\
789 }\
790}\
791\
792CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
793CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
794CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
795CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
796CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
797CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
798CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
799
800#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
801#else // end of the 64-bit variant; the active code below works on 32-bit words
802
803#define PIXOP2(OPNAME, OP) \
804static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
805 int i;\
806 for(i=0; i<h; i++){\
807 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
808 pixels+=line_size;\
809 block +=line_size;\
810 }\
811}\
812static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
813 int i;\
814 for(i=0; i<h; i++){\
815 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
816 pixels+=line_size;\
817 block +=line_size;\
818 }\
819}\
820static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
821 int i;\
822 for(i=0; i<h; i++){\
823 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
824 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
825 pixels+=line_size;\
826 block +=line_size;\
827 }\
828}\
829static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
830 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
831}\
832\
833static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
834 int src_stride1, int src_stride2, int h){\
835 int i;\
836 for(i=0; i<h; i++){\
837 uint32_t a,b;\
838 a= AV_RN32(&src1[i*src_stride1 ]);\
839 b= AV_RN32(&src2[i*src_stride2 ]);\
840 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
841 a= AV_RN32(&src1[i*src_stride1+4]);\
842 b= AV_RN32(&src2[i*src_stride2+4]);\
843 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
844 }\
845}\
846\
847static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
848 int src_stride1, int src_stride2, int h){\
849 int i;\
850 for(i=0; i<h; i++){\
851 uint32_t a,b;\
852 a= AV_RN32(&src1[i*src_stride1 ]);\
853 b= AV_RN32(&src2[i*src_stride2 ]);\
854 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
855 a= AV_RN32(&src1[i*src_stride1+4]);\
856 b= AV_RN32(&src2[i*src_stride2+4]);\
857 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
858 }\
859}\
860\
861static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
862 int src_stride1, int src_stride2, int h){\
863 int i;\
864 for(i=0; i<h; i++){\
865 uint32_t a,b;\
866 a= AV_RN32(&src1[i*src_stride1 ]);\
867 b= AV_RN32(&src2[i*src_stride2 ]);\
868 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
869 }\
870}\
871\
872static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
873 int src_stride1, int src_stride2, int h){\
874 int i;\
875 for(i=0; i<h; i++){\
876 uint32_t a,b;\
877 a= AV_RN16(&src1[i*src_stride1 ]);\
878 b= AV_RN16(&src2[i*src_stride2 ]);\
879 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
880 }\
881}\
882\
883static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
884 int src_stride1, int src_stride2, int h){\
885 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
886 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
887}\
888\
889static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
890 int src_stride1, int src_stride2, int h){\
891 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
892 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
893}\
894\
895static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
896 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
897}\
898\
899static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
900 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
901}\
902\
903static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
904 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
905}\
906\
907static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
908 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
909}\
910\
911static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
912 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
913 int i;\
914 for(i=0; i<h; i++){\
915 uint32_t a, b, c, d, l0, l1, h0, h1;\
916 a= AV_RN32(&src1[i*src_stride1]);\
917 b= AV_RN32(&src2[i*src_stride2]);\
918 c= AV_RN32(&src3[i*src_stride3]);\
919 d= AV_RN32(&src4[i*src_stride4]);\
920 l0= (a&0x03030303UL)\
921 + (b&0x03030303UL)\
922 + 0x02020202UL;\
923 h0= ((a&0xFCFCFCFCUL)>>2)\
924 + ((b&0xFCFCFCFCUL)>>2);\
925 l1= (c&0x03030303UL)\
926 + (d&0x03030303UL);\
927 h1= ((c&0xFCFCFCFCUL)>>2)\
928 + ((d&0xFCFCFCFCUL)>>2);\
929 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
930 a= AV_RN32(&src1[i*src_stride1+4]);\
931 b= AV_RN32(&src2[i*src_stride2+4]);\
932 c= AV_RN32(&src3[i*src_stride3+4]);\
933 d= AV_RN32(&src4[i*src_stride4+4]);\
934 l0= (a&0x03030303UL)\
935 + (b&0x03030303UL)\
936 + 0x02020202UL;\
937 h0= ((a&0xFCFCFCFCUL)>>2)\
938 + ((b&0xFCFCFCFCUL)>>2);\
939 l1= (c&0x03030303UL)\
940 + (d&0x03030303UL);\
941 h1= ((c&0xFCFCFCFCUL)>>2)\
942 + ((d&0xFCFCFCFCUL)>>2);\
943 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
944 }\
945}\
946\
947static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
948 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
949}\
950\
951static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
952 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
953}\
954\
955static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
956 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
957}\
958\
959static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
960 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
961}\
962\
963static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
964 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
965 int i;\
966 for(i=0; i<h; i++){\
967 uint32_t a, b, c, d, l0, l1, h0, h1;\
968 a= AV_RN32(&src1[i*src_stride1]);\
969 b= AV_RN32(&src2[i*src_stride2]);\
970 c= AV_RN32(&src3[i*src_stride3]);\
971 d= AV_RN32(&src4[i*src_stride4]);\
972 l0= (a&0x03030303UL)\
973 + (b&0x03030303UL)\
974 + 0x01010101UL;\
975 h0= ((a&0xFCFCFCFCUL)>>2)\
976 + ((b&0xFCFCFCFCUL)>>2);\
977 l1= (c&0x03030303UL)\
978 + (d&0x03030303UL);\
979 h1= ((c&0xFCFCFCFCUL)>>2)\
980 + ((d&0xFCFCFCFCUL)>>2);\
981 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
982 a= AV_RN32(&src1[i*src_stride1+4]);\
983 b= AV_RN32(&src2[i*src_stride2+4]);\
984 c= AV_RN32(&src3[i*src_stride3+4]);\
985 d= AV_RN32(&src4[i*src_stride4+4]);\
986 l0= (a&0x03030303UL)\
987 + (b&0x03030303UL)\
988 + 0x01010101UL;\
989 h0= ((a&0xFCFCFCFCUL)>>2)\
990 + ((b&0xFCFCFCFCUL)>>2);\
991 l1= (c&0x03030303UL)\
992 + (d&0x03030303UL);\
993 h1= ((c&0xFCFCFCFCUL)>>2)\
994 + ((d&0xFCFCFCFCUL)>>2);\
995 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
996 }\
997}\
998static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
999 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1000 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1001 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1002}\
1003static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1004 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1005 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1006 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1007}\
1008\
1009static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1010{\
1011 int i, a0, b0, a1, b1;\
1012 a0= pixels[0];\
1013 b0= pixels[1] + 2;\
1014 a0 += b0;\
1015 b0 += pixels[2];\
1016\
1017 pixels+=line_size;\
1018 for(i=0; i<h; i+=2){\
1019 a1= pixels[0];\
1020 b1= pixels[1];\
1021 a1 += b1;\
1022 b1 += pixels[2];\
1023\
1024 block[0]= (a1+a0)>>2; /* FIXME non put */\
1025 block[1]= (b1+b0)>>2;\
1026\
1027 pixels+=line_size;\
1028 block +=line_size;\
1029\
1030 a0= pixels[0];\
1031 b0= pixels[1] + 2;\
1032 a0 += b0;\
1033 b0 += pixels[2];\
1034\
1035 block[0]= (a1+a0)>>2;\
1036 block[1]= (b1+b0)>>2;\
1037 pixels+=line_size;\
1038 block +=line_size;\
1039 }\
1040}\
1041\
1042static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1043{\
1044 int i;\
1045 const uint32_t a= AV_RN32(pixels );\
1046 const uint32_t b= AV_RN32(pixels+1);\
1047 uint32_t l0= (a&0x03030303UL)\
1048 + (b&0x03030303UL)\
1049 + 0x02020202UL;\
1050 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1051 + ((b&0xFCFCFCFCUL)>>2);\
1052 uint32_t l1,h1;\
1053\
1054 pixels+=line_size;\
1055 for(i=0; i<h; i+=2){\
1056 uint32_t a= AV_RN32(pixels );\
1057 uint32_t b= AV_RN32(pixels+1);\
1058 l1= (a&0x03030303UL)\
1059 + (b&0x03030303UL);\
1060 h1= ((a&0xFCFCFCFCUL)>>2)\
1061 + ((b&0xFCFCFCFCUL)>>2);\
1062 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1063 pixels+=line_size;\
1064 block +=line_size;\
1065 a= AV_RN32(pixels );\
1066 b= AV_RN32(pixels+1);\
1067 l0= (a&0x03030303UL)\
1068 + (b&0x03030303UL)\
1069 + 0x02020202UL;\
1070 h0= ((a&0xFCFCFCFCUL)>>2)\
1071 + ((b&0xFCFCFCFCUL)>>2);\
1072 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1073 pixels+=line_size;\
1074 block +=line_size;\
1075 }\
1076}\
1077\
1078static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1079{\
1080 int j;\
1081 for(j=0; j<2; j++){\
1082 int i;\
1083 const uint32_t a= AV_RN32(pixels );\
1084 const uint32_t b= AV_RN32(pixels+1);\
1085 uint32_t l0= (a&0x03030303UL)\
1086 + (b&0x03030303UL)\
1087 + 0x02020202UL;\
1088 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1089 + ((b&0xFCFCFCFCUL)>>2);\
1090 uint32_t l1,h1;\
1091\
1092 pixels+=line_size;\
1093 for(i=0; i<h; i+=2){\
1094 uint32_t a= AV_RN32(pixels );\
1095 uint32_t b= AV_RN32(pixels+1);\
1096 l1= (a&0x03030303UL)\
1097 + (b&0x03030303UL);\
1098 h1= ((a&0xFCFCFCFCUL)>>2)\
1099 + ((b&0xFCFCFCFCUL)>>2);\
1100 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1101 pixels+=line_size;\
1102 block +=line_size;\
1103 a= AV_RN32(pixels );\
1104 b= AV_RN32(pixels+1);\
1105 l0= (a&0x03030303UL)\
1106 + (b&0x03030303UL)\
1107 + 0x02020202UL;\
1108 h0= ((a&0xFCFCFCFCUL)>>2)\
1109 + ((b&0xFCFCFCFCUL)>>2);\
1110 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1111 pixels+=line_size;\
1112 block +=line_size;\
1113 }\
1114 pixels+=4-line_size*(h+1);\
1115 block +=4-line_size*h;\
1116 }\
1117}\
1118\
1119static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1120{\
1121 int j;\
1122 for(j=0; j<2; j++){\
1123 int i;\
1124 const uint32_t a= AV_RN32(pixels );\
1125 const uint32_t b= AV_RN32(pixels+1);\
1126 uint32_t l0= (a&0x03030303UL)\
1127 + (b&0x03030303UL)\
1128 + 0x01010101UL;\
1129 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1130 + ((b&0xFCFCFCFCUL)>>2);\
1131 uint32_t l1,h1;\
1132\
1133 pixels+=line_size;\
1134 for(i=0; i<h; i+=2){\
1135 uint32_t a= AV_RN32(pixels );\
1136 uint32_t b= AV_RN32(pixels+1);\
1137 l1= (a&0x03030303UL)\
1138 + (b&0x03030303UL);\
1139 h1= ((a&0xFCFCFCFCUL)>>2)\
1140 + ((b&0xFCFCFCFCUL)>>2);\
1141 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1142 pixels+=line_size;\
1143 block +=line_size;\
1144 a= AV_RN32(pixels );\
1145 b= AV_RN32(pixels+1);\
1146 l0= (a&0x03030303UL)\
1147 + (b&0x03030303UL)\
1148 + 0x01010101UL;\
1149 h0= ((a&0xFCFCFCFCUL)>>2)\
1150 + ((b&0xFCFCFCFCUL)>>2);\
1151 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1152 pixels+=line_size;\
1153 block +=line_size;\
1154 }\
1155 pixels+=4-line_size*(h+1);\
1156 block +=4-line_size*h;\
1157 }\
1158}\
1159\
1160CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1161CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1162CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1163CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1164CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1165CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1166CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1167CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1168
1169#define op_avg(a, b) a = rnd_avg32(a, b)
1170#endif
1171#define op_put(a, b) a = b
1172
1173PIXOP2(avg, op_avg)
1174PIXOP2(put, op_put)
1175#undef op_avg
1176#undef op_put
1177
1178#define avg2(a,b) ((a+b+1)>>1)
1179#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
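/* avg2/avg4 are the plain scalar forms; the word-parallel rnd_avg32 used by
 * op_avg relies on the carry-free identities
 *     (a+b+1)>>1 == (a|b) - ((a^b)>>1)
 *     (a+b)>>1   == (a&b) + ((a^b)>>1)
 * applied per byte, with the xor masked by 0xFE in every lane so that the
 * shifted-out bits cannot leak into the neighbouring byte. */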
1180
1181static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1182 put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
1183}
1184
1185static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1186 put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
1187}
1188
1189static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1190{
1191 const int A=(16-x16)*(16-y16);
1192 const int B=( x16)*(16-y16);
1193 const int C=(16-x16)*( y16);
1194 const int D=( x16)*( y16);
1195 int i;
1196
1197 for(i=0; i<h; i++)
1198 {
1199 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1200 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1201 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1202 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1203 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1204 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1205 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1206 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
1207 dst+= stride;
1208 src+= stride;
1209 }
1210}
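/* 1/16-pel bilinear interpolation: the four weights A..D always sum to
 * 16*16 = 256, so ">> 8" renormalizes and `rounder` supplies the rounding
 * bias. */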
1211
1212void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1213 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1214{
1215 int y, vx, vy;
1216 const int s= 1<<shift;
1217
1218 width--;
1219 height--;
1220
1221 for(y=0; y<h; y++){
1222 int x;
1223
1224 vx= ox;
1225 vy= oy;
1226 for(x=0; x<8; x++){ //XXX FIXME optimize
1227 int src_x, src_y, frac_x, frac_y, index;
1228
1229 src_x= vx>>16;
1230 src_y= vy>>16;
1231 frac_x= src_x&(s-1);
1232 frac_y= src_y&(s-1);
1233 src_x>>=shift;
1234 src_y>>=shift;
1235
1236 if((unsigned)src_x < width){
1237 if((unsigned)src_y < height){
1238 index= src_x + src_y*stride;
1239 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1240 + src[index +1]* frac_x )*(s-frac_y)
1241 + ( src[index+stride ]*(s-frac_x)
1242 + src[index+stride+1]* frac_x )* frac_y
1243 + r)>>(shift*2);
1244 }else{
1245 index= src_x + av_clip(src_y, 0, height)*stride;
1246 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1247 + src[index +1]* frac_x )*s
1248 + r)>>(shift*2);
1249 }
1250 }else{
1251 if((unsigned)src_y < height){
1252 index= av_clip(src_x, 0, width) + src_y*stride;
1253 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
1254 + src[index+stride ]* frac_y )*s
1255 + r)>>(shift*2);
1256 }else{
1257 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
1258 dst[y*stride + x]= src[index ];
1259 }
1260 }
1261
1262 vx+= dxx;
1263 vy+= dyx;
1264 }
1265 ox += dxy;
1266 oy += dyy;
1267 }
1268}
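/* Affine global motion compensation: (vx,vy) is a 16.16 fixed-point source
 * coordinate advanced by (dxx,dyx) per output pixel and by (dxy,dyy) per
 * output line; s = 1<<shift is the subpel resolution, and samples outside
 * the picture are clamped to the nearest edge. */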
1269
1270static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1271 switch(width){
1272 case 2: put_pixels2_c (dst, src, stride, height); break;
1273 case 4: put_pixels4_c (dst, src, stride, height); break;
1274 case 8: put_pixels8_c (dst, src, stride, height); break;
1275 case 16:put_pixels16_c(dst, src, stride, height); break;
1276 }
1277}
1278
1279static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1280 int i,j;
1281 for (i=0; i < height; i++) {
1282 for (j=0; j < width; j++) {
1283 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
1284 }
1285 src += stride;
1286 dst += stride;
1287 }
1288}
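/* "tpel" is third-pel interpolation (as used by SVQ3): the magic constants
 * approximate division, 683 ~= 2^11/3 and 2731 ~= 2^15/12, so e.g.
 * (683*(2*a + b + 1)) >> 11 computes (2*a + b)/3 with rounding. */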
1289
1290static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1291 int i,j;
1292 for (i=0; i < height; i++) {
1293 for (j=0; j < width; j++) {
1294 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
1295 }
1296 src += stride;
1297 dst += stride;
1298 }
1299}
1300
1301static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1302 int i,j;
1303 for (i=0; i < height; i++) {
1304 for (j=0; j < width; j++) {
1305 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
1306 }
1307 src += stride;
1308 dst += stride;
1309 }
1310}
1311
1312static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1313 int i,j;
1314 for (i=0; i < height; i++) {
1315 for (j=0; j < width; j++) {
1316 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
1317 }
1318 src += stride;
1319 dst += stride;
1320 }
1321}
1322
1323static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1324 int i,j;
1325 for (i=0; i < height; i++) {
1326 for (j=0; j < width; j++) {
1327 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1328 }
1329 src += stride;
1330 dst += stride;
1331 }
1332}
1333
1334static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1335 int i,j;
1336 for (i=0; i < height; i++) {
1337 for (j=0; j < width; j++) {
1338 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
1339 }
1340 src += stride;
1341 dst += stride;
1342 }
1343}
1344
1345static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1346 int i,j;
1347 for (i=0; i < height; i++) {
1348 for (j=0; j < width; j++) {
1349 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1350 }
1351 src += stride;
1352 dst += stride;
1353 }
1354}
1355
1356static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1357 int i,j;
1358 for (i=0; i < height; i++) {
1359 for (j=0; j < width; j++) {
1360 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
1361 }
1362 src += stride;
1363 dst += stride;
1364 }
1365}
1366
1367static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1368 switch(width){
1369 case 2: avg_pixels2_c (dst, src, stride, height); break;
1370 case 4: avg_pixels4_c (dst, src, stride, height); break;
1371 case 8: avg_pixels8_c (dst, src, stride, height); break;
1372 case 16:avg_pixels16_c(dst, src, stride, height); break;
1373 }
1374}
1375
1376static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1377 int i,j;
1378 for (i=0; i < height; i++) {
1379 for (j=0; j < width; j++) {
1380 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
1381 }
1382 src += stride;
1383 dst += stride;
1384 }
1385}
1386
1387static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1388 int i,j;
1389 for (i=0; i < height; i++) {
1390 for (j=0; j < width; j++) {
1391 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
1392 }
1393 src += stride;
1394 dst += stride;
1395 }
1396}
1397
1398static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1399 int i,j;
1400 for (i=0; i < height; i++) {
1401 for (j=0; j < width; j++) {
1402 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
1403 }
1404 src += stride;
1405 dst += stride;
1406 }
1407}
1408
1409static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1410 int i,j;
1411 for (i=0; i < height; i++) {
1412 for (j=0; j < width; j++) {
1413 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1414 }
1415 src += stride;
1416 dst += stride;
1417 }
1418}
1419
1420static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1421 int i,j;
1422 for (i=0; i < height; i++) {
1423 for (j=0; j < width; j++) {
1424 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1425 }
1426 src += stride;
1427 dst += stride;
1428 }
1429}
1430
1431static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1432 int i,j;
1433 for (i=0; i < height; i++) {
1434 for (j=0; j < width; j++) {
1435 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
1436 }
1437 src += stride;
1438 dst += stride;
1439 }
1440}
1441
1442static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1443 int i,j;
1444 for (i=0; i < height; i++) {
1445 for (j=0; j < width; j++) {
1446 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1447 }
1448 src += stride;
1449 dst += stride;
1450 }
1451}
1452
1453static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1454 int i,j;
1455 for (i=0; i < height; i++) {
1456 for (j=0; j < width; j++) {
1457 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1458 }
1459 src += stride;
1460 dst += stride;
1461 }
1462}
1463#if 0
1464#define TPEL_WIDTH(width)\
1465static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1466 put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1467static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1468 put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1469static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1470 put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1471static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1472 put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1473static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1474 put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1475static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1476 put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1477static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1478 put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1479static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1480 put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1481static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1482 put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1483#endif
1484
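/* H.264 chroma motion compensation is 1/8-pel bilinear: with (x,y) in
 * [0,8), the weights A+B+C+D always sum to 64, so OP renormalizes with
 * "(v + 32) >> 6". When x or y is 0, D is 0 and the 2-D filter degenerates
 * to a 1-D one; `step` then selects the horizontal or vertical neighbour. */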
1485#define H264_CHROMA_MC(OPNAME, OP)\
1486static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1487 const int A=(8-x)*(8-y);\
1488 const int B=( x)*(8-y);\
1489 const int C=(8-x)*( y);\
1490 const int D=( x)*( y);\
1491 int i;\
1492 \
1493 assert(x<8 && y<8 && x>=0 && y>=0);\
1494\
1495 if(D){\
1496 for(i=0; i<h; i++){\
1497 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1498 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1499 dst+= stride;\
1500 src+= stride;\
1501 }\
1502 }else{\
1503 const int E= B+C;\
1504 const int step= C ? stride : 1;\
1505 for(i=0; i<h; i++){\
1506 OP(dst[0], (A*src[0] + E*src[step+0]));\
1507 OP(dst[1], (A*src[1] + E*src[step+1]));\
1508 dst+= stride;\
1509 src+= stride;\
1510 }\
1511 }\
1512}\
1513\
1514static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1515 const int A=(8-x)*(8-y);\
1516 const int B=( x)*(8-y);\
1517 const int C=(8-x)*( y);\
1518 const int D=( x)*( y);\
1519 int i;\
1520 \
1521 assert(x<8 && y<8 && x>=0 && y>=0);\
1522\
1523 if(D){\
1524 for(i=0; i<h; i++){\
1525 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1526 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1527 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1528 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1529 dst+= stride;\
1530 src+= stride;\
1531 }\
1532 }else{\
1533 const int E= B+C;\
1534 const int step= C ? stride : 1;\
1535 for(i=0; i<h; i++){\
1536 OP(dst[0], (A*src[0] + E*src[step+0]));\
1537 OP(dst[1], (A*src[1] + E*src[step+1]));\
1538 OP(dst[2], (A*src[2] + E*src[step+2]));\
1539 OP(dst[3], (A*src[3] + E*src[step+3]));\
1540 dst+= stride;\
1541 src+= stride;\
1542 }\
1543 }\
1544}\
1545\
1546static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1547 const int A=(8-x)*(8-y);\
1548 const int B=( x)*(8-y);\
1549 const int C=(8-x)*( y);\
1550 const int D=( x)*( y);\
1551 int i;\
1552 \
1553 assert(x<8 && y<8 && x>=0 && y>=0);\
1554\
1555 if(D){\
1556 for(i=0; i<h; i++){\
1557 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1558 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1559 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1560 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1561 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1562 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1563 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1564 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
1565 dst+= stride;\
1566 src+= stride;\
1567 }\
1568 }else{\
1569 const int E= B+C;\
1570 const int step= C ? stride : 1;\
1571 for(i=0; i<h; i++){\
1572 OP(dst[0], (A*src[0] + E*src[step+0]));\
1573 OP(dst[1], (A*src[1] + E*src[step+1]));\
1574 OP(dst[2], (A*src[2] + E*src[step+2]));\
1575 OP(dst[3], (A*src[3] + E*src[step+3]));\
1576 OP(dst[4], (A*src[4] + E*src[step+4]));\
1577 OP(dst[5], (A*src[5] + E*src[step+5]));\
1578 OP(dst[6], (A*src[6] + E*src[step+6]));\
1579 OP(dst[7], (A*src[7] + E*src[step+7]));\
1580 dst+= stride;\
1581 src+= stride;\
1582 }\
1583 }\
1584}
1585
1586#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1587#define op_put(a, b) a = (((b) + 32)>>6)
1588
1589H264_CHROMA_MC(put_ , op_put)
1590H264_CHROMA_MC(avg_ , op_avg)
1591#undef op_avg
1592#undef op_put
1593
1594static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1595 const int A=(8-x)*(8-y);
1596 const int B=( x)*(8-y);
1597 const int C=(8-x)*( y);
1598 const int D=( x)*( y);
1599 int i;
1600
1601 assert(x<8 && y<8 && x>=0 && y>=0);
1602
1603 for(i=0; i<h; i++)
1604 {
1605 dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
1606 dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
1607 dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
1608 dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
1609 dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
1610 dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
1611 dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
1612 dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
1613 dst+= stride;
1614 src+= stride;
1615 }
1616}
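/* The "no_rnd" VC-1 variants bias by 32-4 = 28 instead of 32 before the
 * ">> 6", implementing the codec's alternate rounding mode that rounds
 * some samples downward. */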
1617
1618static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1619 const int A=(8-x)*(8-y);
1620 const int B=( x)*(8-y);
1621 const int C=(8-x)*( y);
1622 const int D=( x)*( y);
1623 int i;
1624
1625 assert(x<8 && y<8 && x>=0 && y>=0);
1626
1627 for(i=0; i<h; i++)
1628 {
1629 dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
1630 dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
1631 dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
1632 dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
1633 dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
1634 dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
1635 dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
1636 dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
1637 dst+= stride;
1638 src+= stride;
1639 }
1640}
1641
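/* MPEG-4 quarter-pel: the half-pel positions are produced by an 8-tap
 * filter with taps (-1, 3, -6, 20, 20, -6, 3, -1), which sum to 32; the
 * RND/OP macros supplied at instantiation add the rounding bias, shift, and
 * clamp through cm. Note how the taps reuse src[7]/src[8] near the block
 * edge, mirroring the border instead of reading past it. */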
1642#define QPEL_MC(r, OPNAME, RND, OP) \
1643static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1644 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1645 int i;\
1646 for(i=0; i<h; i++)\
1647 {\
1648 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1649 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1650 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1651 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1652 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1653 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1654 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1655 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1656 dst+=dstStride;\
1657 src+=srcStride;\
1658 }\
1659}\
1660\
1661static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1662 const int w=8;\
1663 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1664 int i;\
1665 for(i=0; i<w; i++)\
1666 {\
1667 const int src0= src[0*srcStride];\
1668 const int src1= src[1*srcStride];\
1669 const int src2= src[2*srcStride];\
1670 const int src3= src[3*srcStride];\
1671 const int src4= src[4*srcStride];\
1672 const int src5= src[5*srcStride];\
1673 const int src6= src[6*srcStride];\
1674 const int src7= src[7*srcStride];\
1675 const int src8= src[8*srcStride];\
1676 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1677 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1678 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1679 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1680 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1681 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1682 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1683 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1684 dst++;\
1685 src++;\
1686 }\
1687}\
1688\
1689static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1690 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1691 int i;\
1692 \
1693 for(i=0; i<h; i++)\
1694 {\
1695 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1696 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1697 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1698 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1699 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1700 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1701 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1702 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1703 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1704 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1705 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1706 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1707 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1708 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1709 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1710 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1711 dst+=dstStride;\
1712 src+=srcStride;\
1713 }\
1714}\
1715\
1716static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1717 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1718 int i;\
1719 const int w=16;\
1720 for(i=0; i<w; i++)\
1721 {\
1722 const int src0= src[0*srcStride];\
1723 const int src1= src[1*srcStride];\
1724 const int src2= src[2*srcStride];\
1725 const int src3= src[3*srcStride];\
1726 const int src4= src[4*srcStride];\
1727 const int src5= src[5*srcStride];\
1728 const int src6= src[6*srcStride];\
1729 const int src7= src[7*srcStride];\
1730 const int src8= src[8*srcStride];\
1731 const int src9= src[9*srcStride];\
1732 const int src10= src[10*srcStride];\
1733 const int src11= src[11*srcStride];\
1734 const int src12= src[12*srcStride];\
1735 const int src13= src[13*srcStride];\
1736 const int src14= src[14*srcStride];\
1737 const int src15= src[15*srcStride];\
1738 const int src16= src[16*srcStride];\
1739 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1740 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1741 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1742 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1743 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1744 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1745 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1746 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1747 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1748 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1749 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1750 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1751 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1752 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1753 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1754 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1755 dst++;\
1756 src++;\
1757 }\
1758}\
1759\
1760static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1761 OPNAME ## pixels8_c(dst, src, stride, 8);\
1762}\
1763\
1764static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1765 uint8_t half[64];\
1766 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1767 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1768}\
1769\
1770static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1771 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1772}\
1773\
1774static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1775 uint8_t half[64];\
1776 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1777 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1778}\
1779\
1780static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1781 uint8_t full[16*9];\
1782 uint8_t half[64];\
1783 copy_block9(full, src, 16, stride, 9);\
1784 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1785 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1786}\
1787\
1788static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1789 uint8_t full[16*9];\
1790 copy_block9(full, src, 16, stride, 9);\
1791 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1792}\
1793\
1794static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1795 uint8_t full[16*9];\
1796 uint8_t half[64];\
1797 copy_block9(full, src, 16, stride, 9);\
1798 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1799 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1800}\
1801void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1802 uint8_t full[16*9];\
1803 uint8_t halfH[72];\
1804 uint8_t halfV[64];\
1805 uint8_t halfHV[64];\
1806 copy_block9(full, src, 16, stride, 9);\
1807 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1808 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1809 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1810 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1811}\
1812static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1813 uint8_t full[16*9];\
1814 uint8_t halfH[72];\
1815 uint8_t halfHV[64];\
1816 copy_block9(full, src, 16, stride, 9);\
1817 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1818 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1819 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1820 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1821}\
1822void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1823 uint8_t full[16*9];\
1824 uint8_t halfH[72];\
1825 uint8_t halfV[64];\
1826 uint8_t halfHV[64];\
1827 copy_block9(full, src, 16, stride, 9);\
1828 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1829 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1830 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1831 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1832}\
1833static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1834 uint8_t full[16*9];\
1835 uint8_t halfH[72];\
1836 uint8_t halfHV[64];\
1837 copy_block9(full, src, 16, stride, 9);\
1838 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1839 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1840 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1841 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1842}\
1843void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1844 uint8_t full[16*9];\
1845 uint8_t halfH[72];\
1846 uint8_t halfV[64];\
1847 uint8_t halfHV[64];\
1848 copy_block9(full, src, 16, stride, 9);\
1849 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1850 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1851 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1852 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1853}\
1854static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1855 uint8_t full[16*9];\
1856 uint8_t halfH[72];\
1857 uint8_t halfHV[64];\
1858 copy_block9(full, src, 16, stride, 9);\
1859 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1860 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1861 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1862 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1863}\
1864void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1865 uint8_t full[16*9];\
1866 uint8_t halfH[72];\
1867 uint8_t halfV[64];\
1868 uint8_t halfHV[64];\
1869 copy_block9(full, src, 16, stride, 9);\
1870 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1871 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1872 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1873 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1874}\
1875static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1876 uint8_t full[16*9];\
1877 uint8_t halfH[72];\
1878 uint8_t halfHV[64];\
1879 copy_block9(full, src, 16, stride, 9);\
1880 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1881 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1882 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1883 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1884}\
1885static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1886 uint8_t halfH[72];\
1887 uint8_t halfHV[64];\
1888 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1889 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1890 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1891}\
1892static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1893 uint8_t halfH[72];\
1894 uint8_t halfHV[64];\
1895 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1896 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1897 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1898}\
1899void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1900 uint8_t full[16*9];\
1901 uint8_t halfH[72];\
1902 uint8_t halfV[64];\
1903 uint8_t halfHV[64];\
1904 copy_block9(full, src, 16, stride, 9);\
1905 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1906 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1907 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1908 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1909}\
1910static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1911 uint8_t full[16*9];\
1912 uint8_t halfH[72];\
1913 copy_block9(full, src, 16, stride, 9);\
1914 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1915 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1916 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1917}\
1918void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1919 uint8_t full[16*9];\
1920 uint8_t halfH[72];\
1921 uint8_t halfV[64];\
1922 uint8_t halfHV[64];\
1923 copy_block9(full, src, 16, stride, 9);\
1924 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1925 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1926 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1927 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1928}\
1929static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1930 uint8_t full[16*9];\
1931 uint8_t halfH[72];\
1932 copy_block9(full, src, 16, stride, 9);\
1933 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1934 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1935 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1936}\
1937static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1938 uint8_t halfH[72];\
1939 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1940 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1941}\
1942static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1943 OPNAME ## pixels16_c(dst, src, stride, 16);\
1944}\
1945\
1946static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1947 uint8_t half[256];\
1948 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1949 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1950}\
1951\
1952static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1953 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1954}\
1955\
1956static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1957 uint8_t half[256];\
1958 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1959 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1960}\
1961\
1962static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1963 uint8_t full[24*17];\
1964 uint8_t half[256];\
1965 copy_block17(full, src, 24, stride, 17);\
1966 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1967 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1968}\
1969\
1970static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1971 uint8_t full[24*17];\
1972 copy_block17(full, src, 24, stride, 17);\
1973 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1974}\
1975\
1976static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1977 uint8_t full[24*17];\
1978 uint8_t half[256];\
1979 copy_block17(full, src, 24, stride, 17);\
1980 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1981 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1982}\
1983void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1984 uint8_t full[24*17];\
1985 uint8_t halfH[272];\
1986 uint8_t halfV[256];\
1987 uint8_t halfHV[256];\
1988 copy_block17(full, src, 24, stride, 17);\
1989 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1990 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1991 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1992 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1993}\
1994static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1995 uint8_t full[24*17];\
1996 uint8_t halfH[272];\
1997 uint8_t halfHV[256];\
1998 copy_block17(full, src, 24, stride, 17);\
1999 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2000 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2001 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2002 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2003}\
2004void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2005 uint8_t full[24*17];\
2006 uint8_t halfH[272];\
2007 uint8_t halfV[256];\
2008 uint8_t halfHV[256];\
2009 copy_block17(full, src, 24, stride, 17);\
2010 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2011 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2012 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2013 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2014}\
2015static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2016 uint8_t full[24*17];\
2017 uint8_t halfH[272];\
2018 uint8_t halfHV[256];\
2019 copy_block17(full, src, 24, stride, 17);\
2020 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2021 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2022 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2023 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2024}\
2025void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2026 uint8_t full[24*17];\
2027 uint8_t halfH[272];\
2028 uint8_t halfV[256];\
2029 uint8_t halfHV[256];\
2030 copy_block17(full, src, 24, stride, 17);\
2031 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2032 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2033 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2034 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2035}\
2036static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2037 uint8_t full[24*17];\
2038 uint8_t halfH[272];\
2039 uint8_t halfHV[256];\
2040 copy_block17(full, src, 24, stride, 17);\
2041 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2042 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2043 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2044 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2045}\
2046void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2047 uint8_t full[24*17];\
2048 uint8_t halfH[272];\
2049 uint8_t halfV[256];\
2050 uint8_t halfHV[256];\
2051 copy_block17(full, src, 24, stride, 17);\
2052 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2053 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2054 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2055 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2056}\
2057static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2058 uint8_t full[24*17];\
2059 uint8_t halfH[272];\
2060 uint8_t halfHV[256];\
2061 copy_block17(full, src, 24, stride, 17);\
2062 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2063 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2064 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2065 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2066}\
2067static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2068 uint8_t halfH[272];\
2069 uint8_t halfHV[256];\
2070 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2071 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2072 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2073}\
2074static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2075 uint8_t halfH[272];\
2076 uint8_t halfHV[256];\
2077 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2078 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2079 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2080}\
2081void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2082 uint8_t full[24*17];\
2083 uint8_t halfH[272];\
2084 uint8_t halfV[256];\
2085 uint8_t halfHV[256];\
2086 copy_block17(full, src, 24, stride, 17);\
2087 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2088 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2089 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2090 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2091}\
2092static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2093 uint8_t full[24*17];\
2094 uint8_t halfH[272];\
2095 copy_block17(full, src, 24, stride, 17);\
2096 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2097 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2098 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2099}\
2100void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2101 uint8_t full[24*17];\
2102 uint8_t halfH[272];\
2103 uint8_t halfV[256];\
2104 uint8_t halfHV[256];\
2105 copy_block17(full, src, 24, stride, 17);\
2106 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2107 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2108 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2109 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2110}\
2111static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2112 uint8_t full[24*17];\
2113 uint8_t halfH[272];\
2114 copy_block17(full, src, 24, stride, 17);\
2115 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2116 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2117 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2118}\
2119static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2120 uint8_t halfH[272];\
2121 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2122 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2123}
2124
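/* QPEL_MC above generates the complete set of MPEG-4 quarter-pel motion
 * compensation functions for one output operation. The mcXY suffix encodes
 * the quarter-pel offset of the prediction: X is the horizontal and Y the
 * vertical component in quarter pixels, so mc00 is the integer-pel copy,
 * mc20 the horizontal half-pel filter, mc02 the vertical one and mc22 the
 * centre position. Quarter-pel positions are formed by averaging a filtered
 * half-pel plane with the nearest integer or half-pel plane via
 * pixels8_l2/pixels8_l4. */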
2125#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2126#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2127#define op_put(a, b) a = cm[((b) + 16)>>5]
2128#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2129
2130QPEL_MC(0, put_ , _ , op_put)
2131QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2132QPEL_MC(0, avg_ , _ , op_avg)
2133//QPEL_MC(1, avg_no_rnd , _ , op_avg)
2134#undef op_avg
2135#undef op_avg_no_rnd
2136#undef op_put
2137#undef op_put_no_rnd
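/* The OP macros supply the rounding step for the filters above. The taps
 * (20, -6, 3, -1) applied to symmetric sample pairs sum to 32, so
 * cm[(b + 16) >> 5] divides by 32 with rounding and clips through the crop
 * table; the no_rnd variants add only 15, biasing ties downwards. As a
 * sanity check, a flat area of value v yields 32*v from the filter, and
 * (32*v + 16) >> 5 == v, i.e. flat input passes through unchanged. */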
2138
2139#if 1
2140#define H264_LOWPASS(OPNAME, OP, OP2) \
2141static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2142 const int h=2;\
2143 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2144 int i;\
2145 for(i=0; i<h; i++)\
2146 {\
2147 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2148 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2149 dst+=dstStride;\
2150 src+=srcStride;\
2151 }\
2152}\
2153\
2154static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2155 const int w=2;\
2156 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2157 int i;\
2158 for(i=0; i<w; i++)\
2159 {\
2160 const int srcB= src[-2*srcStride];\
2161 const int srcA= src[-1*srcStride];\
2162 const int src0= src[0 *srcStride];\
2163 const int src1= src[1 *srcStride];\
2164 const int src2= src[2 *srcStride];\
2165 const int src3= src[3 *srcStride];\
2166 const int src4= src[4 *srcStride];\
2167 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2168 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2169 dst++;\
2170 src++;\
2171 }\
2172}\
2173\
2174static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2175 const int h=2;\
2176 const int w=2;\
2177 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2178 int i;\
2179 src -= 2*srcStride;\
2180 for(i=0; i<h+5; i++)\
2181 {\
2182 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2183 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2184 tmp+=tmpStride;\
2185 src+=srcStride;\
2186 }\
2187 tmp -= tmpStride*(h+5-2);\
2188 for(i=0; i<w; i++)\
2189 {\
2190 const int tmpB= tmp[-2*tmpStride];\
2191 const int tmpA= tmp[-1*tmpStride];\
2192 const int tmp0= tmp[0 *tmpStride];\
2193 const int tmp1= tmp[1 *tmpStride];\
2194 const int tmp2= tmp[2 *tmpStride];\
2195 const int tmp3= tmp[3 *tmpStride];\
2196 const int tmp4= tmp[4 *tmpStride];\
2197 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2198 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2199 dst++;\
2200 tmp++;\
2201 }\
2202}\
2203static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2204 const int h=4;\
2205 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2206 int i;\
2207 for(i=0; i<h; i++)\
2208 {\
2209 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2210 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2211 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2212 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2213 dst+=dstStride;\
2214 src+=srcStride;\
2215 }\
2216}\
2217\
2218static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2219 const int w=4;\
2220 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2221 int i;\
2222 for(i=0; i<w; i++)\
2223 {\
2224 const int srcB= src[-2*srcStride];\
2225 const int srcA= src[-1*srcStride];\
2226 const int src0= src[0 *srcStride];\
2227 const int src1= src[1 *srcStride];\
2228 const int src2= src[2 *srcStride];\
2229 const int src3= src[3 *srcStride];\
2230 const int src4= src[4 *srcStride];\
2231 const int src5= src[5 *srcStride];\
2232 const int src6= src[6 *srcStride];\
2233 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2234 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2235 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2236 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2237 dst++;\
2238 src++;\
2239 }\
2240}\
2241\
2242static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2243 const int h=4;\
2244 const int w=4;\
2245 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2246 int i;\
2247 src -= 2*srcStride;\
2248 for(i=0; i<h+5; i++)\
2249 {\
2250 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2251 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2252 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2253 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2254 tmp+=tmpStride;\
2255 src+=srcStride;\
2256 }\
2257 tmp -= tmpStride*(h+5-2);\
2258 for(i=0; i<w; i++)\
2259 {\
2260 const int tmpB= tmp[-2*tmpStride];\
2261 const int tmpA= tmp[-1*tmpStride];\
2262 const int tmp0= tmp[0 *tmpStride];\
2263 const int tmp1= tmp[1 *tmpStride];\
2264 const int tmp2= tmp[2 *tmpStride];\
2265 const int tmp3= tmp[3 *tmpStride];\
2266 const int tmp4= tmp[4 *tmpStride];\
2267 const int tmp5= tmp[5 *tmpStride];\
2268 const int tmp6= tmp[6 *tmpStride];\
2269 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2270 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2271 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2272 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2273 dst++;\
2274 tmp++;\
2275 }\
2276}\
2277\
2278static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2279 const int h=8;\
2280 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2281 int i;\
2282 for(i=0; i<h; i++)\
2283 {\
2284 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2285 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2286 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2287 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2288 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2289 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2290 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2291 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2292 dst+=dstStride;\
2293 src+=srcStride;\
2294 }\
2295}\
2296\
2297static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2298 const int w=8;\
2299 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2300 int i;\
2301 for(i=0; i<w; i++)\
2302 {\
2303 const int srcB= src[-2*srcStride];\
2304 const int srcA= src[-1*srcStride];\
2305 const int src0= src[0 *srcStride];\
2306 const int src1= src[1 *srcStride];\
2307 const int src2= src[2 *srcStride];\
2308 const int src3= src[3 *srcStride];\
2309 const int src4= src[4 *srcStride];\
2310 const int src5= src[5 *srcStride];\
2311 const int src6= src[6 *srcStride];\
2312 const int src7= src[7 *srcStride];\
2313 const int src8= src[8 *srcStride];\
2314 const int src9= src[9 *srcStride];\
2315 const int src10=src[10*srcStride];\
2316 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2317 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2318 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2319 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2320 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2321 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2322 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2323 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2324 dst++;\
2325 src++;\
2326 }\
2327}\
2328\
2329static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2330 const int h=8;\
2331 const int w=8;\
2332 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2333 int i;\
2334 src -= 2*srcStride;\
2335 for(i=0; i<h+5; i++)\
2336 {\
2337 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2338 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2339 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2340 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2341 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2342 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2343 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2344 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2345 tmp+=tmpStride;\
2346 src+=srcStride;\
2347 }\
2348 tmp -= tmpStride*(h+5-2);\
2349 for(i=0; i<w; i++)\
2350 {\
2351 const int tmpB= tmp[-2*tmpStride];\
2352 const int tmpA= tmp[-1*tmpStride];\
2353 const int tmp0= tmp[0 *tmpStride];\
2354 const int tmp1= tmp[1 *tmpStride];\
2355 const int tmp2= tmp[2 *tmpStride];\
2356 const int tmp3= tmp[3 *tmpStride];\
2357 const int tmp4= tmp[4 *tmpStride];\
2358 const int tmp5= tmp[5 *tmpStride];\
2359 const int tmp6= tmp[6 *tmpStride];\
2360 const int tmp7= tmp[7 *tmpStride];\
2361 const int tmp8= tmp[8 *tmpStride];\
2362 const int tmp9= tmp[9 *tmpStride];\
2363 const int tmp10=tmp[10*tmpStride];\
2364 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2365 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2366 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2367 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2368 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2369 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2370 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2371 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2372 dst++;\
2373 tmp++;\
2374 }\
2375}\
2376\
2377static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2378 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2379 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2380 src += 8*srcStride;\
2381 dst += 8*dstStride;\
2382 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2383 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2384}\
2385\
2386static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2387 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2388 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2389 src += 8*srcStride;\
2390 dst += 8*dstStride;\
2391 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2392 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2393}\
2394\
2395static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2396 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2397 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2398 src += 8*srcStride;\
2399 dst += 8*dstStride;\
2400 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2401 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2402}\
2403
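/* H264_LOWPASS above generates the H.264 6-tap half-pel filter
 * (1, -5, 20, 20, -5, 1) in horizontal, vertical and combined form. The
 * single-pass filters have gain 32 and are rounded by OP with (b+16)>>5.
 * The hv path keeps the unclipped horizontal result in a 16-bit tmp plane
 * (two extra rows above and three below the block, hence the h+5 rows and
 * the src/tmp rewinds), then filters it vertically for a total gain of
 * 32*32 = 1024, which OP2 rounds away with (b+512)>>10; a flat input v
 * again maps back to v. */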
2404#define H264_MC(OPNAME, SIZE) \
2405static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2406 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2407}\
2408\
2409static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2410 uint8_t half[SIZE*SIZE];\
2411 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2412 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2413}\
2414\
2415static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2416 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2417}\
2418\
2419static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2420 uint8_t half[SIZE*SIZE];\
2421 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2422 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2423}\
2424\
2425static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2426 uint8_t full[SIZE*(SIZE+5)];\
2427 uint8_t * const full_mid= full + SIZE*2;\
2428 uint8_t half[SIZE*SIZE];\
2429 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2430 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2431 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2432}\
2433\
2434static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2435 uint8_t full[SIZE*(SIZE+5)];\
2436 uint8_t * const full_mid= full + SIZE*2;\
2437 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2438 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2439}\
2440\
2441static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2442 uint8_t full[SIZE*(SIZE+5)];\
2443 uint8_t * const full_mid= full + SIZE*2;\
2444 uint8_t half[SIZE*SIZE];\
2445 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2446 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2447 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2448}\
2449\
2450static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2451 uint8_t full[SIZE*(SIZE+5)];\
2452 uint8_t * const full_mid= full + SIZE*2;\
2453 uint8_t halfH[SIZE*SIZE];\
2454 uint8_t halfV[SIZE*SIZE];\
2455 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2456 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2457 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2458 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2459}\
2460\
2461static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2462 uint8_t full[SIZE*(SIZE+5)];\
2463 uint8_t * const full_mid= full + SIZE*2;\
2464 uint8_t halfH[SIZE*SIZE];\
2465 uint8_t halfV[SIZE*SIZE];\
2466 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2467 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2468 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2469 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2470}\
2471\
2472static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2473 uint8_t full[SIZE*(SIZE+5)];\
2474 uint8_t * const full_mid= full + SIZE*2;\
2475 uint8_t halfH[SIZE*SIZE];\
2476 uint8_t halfV[SIZE*SIZE];\
2477 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2478 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2479 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2480 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2481}\
2482\
2483static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2484 uint8_t full[SIZE*(SIZE+5)];\
2485 uint8_t * const full_mid= full + SIZE*2;\
2486 uint8_t halfH[SIZE*SIZE];\
2487 uint8_t halfV[SIZE*SIZE];\
2488 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2489 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2490 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2491 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2492}\
2493\
2494static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2495 int16_t tmp[SIZE*(SIZE+5)];\
2496 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2497}\
2498\
2499static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2500 int16_t tmp[SIZE*(SIZE+5)];\
2501 uint8_t halfH[SIZE*SIZE];\
2502 uint8_t halfHV[SIZE*SIZE];\
2503 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2504 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2505 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2506}\
2507\
2508static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2509 int16_t tmp[SIZE*(SIZE+5)];\
2510 uint8_t halfH[SIZE*SIZE];\
2511 uint8_t halfHV[SIZE*SIZE];\
2512 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2513 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2514 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2515}\
2516\
2517static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2518 uint8_t full[SIZE*(SIZE+5)];\
2519 uint8_t * const full_mid= full + SIZE*2;\
2520 int16_t tmp[SIZE*(SIZE+5)];\
2521 uint8_t halfV[SIZE*SIZE];\
2522 uint8_t halfHV[SIZE*SIZE];\
2523 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2524 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2525 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2526 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2527}\
2528\
2529static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2530 uint8_t full[SIZE*(SIZE+5)];\
2531 uint8_t * const full_mid= full + SIZE*2;\
2532 int16_t tmp[SIZE*(SIZE+5)];\
2533 uint8_t halfV[SIZE*SIZE];\
2534 uint8_t halfHV[SIZE*SIZE];\
2535 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2536 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2537 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2538 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2539}\
2540
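/* H264_MC builds the sixteen public mcXY entry points for one block size
 * from the lowpass primitives: full-pel positions copy, half-pel positions
 * run a single lowpass filter, and quarter-pel positions average the
 * relevant half-pel plane(s) with pixelsN_l2. copy_blockN first widens the
 * source into `full` with SIZE+5 rows so the vertical filter has its two
 * rows of context above and three below; full_mid points at the row
 * corresponding to the original block top. */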
2541#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2542//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2543#define op_put(a, b) a = cm[((b) + 16)>>5]
2544#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2545#define op2_put(a, b) a = cm[((b) + 512)>>10]
2546
2547H264_LOWPASS(put_ , op_put, op2_put)
2548H264_LOWPASS(avg_ , op_avg, op2_avg)
2549H264_MC(put_, 2)
2550H264_MC(put_, 4)
2551H264_MC(put_, 8)
2552H264_MC(put_, 16)
2553H264_MC(avg_, 4)
2554H264_MC(avg_, 8)
2555H264_MC(avg_, 16)
2556
2557#undef op_avg
2558#undef op_put
2559#undef op2_avg
2560#undef op2_put
2561#endif
2562
2563static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2564 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2565 int i;
2566
2567 for(i=0; i<h; i++){
2568 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2569 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2570 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2571 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2572 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2573 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2574 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2575 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2576 dst+=dstStride;
2577 src+=srcStride;
2578 }
2579}
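/* WMV2 mspel interpolation uses a 4-tap half-pel filter with taps
 * (-1, 9, 9, -1)/16; the +8 before the >>4 rounds to nearest and cm[]
 * clips to 0..255. On a flat area of value v this gives
 * (16*v + 8) >> 4 == v. */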
2580
2581#if CONFIG_CAVS_DECODER
2582/* AVS specific */
2583void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2584 put_pixels8_c(dst, src, stride, 8);
2585}
2586void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2587 avg_pixels8_c(dst, src, stride, 8);
2588}
2589void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2590 put_pixels16_c(dst, src, stride, 16);
2591}
2592void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2593 avg_pixels16_c(dst, src, stride, 16);
2594}
2595#endif /* CONFIG_CAVS_DECODER */
2596
2597#if CONFIG_VC1_DECODER
2598/* VC-1 specific */
2599void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
2600 put_pixels8_c(dst, src, stride, 8);
2601}
2602void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
2603 avg_pixels8_c(dst, src, stride, 8);
2604}
2605#endif /* CONFIG_VC1_DECODER */
2606
2607#if CONFIG_RV40_DECODER
2608static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2609 put_pixels16_xy2_c(dst, src, stride, 16);
2610}
2611static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2612 avg_pixels16_xy2_c(dst, src, stride, 16);
2613}
2614static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2615 put_pixels8_xy2_c(dst, src, stride, 8);
2616}
2617static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2618 avg_pixels8_xy2_c(dst, src, stride, 8);
2619}
2620#endif /* CONFIG_RV40_DECODER */
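/* The CAVS, VC-1 and RV40 helpers above are thin wrappers: for these
 * particular positions the codec-specific interpolation degenerates to the
 * generic copy/average primitives (for RV40, the (3,3) position is in
 * effect the 2x2 bilinear average, hence pixelsN_xy2), so no dedicated
 * filter code is needed. */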
2621
2622static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2623 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2624 int i;
2625
2626 for(i=0; i<w; i++){
2627 const int src_1= src[ -srcStride];
2628 const int src0 = src[0 ];
2629 const int src1 = src[ srcStride];
2630 const int src2 = src[2*srcStride];
2631 const int src3 = src[3*srcStride];
2632 const int src4 = src[4*srcStride];
2633 const int src5 = src[5*srcStride];
2634 const int src6 = src[6*srcStride];
2635 const int src7 = src[7*srcStride];
2636 const int src8 = src[8*srcStride];
2637 const int src9 = src[9*srcStride];
2638 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2639 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2640 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2641 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2642 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2643 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2644 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2645 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2646 src++;
2647 dst++;
2648 }
2649}
2650
2651static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2652 put_pixels8_c(dst, src, stride, 8);
2653}
2654
2655static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2656 uint8_t half[64];
2657 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2658 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2659}
2660
2661static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2662 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2663}
2664
2665static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2666 uint8_t half[64];
2667 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2668 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2669}
2670
2671static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2672 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2673}
2674
2675static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2676 uint8_t halfH[88];
2677 uint8_t halfV[64];
2678 uint8_t halfHV[64];
2679 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2680 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2681 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2682 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2683}
2684static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2685 uint8_t halfH[88];
2686 uint8_t halfV[64];
2687 uint8_t halfHV[64];
2688 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2689 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2690 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2691 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2692}
2693static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2694 uint8_t halfH[88];
2695 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2696 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2697}
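/* For the combined WMV2 positions above, halfH is computed over 11 rows
 * starting one row above the block (src - stride), so the subsequent
 * vertical pass over it has the one row of top context the 4-tap filter
 * needs; halfH + 8 then addresses the row aligned with the block itself. */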
2698
2699static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2700 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2701 int x;
2702 const int strength= ff_h263_loop_filter_strength[qscale];
2703
2704 for(x=0; x<8; x++){
2705 int d1, d2, ad1;
2706 int p0= src[x-2*stride];
2707 int p1= src[x-1*stride];
2708 int p2= src[x+0*stride];
2709 int p3= src[x+1*stride];
2710 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2711
2712 if (d<-2*strength) d1= 0;
2713 else if(d<- strength) d1=-2*strength - d;
2714 else if(d< strength) d1= d;
2715 else if(d< 2*strength) d1= 2*strength - d;
2716 else d1= 0;
2717
2718 p1 += d1;
2719 p2 -= d1;
2720 if(p1&256) p1= ~(p1>>31);
2721 if(p2&256) p2= ~(p2>>31);
2722
2723 src[x-1*stride] = p1;
2724 src[x+0*stride] = p2;
2725
2726 ad1= FFABS(d1)>>1;
2727
2728 d2= av_clip((p0-p3)/4, -ad1, ad1);
2729
2730 src[x-2*stride] = p0 - d2;
2731 src[x+ stride] = p3 + d2;
2732 }
2733 }
2734}
2735
2736static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2737 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2738 int y;
2739 const int strength= ff_h263_loop_filter_strength[qscale];
2740
2741 for(y=0; y<8; y++){
2742 int d1, d2, ad1;
2743 int p0= src[y*stride-2];
2744 int p1= src[y*stride-1];
2745 int p2= src[y*stride+0];
2746 int p3= src[y*stride+1];
2747 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2748
2749 if (d<-2*strength) d1= 0;
2750 else if(d<- strength) d1=-2*strength - d;
2751 else if(d< strength) d1= d;
2752 else if(d< 2*strength) d1= 2*strength - d;
2753 else d1= 0;
2754
2755 p1 += d1;
2756 p2 -= d1;
2757 if(p1&256) p1= ~(p1>>31);
2758 if(p2&256) p2= ~(p2>>31);
2759
2760 src[y*stride-1] = p1;
2761 src[y*stride+0] = p2;
2762
2763 ad1= FFABS(d1)>>1;
2764
2765 d2= av_clip((p0-p3)/4, -ad1, ad1);
2766
2767 src[y*stride-2] = p0 - d2;
2768 src[y*stride+1] = p3 + d2;
2769 }
2770 }
2771}
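/* In both H.263 loop filters, d measures the step across the block edge.
 * The piecewise mapping to d1 is a deadzone ramp: small steps (likely
 * blocking artifacts, |d| < strength) are corrected at full amplitude,
 * larger ones are tapered off, and anything beyond 2*strength is left
 * alone as a real image edge. The expression `if(p1&256) p1= ~(p1>>31)` is
 * a branch-light clip to 0..255: after the correction p1 lies roughly in
 * -256..511, so bit 8 is set exactly when p1 is out of range, and
 * ~(p1>>31) evaluates to 0 for negative values and to all-ones (255 once
 * stored into a byte) for positive overflow. */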
2772
2773static void h261_loop_filter_c(uint8_t *src, int stride){
2774 int x,y,xy,yz;
2775 int temp[64];
2776
2777 for(x=0; x<8; x++){
2778 temp[x ] = 4*src[x ];
2779 temp[x + 7*8] = 4*src[x + 7*stride];
2780 }
2781 for(y=1; y<7; y++){
2782 for(x=0; x<8; x++){
2783 xy = y * stride + x;
2784 yz = y * 8 + x;
2785 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2786 }
2787 }
2788
2789 for(y=0; y<8; y++){
2790 src[ y*stride] = (temp[ y*8] + 2)>>2;
2791 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2792 for(x=1; x<7; x++){
2793 xy = y * stride + x;
2794 yz = y * 8 + x;
2795 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
2796 }
2797 }
2798}
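/* The H.261 loop filter is a separable [1 2 1]/4 smoother. The first pass
 * filters vertically into temp[] with gain 4 (the top and bottom rows are
 * copied with the same gain); the second pass filters horizontally, so
 * interior pixels end up as a [1 2 1]x[1 2 1]/16 average rounded by the +8
 * before >>4, while border columns are only scaled back with (x + 2) >> 2. */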
2799
2800static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2801{
2802 int s, i;
2803
2804 s = 0;
2805 for(i=0;i<h;i++) {
2806 s += abs(pix1[0] - pix2[0]);
2807 s += abs(pix1[1] - pix2[1]);
2808 s += abs(pix1[2] - pix2[2]);
2809 s += abs(pix1[3] - pix2[3]);
2810 s += abs(pix1[4] - pix2[4]);
2811 s += abs(pix1[5] - pix2[5]);
2812 s += abs(pix1[6] - pix2[6]);
2813 s += abs(pix1[7] - pix2[7]);
2814 s += abs(pix1[8] - pix2[8]);
2815 s += abs(pix1[9] - pix2[9]);
2816 s += abs(pix1[10] - pix2[10]);
2817 s += abs(pix1[11] - pix2[11]);
2818 s += abs(pix1[12] - pix2[12]);
2819 s += abs(pix1[13] - pix2[13]);
2820 s += abs(pix1[14] - pix2[14]);
2821 s += abs(pix1[15] - pix2[15]);
2822 pix1 += line_size;
2823 pix2 += line_size;
2824 }
2825 return s;
2826}
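/* pix_abs16_c is the plain 16-wide SAD (sum of absolute differences)
 * motion-estimation comparator. The _x2, _y2 and _xy2 variants that follow
 * score half-pel candidates on the fly by comparing against the
 * horizontal, vertical or diagonal average of pix2, so the encoder can
 * evaluate half-pel motion vectors without first writing out an
 * interpolated reference plane. */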
2827
2828static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2829{
2830 int s, i;
2831
2832 s = 0;
2833 for(i=0;i<h;i++) {
2834 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2835 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2836 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2837 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2838 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2839 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2840 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2841 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2842 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2843 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2844 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2845 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2846 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2847 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2848 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2849 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
2850 pix1 += line_size;
2851 pix2 += line_size;
2852 }
2853 return s;
2854}
2855
2856static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2857{
2858 int s, i;
2859 uint8_t *pix3 = pix2 + line_size;
2860
2861 s = 0;
2862 for(i=0;i<h;i++) {
2863 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2864 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2865 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2866 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2867 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2868 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2869 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2870 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2871 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2872 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2873 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2874 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2875 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2876 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2877 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2878 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
2879 pix1 += line_size;
2880 pix2 += line_size;
2881 pix3 += line_size;
2882 }
2883 return s;
2884}
2885
2886static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2887{
2888 int s, i;
2889 uint8_t *pix3 = pix2 + line_size;
2890
2891 s = 0;
2892 for(i=0;i<h;i++) {
2893 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2894 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2895 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2896 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2897 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2898 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2899 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2900 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2901 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2902 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2903 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2904 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2905 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2906 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2907 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2908 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
2909 pix1 += line_size;
2910 pix2 += line_size;
2911 pix3 += line_size;
2912 }
2913 return s;
2914}
2915
2916static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2917{
2918 int s, i;
2919
2920 s = 0;
2921 for(i=0;i<h;i++) {
2922 s += abs(pix1[0] - pix2[0]);
2923 s += abs(pix1[1] - pix2[1]);
2924 s += abs(pix1[2] - pix2[2]);
2925 s += abs(pix1[3] - pix2[3]);
2926 s += abs(pix1[4] - pix2[4]);
2927 s += abs(pix1[5] - pix2[5]);
2928 s += abs(pix1[6] - pix2[6]);
2929 s += abs(pix1[7] - pix2[7]);
2930 pix1 += line_size;
2931 pix2 += line_size;
2932 }
2933 return s;
2934}
2935
2936static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2937{
2938 int s, i;
2939
2940 s = 0;
2941 for(i=0;i<h;i++) {
2942 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2943 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2944 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2945 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2946 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2947 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2948 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2949 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2950 pix1 += line_size;
2951 pix2 += line_size;
2952 }
2953 return s;
2954}
2955
2956static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2957{
2958 int s, i;
2959 uint8_t *pix3 = pix2 + line_size;
2960
2961 s = 0;
2962 for(i=0;i<h;i++) {
2963 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2964 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2965 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2966 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2967 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2968 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2969 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2970 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2971 pix1 += line_size;
2972 pix2 += line_size;
2973 pix3 += line_size;
2974 }
2975 return s;
2976}
2977
2978static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2979{
2980 int s, i;
2981 uint8_t *pix3 = pix2 + line_size;
2982
2983 s = 0;
2984 for(i=0;i<h;i++) {
2985 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2986 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2987 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2988 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2989 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2990 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2991 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2992 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2993 pix1 += line_size;
2994 pix2 += line_size;
2995 pix3 += line_size;
2996 }
2997 return s;
2998}
2999
3000static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3001 MpegEncContext *c = v;
3002 int score1=0;
3003 int score2=0;
3004 int x,y;
3005
3006 for(y=0; y<h; y++){
3007 for(x=0; x<16; x++){
3008 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3009 }
3010 if(y+1<h){
3011 for(x=0; x<15; x++){
3012 score2+= FFABS( s1[x ] - s1[x +stride]
3013 - s1[x+1] + s1[x+1+stride])
3014 -FFABS( s2[x ] - s2[x +stride]
3015 - s2[x+1] + s2[x+1+stride]);
3016 }
3017 }
3018 s1+= stride;
3019 s2+= stride;
3020 }
3021
3022 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3023 else return score1 + FFABS(score2)*8;
3024}
3025
3026static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3027 MpegEncContext *c = v;
3028 int score1=0;
3029 int score2=0;
3030 int x,y;
3031
3032 for(y=0; y<h; y++){
3033 for(x=0; x<8; x++){
3034 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3035 }
3036 if(y+1<h){
3037 for(x=0; x<7; x++){
3038 score2+= FFABS( s1[x ] - s1[x +stride]
3039 - s1[x+1] + s1[x+1+stride])
3040 -FFABS( s2[x ] - s2[x +stride]
3041 - s2[x+1] + s2[x+1+stride]);
3042 }
3043 }
3044 s1+= stride;
3045 s2+= stride;
3046 }
3047
3048 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3049 else return score1 + FFABS(score2)*8;
3050}
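/* The nsse comparators implement a "noise preserving" SSE: score1 is the
 * ordinary sum of squared errors, while score2 accumulates the difference
 * in 2x2 high-frequency (texture) energy between the two blocks. Weighting
 * that term by avctx->nsse_weight (or 8 when no context is supplied)
 * penalises candidates that smooth away texture even when their plain SSE
 * looks good. */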
3051
3052static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3053 int i;
3054 unsigned int sum=0;
3055
3056 for(i=0; i<8*8; i++){
3057 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3058 int w= weight[i];
3059 b>>= RECON_SHIFT;
3060 assert(-512<b && b<512);
3061
3062 sum += (w*b)*(w*b)>>4;
3063 }
3064 return sum>>2;
3065}
3066
3067static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3068 int i;
3069
3070 for(i=0; i<8*8; i++){
3071 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3072 }
3073}
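/* try_8x8basis_c and add_8x8basis_c support the encoder's
 * rate-distortion coefficient refinement (used by the dct_quantize
 * refinement loop in mpegvideo_enc): the first returns the weighted
 * squared error that would result from adding scale * basis to the current
 * residual, the second actually applies the change. The
 * BASIS_SHIFT/RECON_SHIFT fixed-point scaling keeps the intermediate
 * products within integer range. */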
3074
3075/**
3076 * Permute an 8x8 block.
3077 * @param block the block which will be permuted according to the given permutation vector
3078 * @param permutation the permutation vector
3079 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
3080 * @param scantable the scantable in use; it is only used to speed the permutation up, the block is not
3081 * (inverse-)permuted to scantable order!
3082 */
3083void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3084{
3085 int i;
3086 DCTELEM temp[64];
3087
3088 if(last<=0) return;
3089 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3090
3091 for(i=0; i<=last; i++){
3092 const int j= scantable[i];
3093 temp[j]= block[j];
3094 block[j]=0;
3095 }
3096
3097 for(i=0; i<=last; i++){
3098 const int j= scantable[i];
3099 const int perm_j= permutation[j];
3100 block[perm_j]= temp[j];
3101 }
3102}
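/* Aside: the two passes are necessary because a permutation need not be its
 * own inverse; the first loop lifts every potentially nonzero coefficient
 * out of the block so that the second can scatter them to their permuted
 * positions without overwriting entries that have not been read yet.
 * Coefficients past 'last' in scantable order are assumed to be zero. */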
3103
3104static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
3105 return 0;
3106}
3107
3108void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3109 int i;
3110
3111 memset(cmp, 0, sizeof(void*)*6);
3112
3113 for(i=0; i<6; i++){
3114 switch(type&0xFF){
3115 case FF_CMP_SAD:
3116 cmp[i]= c->sad[i];
3117 break;
3118 case FF_CMP_SATD:
3119 cmp[i]= c->hadamard8_diff[i];
3120 break;
3121 case FF_CMP_SSE:
3122 cmp[i]= c->sse[i];
3123 break;
3124 case FF_CMP_DCT:
3125 cmp[i]= c->dct_sad[i];
3126 break;
3127 case FF_CMP_DCT264:
3128 cmp[i]= c->dct264_sad[i];
3129 break;
3130 case FF_CMP_DCTMAX:
3131 cmp[i]= c->dct_max[i];
3132 break;
3133 case FF_CMP_PSNR:
3134 cmp[i]= c->quant_psnr[i];
3135 break;
3136 case FF_CMP_BIT:
3137 cmp[i]= c->bit[i];
3138 break;
3139 case FF_CMP_RD:
3140 cmp[i]= c->rd[i];
3141 break;
3142 case FF_CMP_VSAD:
3143 cmp[i]= c->vsad[i];
3144 break;
3145 case FF_CMP_VSSE:
3146 cmp[i]= c->vsse[i];
3147 break;
3148 case FF_CMP_ZERO:
3149 cmp[i]= zero_cmp;
3150 break;
3151 case FF_CMP_NSSE:
3152 cmp[i]= c->nsse[i];
3153 break;
3154#if CONFIG_DWT
3155 case FF_CMP_W53:
3156 cmp[i]= c->w53[i];
3157 break;
3158 case FF_CMP_W97:
3159 cmp[i]= c->w97[i];
3160 break;
3161#endif
3162 default:
3163 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3164 }
3165 }
3166}
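/* A minimal usage sketch, assuming an initialized MpegEncContext; the six
 * slots are indexed by block size exactly as in the me_cmp_func tables
 * (the helper name is illustrative, not part of this file): */
static av_unused int satd16x16_sketch(MpegEncContext *s, uint8_t *cur,
                                      uint8_t *ref, int stride)
{
    me_cmp_func cmp[6];
    ff_set_cmp(&s->dsp, cmp, FF_CMP_SATD); /* cmp[0]=16x16, cmp[1]=8x8, ... */
    return cmp[0](s, cur, ref, stride, 16);
}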
3167
3168static void clear_block_c(DCTELEM *block)
3169{
3170 memset(block, 0, sizeof(DCTELEM)*64);
3171}
3172
3173/**
3174 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3175 */
3176static void clear_blocks_c(DCTELEM *blocks)
3177{
3178 memset(blocks, 0, sizeof(DCTELEM)*6*64);
3179}
3180
3181static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3182 long i;
3183 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3184 long a = *(long*)(src+i);
3185 long b = *(long*)(dst+i);
3186 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3187 }
3188 for(; i<w; i++)
3189 dst[i+0] += src[i+0];
3190}
3191
3192static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3193 long i;
3194 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3195 long a = *(long*)(src1+i);
3196 long b = *(long*)(src2+i);
3197 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3198 }
3199 for(; i<w; i++)
3200 dst[i] = src1[i]+src2[i];
3201}
3202
3203static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3204 long i;
3205#if !HAVE_FAST_UNALIGNED
3206 if((long)src2 & (sizeof(long)-1)){
3207 for(i=0; i+7<w; i+=8){
3208 dst[i+0] = src1[i+0]-src2[i+0];
3209 dst[i+1] = src1[i+1]-src2[i+1];
3210 dst[i+2] = src1[i+2]-src2[i+2];
3211 dst[i+3] = src1[i+3]-src2[i+3];
3212 dst[i+4] = src1[i+4]-src2[i+4];
3213 dst[i+5] = src1[i+5]-src2[i+5];
3214 dst[i+6] = src1[i+6]-src2[i+6];
3215 dst[i+7] = src1[i+7]-src2[i+7];
3216 }
3217 }else
3218#endif
3219 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3220 long a = *(long*)(src1+i);
3221 long b = *(long*)(src2+i);
3222 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3223 }
3224 for(; i<w; i++)
3225 dst[i+0] = src1[i+0]-src2[i+0];
3226}
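/* Aside: the word-at-a-time loops above are SWAR tricks. pb_7f and pb_80
 * replicate 0x7f/0x80 into every byte of a long; (a&pb_7f)+(b&pb_7f) sums
 * the low 7 bits of each byte with no inter-byte carry, and the final
 * ^((a^b)&pb_80) restores each byte's top bit, yielding per-byte wraparound
 * addition. One-byte check: 0xff + 0x01 gives
 * (0x7f + 0x01) ^ ((0xff ^ 0x01) & 0x80) = 0x80 ^ 0x80 = 0x00, i.e. the
 * correct result mod 256. diff_bytes_c uses the mirrored identity for
 * per-byte subtraction. */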
3227
3228static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
3229 int i;
3230 uint8_t l, lt;
3231
3232 l= *left;
3233 lt= *left_top;
3234
3235 for(i=0; i<w; i++){
3236 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
3237 lt= src1[i];
3238 dst[i]= l;
3239 }
3240
3241 *left= l;
3242 *left_top= lt;
3243}
3244
3245static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
3246 int i;
3247 uint8_t l, lt;
3248
3249 l= *left;
3250 lt= *left_top;
3251
3252 for(i=0; i<w; i++){
3253 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
3254 lt= src1[i];
3255 l= src2[i];
3256 dst[i]= l - pred;
3257 }
3258
3259 *left= l;
3260 *left_top= lt;
3261}
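/* Aside: mid_pred() comes from mathops.h and returns the median of its
 * three arguments; a plain reference version (the name is illustrative):
 */
static av_unused int mid_pred_sketch(int a, int b, int c)
{
    if (a > b) { int t = a; a = b; b = t; } /* ensure a <= b       */
    return c < a ? a : c > b ? b : c;       /* clamp c into [a, b] */
}
/* median(left, top, left + top - topleft) is the LOCO-I / JPEG-LS median
 * edge detector, which HuffYUV's median mode reuses. */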
3262
3263static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
3264 int i;
3265
3266 for(i=0; i<w-1; i++){
3267 acc+= src[i];
3268 dst[i]= acc;
3269 i++;
3270 acc+= src[i];
3271 dst[i]= acc;
3272 }
3273
3274 for(; i<w; i++){
3275 acc+= src[i];
3276 dst[i]= acc;
3277 }
3278
3279 return acc;
3280}
3281
3282#if HAVE_BIGENDIAN
3283#define B 3
3284#define G 2
3285#define R 1
3286#define A 0
3287#else
3288#define B 0
3289#define G 1
3290#define R 2
3291#define A 3
3292#endif
3293static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
3294 int i;
3295 int r,g,b,a;
3296 r= *red;
3297 g= *green;
3298 b= *blue;
3299 a= *alpha;
3300
3301 for(i=0; i<w; i++){
3302 b+= src[4*i+B];
3303 g+= src[4*i+G];
3304 r+= src[4*i+R];
3305 a+= src[4*i+A];
3306
3307 dst[4*i+B]= b;
3308 dst[4*i+G]= g;
3309 dst[4*i+R]= r;
3310 dst[4*i+A]= a;
3311 }
3312
3313 *red= r;
3314 *green= g;
3315 *blue= b;
3316 *alpha= a;
3317}
3318#undef B
3319#undef G
3320#undef R
3321#undef A
3322
3323#define BUTTERFLY2(o1,o2,i1,i2) \
3324o1= (i1)+(i2);\
3325o2= (i1)-(i2);
3326
3327#define BUTTERFLY1(x,y) \
3328{\
3329 int a,b;\
3330 a= x;\
3331 b= y;\
3332 x= a+b;\
3333 y= a-b;\
3334}
3335
3336#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3337
3338static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3339 int i;
3340 int temp[64];
3341 int sum=0;
3342
3343 assert(h==8);
3344
3345 for(i=0; i<8; i++){
3346 //FIXME try pointer walks
3347 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3348 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3349 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3350 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3351
3352 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3353 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3354 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3355 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3356
3357 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3358 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3359 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3360 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3361 }
3362
3363 for(i=0; i<8; i++){
3364 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3365 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3366 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3367 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3368
3369 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3370 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3371 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3372 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3373
3374 sum +=
3375 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3376 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3377 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3378 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3379 }
3380#if 0
3381static int maxi=0;
3382if(sum>maxi){
3383 maxi=sum;
3384 printf("MAX:%d\n", maxi);
3385}
3386#endif
3387 return sum;
3388}
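/* Aside: this is the SATD metric. The 8x8 difference block is run through a
 * two-dimensional Hadamard transform built purely from butterflies, and the
 * absolute transform coefficients are summed; BUTTERFLYA fuses the last
 * butterfly stage with the absolute-value accumulation. SATD tracks actual
 * coding cost considerably better than plain SAD at similar speed. */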
3389
3390static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3391 int i;
3392 int temp[64];
3393 int sum=0;
3394
3395 assert(h==8);
3396
3397 for(i=0; i<8; i++){
3398 //FIXME try pointer walks
3399 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3400 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3401 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3402 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3403
3404 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3405 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3406 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3407 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3408
3409 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3410 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3411 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3412 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3413 }
3414
3415 for(i=0; i<8; i++){
3416 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3417 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3418 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3419 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3420
3421 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3422 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3423 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3424 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3425
3426 sum +=
3427 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3428 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3429 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3430 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3431 }
3432
3433 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
3434
3435 return sum;
3436}
3437
3438static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3439 MpegEncContext * const s= (MpegEncContext *)c;
3440 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3441
3442 assert(h==8);
3443
3444 s->dsp.diff_pixels(temp, src1, src2, stride);
3445 s->dsp.fdct(temp);
3446 return s->dsp.sum_abs_dctelem(temp);
3447}
3448
3449#if CONFIG_GPL
3450#define DCT8_1D {\
3451 const int s07 = SRC(0) + SRC(7);\
3452 const int s16 = SRC(1) + SRC(6);\
3453 const int s25 = SRC(2) + SRC(5);\
3454 const int s34 = SRC(3) + SRC(4);\
3455 const int a0 = s07 + s34;\
3456 const int a1 = s16 + s25;\
3457 const int a2 = s07 - s34;\
3458 const int a3 = s16 - s25;\
3459 const int d07 = SRC(0) - SRC(7);\
3460 const int d16 = SRC(1) - SRC(6);\
3461 const int d25 = SRC(2) - SRC(5);\
3462 const int d34 = SRC(3) - SRC(4);\
3463 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3464 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3465 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3466 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3467 DST(0, a0 + a1 ) ;\
3468 DST(1, a4 + (a7>>2)) ;\
3469 DST(2, a2 + (a3>>1)) ;\
3470 DST(3, a5 + (a6>>2)) ;\
3471 DST(4, a0 - a1 ) ;\
3472 DST(5, a6 - (a5>>2)) ;\
3473 DST(6, (a2>>1) - a3 ) ;\
3474 DST(7, (a4>>2) - a7 ) ;\
3475}
3476
3477static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3478 MpegEncContext * const s= (MpegEncContext *)c;
3479 DCTELEM dct[8][8];
3480 int i;
3481 int sum=0;
3482
3483 s->dsp.diff_pixels(dct[0], src1, src2, stride);
3484
3485#define SRC(x) dct[i][x]
3486#define DST(x,v) dct[i][x]= v
3487 for( i = 0; i < 8; i++ )
3488 DCT8_1D
3489#undef SRC
3490#undef DST
3491
3492#define SRC(x) dct[x][i]
3493#define DST(x,v) sum += FFABS(v)
3494 for( i = 0; i < 8; i++ )
3495 DCT8_1D
3496#undef SRC
3497#undef DST
3498 return sum;
3499}
3500#endif
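/* Aside: DCT8_1D is the 8-point integer transform of H.264's 8x8 mode, with
 * all multiplications folded into shifts and adds, so dct264_sad8x8_c
 * measures residual cost in that transform domain; the block is kept under
 * CONFIG_GPL for licensing reasons. */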
3501
3502static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3503 MpegEncContext * const s= (MpegEncContext *)c;
3504 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3505 int sum=0, i;
3506
3507 assert(h==8);
3508
3509 s->dsp.diff_pixels(temp, src1, src2, stride);
3510 s->dsp.fdct(temp);
3511
3512 for(i=0; i<64; i++)
3513 sum= FFMAX(sum, FFABS(temp[i]));
3514
3515 return sum;
3516}
3517
3518static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3519 MpegEncContext * const s= (MpegEncContext *)c;
3520 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3521 DCTELEM * const bak = temp+64;
3522 int sum=0, i;
3523
3524 assert(h==8);
3525 s->mb_intra=0;
3526
3527 s->dsp.diff_pixels(temp, src1, src2, stride);
3528
3529 memcpy(bak, temp, 64*sizeof(DCTELEM));
3530
3531 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3532 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3533 ff_simple_idct(temp); //FIXME
3534
3535 for(i=0; i<64; i++)
3536 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3537
3538 return sum;
3539}
3540
3541static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3542 MpegEncContext * const s= (MpegEncContext *)c;
3543 const uint8_t *scantable= s->intra_scantable.permutated;
3544 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3545 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
3546 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
3547 int i, last, run, bits, level, distortion, start_i;
3548 const int esc_length= s->ac_esc_length;
3549 uint8_t * length;
3550 uint8_t * last_length;
3551
3552 assert(h==8);
3553
3554 copy_block8(lsrc1, src1, 8, stride, 8);
3555 copy_block8(lsrc2, src2, 8, stride, 8);
3556
3557 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3558
3559 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3560
3561 bits=0;
3562
3563 if (s->mb_intra) {
3564 start_i = 1;
3565 length = s->intra_ac_vlc_length;
3566 last_length= s->intra_ac_vlc_last_length;
3567 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3568 } else {
3569 start_i = 0;
3570 length = s->inter_ac_vlc_length;
3571 last_length= s->inter_ac_vlc_last_length;
3572 }
3573
3574 if(last>=start_i){
3575 run=0;
3576 for(i=start_i; i<last; i++){
3577 int j= scantable[i];
3578 level= temp[j];
3579
3580 if(level){
3581 level+=64;
3582 if((level&(~127)) == 0){
3583 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3584 }else
3585 bits+= esc_length;
3586 run=0;
3587 }else
3588 run++;
3589 }
3590 i= scantable[last];
3591
3592 level= temp[i] + 64;
3593
3594 assert(level - 64);
3595
3596 if((level&(~127)) == 0){
3597 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3598 }else
3599 bits+= esc_length;
3600
3601 }
3602
3603 if(last>=0){
3604 if(s->mb_intra)
3605 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3606 else
3607 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3608 }
3609
3610 s->dsp.idct_add(lsrc2, 8, temp);
3611
3612 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
3613
3614 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3615}
3616
3617static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3618 MpegEncContext * const s= (MpegEncContext *)c;
3619 const uint8_t *scantable= s->intra_scantable.permutated;
3620 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3621 int i, last, run, bits, level, start_i;
3622 const int esc_length= s->ac_esc_length;
3623 uint8_t * length;
3624 uint8_t * last_length;
3625
3626 assert(h==8);
3627
3628 s->dsp.diff_pixels(temp, src1, src2, stride);
3629
3630 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3631
3632 bits=0;
3633
3634 if (s->mb_intra) {
3635 start_i = 1;
3636 length = s->intra_ac_vlc_length;
3637 last_length= s->intra_ac_vlc_last_length;
3638 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3639 } else {
3640 start_i = 0;
3641 length = s->inter_ac_vlc_length;
3642 last_length= s->inter_ac_vlc_last_length;
3643 }
3644
3645 if(last>=start_i){
3646 run=0;
3647 for(i=start_i; i<last; i++){
3648 int j= scantable[i];
3649 level= temp[j];
3650
3651 if(level){
3652 level+=64;
3653 if((level&(~127)) == 0){
3654 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3655 }else
3656 bits+= esc_length;
3657 run=0;
3658 }else
3659 run++;
3660 }
3661 i= scantable[last];
3662
3663 level= temp[i] + 64;
3664
3665 assert(level - 64);
3666
3667 if((level&(~127)) == 0){
3668 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3669 }else
3670 bits+= esc_length;
3671 }
3672
3673 return bits;
3674}
3675
3676#define VSAD_INTRA(size) \
3677static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3678 int score=0; \
3679 int x,y; \
3680 \
3681 for(y=1; y<h; y++){ \
3682 for(x=0; x<size; x+=4){ \
3683 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
3684 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
3685 } \
3686 s+= stride; \
3687 } \
3688 \
3689 return score; \
3690}
3691VSAD_INTRA(8)
3692VSAD_INTRA(16)
3693
3694static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3695 int score=0;
3696 int x,y;
3697
3698 for(y=1; y<h; y++){
3699 for(x=0; x<16; x++){
3700 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3701 }
3702 s1+= stride;
3703 s2+= stride;
3704 }
3705
3706 return score;
3707}
3708
3709#define SQ(a) ((a)*(a))
3710#define VSSE_INTRA(size) \
3711static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3712 int score=0; \
3713 int x,y; \
3714 \
3715 for(y=1; y<h; y++){ \
3716 for(x=0; x<size; x+=4){ \
3717 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
3718 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
3719 } \
3720 s+= stride; \
3721 } \
3722 \
3723 return score; \
3724}
3725VSSE_INTRA(8)
3726VSSE_INTRA(16)
3727
3728static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3729 int score=0;
3730 int x,y;
3731
3732 for(y=1; y<h; y++){
3733 for(x=0; x<16; x++){
3734 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3735 }
3736 s1+= stride;
3737 s2+= stride;
3738 }
3739
3740 return score;
3741}
3742
3743static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
3744 int size){
3745 int score=0;
3746 int i;
3747 for(i=0; i<size; i++)
3748 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
3749 return score;
3750}
3751
3752WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3753WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3754WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3755#if CONFIG_GPL
3756WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3757#endif
3758WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3759WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3760WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3761WRAPPER8_16_SQ(bit8x8_c, bit16_c)
3762
3763static void vector_fmul_c(float *dst, const float *src, int len){
3764 int i;
3765 for(i=0; i<len; i++)
3766 dst[i] *= src[i];
3767}
3768
3769static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3770 int i;
3771 src1 += len-1;
3772 for(i=0; i<len; i++)
3773 dst[i] = src0[i] * src1[-i];
3774}
3775
3776static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
3777 int i;
3778 for(i=0; i<len; i++)
3779 dst[i] = src0[i] * src1[i] + src2[i];
3780}
3781
3782void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
3783 int i,j;
3784 dst += len;
3785 win += len;
3786 src0+= len;
3787 for(i=-len, j=len-1; i<0; i++, j--) {
3788 float s0 = src0[i];
3789 float s1 = src1[j];
3790 float wi = win[i];
3791 float wj = win[j];
3792 dst[i] = s0*wj - s1*wi + add_bias;
3793 dst[j] = s0*wi + s1*wj + add_bias;
3794 }
3795}
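/* A usage sketch for overlap-add windowing as done by MDCT-based codecs:
 * src0 holds the len samples saved from the previous frame, src1 the first
 * len samples of the current one, win is a symmetric window of 2*len
 * coefficients, and 2*len output samples are produced. Buffer names below
 * are illustrative only, assuming an initialized DSPContext: */
static av_unused void overlap_add_sketch(DSPContext *dsp, float *out,
                                         float *saved, const float *cur,
                                         const float *window, int len)
{
    dsp->vector_fmul_window(out, saved, cur, window, 0.0f, len);
    /* keep the tail of the current frame for the next call */
    memcpy(saved, cur + len, len * sizeof(float));
}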
3796
3797static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
3798 int len)
3799{
3800 int i;
3801 for (i = 0; i < len; i++)
3802 dst[i] = src[i] * mul;
3803}
3804
3805static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
3806 const float **sv, float mul, int len)
3807{
3808 int i;
3809 for (i = 0; i < len; i += 2, sv++) {
3810 dst[i ] = src[i ] * sv[0][0] * mul;
3811 dst[i+1] = src[i+1] * sv[0][1] * mul;
3812 }
3813}
3814
3815static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
3816 const float **sv, float mul, int len)
3817{
3818 int i;
3819 for (i = 0; i < len; i += 4, sv++) {
3820 dst[i ] = src[i ] * sv[0][0] * mul;
3821 dst[i+1] = src[i+1] * sv[0][1] * mul;
3822 dst[i+2] = src[i+2] * sv[0][2] * mul;
3823 dst[i+3] = src[i+3] * sv[0][3] * mul;
3824 }
3825}
3826
3827static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
3828 int len)
3829{
3830 int i;
3831 for (i = 0; i < len; i += 2, sv++) {
3832 dst[i ] = sv[0][0] * mul;
3833 dst[i+1] = sv[0][1] * mul;
3834 }
3835}
3836
3837static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
3838 int len)
3839{
3840 int i;
3841 for (i = 0; i < len; i += 4, sv++) {
3842 dst[i ] = sv[0][0] * mul;
3843 dst[i+1] = sv[0][1] * mul;
3844 dst[i+2] = sv[0][2] * mul;
3845 dst[i+3] = sv[0][3] * mul;
3846 }
3847}
3848
3849static void butterflies_float_c(float *restrict v1, float *restrict v2,
3850 int len)
3851{
3852 int i;
3853 for (i = 0; i < len; i++) {
3854 float t = v1[i] - v2[i];
3855 v1[i] += v2[i];
3856 v2[i] = t;
3857 }
3858}
3859
3860static float scalarproduct_float_c(const float *v1, const float *v2, int len)
3861{
3862 float p = 0.0;
3863 int i;
3864
3865 for (i = 0; i < len; i++)
3866 p += v1[i] * v2[i];
3867
3868 return p;
3869}
3870
3871static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
3872 int i;
3873 for(i=0; i<len; i++)
3874 dst[i] = src[i] * mul;
3875}
3876
3877static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
3878 uint32_t maxi, uint32_t maxisign)
3879{
3880
3881 if(a > mini) return mini;
3882 else if((a^(1<<31)) > maxisign) return maxi;
3883 else return a;
3884}
3885
3886static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
3887 int i;
3888 uint32_t mini = *(uint32_t*)min;
3889 uint32_t maxi = *(uint32_t*)max;
3890 uint32_t maxisign = maxi ^ (1<<31);
3891 uint32_t *dsti = (uint32_t*)dst;
3892 const uint32_t *srci = (const uint32_t*)src;
3893 for(i=0; i<len; i+=8) {
3894 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
3895 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
3896 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
3897 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
3898 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
3899 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
3900 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
3901 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
3902 }
3903}
3904static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
3905 int i;
3906 if(min < 0 && max > 0) {
3907 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
3908 } else {
3909 for(i=0; i < len; i+=8) {
3910 dst[i ] = av_clipf(src[i ], min, max);
3911 dst[i + 1] = av_clipf(src[i + 1], min, max);
3912 dst[i + 2] = av_clipf(src[i + 2], min, max);
3913 dst[i + 3] = av_clipf(src[i + 3], min, max);
3914 dst[i + 4] = av_clipf(src[i + 4], min, max);
3915 dst[i + 5] = av_clipf(src[i + 5], min, max);
3916 dst[i + 6] = av_clipf(src[i + 6], min, max);
3917 dst[i + 7] = av_clipf(src[i + 7], min, max);
3918 }
3919 }
3920}
3921
3922static av_always_inline int float_to_int16_one(const float *src){
3923 int_fast32_t tmp = *(const int32_t*)src;
3924 if(tmp & 0xf0000){
3925 tmp = (0x43c0ffff - tmp)>>31;
3926 // is this faster on some gcc/cpu combinations?
3927// if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3928// else tmp = 0;
3929 }
3930 return tmp - 0x8000;
3931}
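/* Aside: this operates on the raw IEEE-754 bit pattern, so callers are
 * expected to pre-bias their samples (cf. the add_bias argument of
 * ff_vector_fmul_window_c) into a narrow exponent range where the 16-bit
 * result sits in the low mantissa bits. For values moderately outside that
 * range, (0x43c0ffff - tmp) >> 31 evaluates to 0 below and -1 above it,
 * which after the final "- 0x8000" truncates to -32768 and 32767
 * respectively. */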
3932
3933void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
3934 int i;
3935 for(i=0; i<len; i++)
3936 dst[i] = float_to_int16_one(src+i);
3937}
3938
3939void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
3940 int i,j,c;
3941 if(channels==2){
3942 for(i=0; i<len; i++){
3943 dst[2*i] = float_to_int16_one(src[0]+i);
3944 dst[2*i+1] = float_to_int16_one(src[1]+i);
3945 }
3946 }else{
3947 for(c=0; c<channels; c++)
3948 for(i=0, j=c; i<len; i++, j+=channels)
3949 dst[j] = float_to_int16_one(src[c]+i);
3950 }
3951}
3952
3953static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
3954{
3955 int res = 0;
3956
3957 while (order--)
3958 res += (*v1++ * *v2++) >> shift;
3959
3960 return res;
3961}
3962
3963static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
3964{
3965 int res = 0;
3966 while (order--) {
3967 res += *v1 * *v2++;
3968 *v1++ += mul * *v3++;
3969 }
3970 return res;
3971}
3972
3973#define W0 2048
3974#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
3975#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
3976#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
3977#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
3978#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
3979#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
3980#define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
3981
3982static void wmv2_idct_row(short * b)
3983{
3984 int s1,s2;
3985 int a0,a1,a2,a3,a4,a5,a6,a7;
3986 /*step 1*/
3987 a1 = W1*b[1]+W7*b[7];
3988 a7 = W7*b[1]-W1*b[7];
3989 a5 = W5*b[5]+W3*b[3];
3990 a3 = W3*b[5]-W5*b[3];
3991 a2 = W2*b[2]+W6*b[6];
3992 a6 = W6*b[2]-W2*b[6];
3993 a0 = W0*b[0]+W0*b[4];
3994 a4 = W0*b[0]-W0*b[4];
3995 /*step 2*/
3996 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
3997 s2 = (181*(a1-a5-a7+a3)+128)>>8;
3998 /*step 3*/
3999 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
4000 b[1] = (a4+a6 +s1 + (1<<7))>>8;
4001 b[2] = (a4-a6 +s2 + (1<<7))>>8;
4002 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
4003 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
4004 b[5] = (a4-a6 -s2 + (1<<7))>>8;
4005 b[6] = (a4+a6 -s1 + (1<<7))>>8;
4006 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
4007}
4008static void wmv2_idct_col(short * b)
4009{
4010 int s1,s2;
4011 int a0,a1,a2,a3,a4,a5,a6,a7;
4012 /*step 1, with extended precision*/
4013 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4014 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4015 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4016 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4017 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4018 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4019 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
4020 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
4021 /*step 2*/
4022 s1 = (181*(a1-a5+a7-a3)+128)>>8;
4023 s2 = (181*(a1-a5-a7+a3)+128)>>8;
4024 /*step 3*/
4025 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4026 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
4027 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
4028 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4029
4030 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4031 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
4032 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
4033 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
4034}
4035void ff_wmv2_idct_c(short * block){
4036 int i;
4037
4038 for(i=0;i<64;i+=8){
4039 wmv2_idct_row(block+i);
4040 }
4041 for(i=0;i<8;i++){
4042 wmv2_idct_col(block+i);
4043 }
4044}
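/* Aside: the usual separable 8x8 IDCT structure: eight 1-D passes over the
 * rows, then eight over the columns. The column pass keeps three extra
 * fractional bits (the "+4 >> 3" pre-scaling noted above as extended
 * precision) so that the final "(1<<13) >> 14" rounding loses less accuracy
 * than rounding each pass to pixel precision would. */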
4045/* XXX: these functions should be removed ASAP once all IDCTs are
4046 converted */
4047static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4048{
4049 ff_wmv2_idct_c(block);
4050 put_pixels_clamped_c(block, dest, line_size);
4051}
4052static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4053{
4054 ff_wmv2_idct_c(block);
4055 add_pixels_clamped_c(block, dest, line_size);
4056}
4057static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4058{
4059 j_rev_dct (block);
4060 put_pixels_clamped_c(block, dest, line_size);
4061}
4062static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4063{
4064 j_rev_dct (block);
4065 add_pixels_clamped_c(block, dest, line_size);
4066}
4067
4068static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4069{
4070 j_rev_dct4 (block);
4071 put_pixels_clamped4_c(block, dest, line_size);
4072}
4073static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4074{
4075 j_rev_dct4 (block);
4076 add_pixels_clamped4_c(block, dest, line_size);
4077}
4078
4079static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4080{
4081 j_rev_dct2 (block);
4082 put_pixels_clamped2_c(block, dest, line_size);
4083}
4084static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4085{
4086 j_rev_dct2 (block);
4087 add_pixels_clamped2_c(block, dest, line_size);
4088}
4089
4090static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4091{
4092 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4093
4094 dest[0] = cm[(block[0] + 4)>>3];
4095}
4096static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4097{
4098 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4099
4100 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4101}
4102
4103static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4104
4105/* init static data */
4106av_cold void dsputil_static_init(void)
4107{
4108 int i;
4109
4110 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4111 for(i=0;i<MAX_NEG_CROP;i++) {
4112 ff_cropTbl[i] = 0;
4113 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4114 }
4115
4116 for(i=0;i<512;i++) {
4117 ff_squareTbl[i] = (i - 256) * (i - 256);
4118 }
4119
4120 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4121}
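/* A minimal sketch of how the crop table is consumed (the helper name is
 * illustrative): indexing at an offset of MAX_NEG_CROP turns clamping to
 * 0..255 into a single table load, as in the idct1 functions above. */
static av_unused uint8_t crop_sketch(int x)
{
    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    return cm[x]; /* equals av_clip_uint8(x) for |x| <= MAX_NEG_CROP */
}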
4122
4123int ff_check_alignment(void){
4124 static int did_fail=0;
4125 DECLARE_ALIGNED(16, int, aligned);
4126
4127 if((intptr_t)&aligned & 15){
4128 if(!did_fail){
4129#if HAVE_MMX || HAVE_ALTIVEC
4130 av_log(NULL, AV_LOG_ERROR,
4131 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4132 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4133 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4134 "Do not report crashes to FFmpeg developers.\n");
4135#endif
4136 did_fail=1;
4137 }
4138 return -1;
4139 }
4140 return 0;
4141}
4142
4143av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4144{
4145 int i;
4146
4147 ff_check_alignment();
4148
4149#if CONFIG_ENCODERS
4150 if(avctx->dct_algo==FF_DCT_FASTINT) {
4151 c->fdct = fdct_ifast;
4152 c->fdct248 = fdct_ifast248;
4153 }
4154 else if(avctx->dct_algo==FF_DCT_FAAN) {
4155 c->fdct = ff_faandct;
4156 c->fdct248 = ff_faandct248;
4157 }
4158 else {
4159 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4160 c->fdct248 = ff_fdct248_islow;
4161 }
4162#endif //CONFIG_ENCODERS
4163
4164 if(avctx->lowres==1){
4165 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4166 c->idct_put= ff_jref_idct4_put;
4167 c->idct_add= ff_jref_idct4_add;
4168 }else{
4169 c->idct_put= ff_h264_lowres_idct_put_c;
4170 c->idct_add= ff_h264_lowres_idct_add_c;
4171 }
4172 c->idct = j_rev_dct4;
4173 c->idct_permutation_type= FF_NO_IDCT_PERM;
4174 }else if(avctx->lowres==2){
4175 c->idct_put= ff_jref_idct2_put;
4176 c->idct_add= ff_jref_idct2_add;
4177 c->idct = j_rev_dct2;
4178 c->idct_permutation_type= FF_NO_IDCT_PERM;
4179 }else if(avctx->lowres==3){
4180 c->idct_put= ff_jref_idct1_put;
4181 c->idct_add= ff_jref_idct1_add;
4182 c->idct = j_rev_dct1;
4183 c->idct_permutation_type= FF_NO_IDCT_PERM;
4184 }else{
4185 if(avctx->idct_algo==FF_IDCT_INT){
4186 c->idct_put= ff_jref_idct_put;
4187 c->idct_add= ff_jref_idct_add;
4188 c->idct = j_rev_dct;
4189 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4190 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4191 avctx->idct_algo==FF_IDCT_VP3){
4192 c->idct_put= ff_vp3_idct_put_c;
4193 c->idct_add= ff_vp3_idct_add_c;
4194 c->idct = ff_vp3_idct_c;
4195 c->idct_permutation_type= FF_NO_IDCT_PERM;
4196 }else if(avctx->idct_algo==FF_IDCT_WMV2){
4197 c->idct_put= ff_wmv2_idct_put_c;
4198 c->idct_add= ff_wmv2_idct_add_c;
4199 c->idct = ff_wmv2_idct_c;
4200 c->idct_permutation_type= FF_NO_IDCT_PERM;
4201 }else if(avctx->idct_algo==FF_IDCT_FAAN){
4202 c->idct_put= ff_faanidct_put;
4203 c->idct_add= ff_faanidct_add;
4204 c->idct = ff_faanidct;
4205 c->idct_permutation_type= FF_NO_IDCT_PERM;
4206 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4207 c->idct_put= ff_ea_idct_put_c;
4208 c->idct_permutation_type= FF_NO_IDCT_PERM;
4209 }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
4210 c->idct = ff_bink_idct_c;
4211 c->idct_add = ff_bink_idct_add_c;
4212 c->idct_put = ff_bink_idct_put_c;
4213 c->idct_permutation_type = FF_NO_IDCT_PERM;
4214 }else{ //accurate/default
4215 c->idct_put= ff_simple_idct_put;
4216 c->idct_add= ff_simple_idct_add;
4217 c->idct = ff_simple_idct;
4218 c->idct_permutation_type= FF_NO_IDCT_PERM;
4219 }
4220 }
4221
4222 c->get_pixels = get_pixels_c;
4223 c->diff_pixels = diff_pixels_c;
4224 c->put_pixels_clamped = put_pixels_clamped_c;
4225 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4226 c->put_pixels_nonclamped = put_pixels_nonclamped_c;
4227 c->add_pixels_clamped = add_pixels_clamped_c;
4228 c->add_pixels8 = add_pixels8_c;
4229 c->add_pixels4 = add_pixels4_c;
4230 c->sum_abs_dctelem = sum_abs_dctelem_c;
4231 c->gmc1 = gmc1_c;
4232 c->gmc = ff_gmc_c;
4233 c->clear_block = clear_block_c;
4234 c->clear_blocks = clear_blocks_c;
4235 c->pix_sum = pix_sum_c;
4236 c->pix_norm1 = pix_norm1_c;
4237
4238 c->fill_block_tab[0] = fill_block16_c;
4239 c->fill_block_tab[1] = fill_block8_c;
4240 c->scale_block = scale_block_c;
4241
4242 /* TODO [0] 16 [1] 8 */
4243 c->pix_abs[0][0] = pix_abs16_c;
4244 c->pix_abs[0][1] = pix_abs16_x2_c;
4245 c->pix_abs[0][2] = pix_abs16_y2_c;
4246 c->pix_abs[0][3] = pix_abs16_xy2_c;
4247 c->pix_abs[1][0] = pix_abs8_c;
4248 c->pix_abs[1][1] = pix_abs8_x2_c;
4249 c->pix_abs[1][2] = pix_abs8_y2_c;
4250 c->pix_abs[1][3] = pix_abs8_xy2_c;
4251
4252#define dspfunc(PFX, IDX, NUM) \
4253 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
4254 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
4255 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
4256 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4257
4258 dspfunc(put, 0, 16);
4259 dspfunc(put_no_rnd, 0, 16);
4260 dspfunc(put, 1, 8);
4261 dspfunc(put_no_rnd, 1, 8);
4262 dspfunc(put, 2, 4);
4263 dspfunc(put, 3, 2);
4264
4265 dspfunc(avg, 0, 16);
4266 dspfunc(avg_no_rnd, 0, 16);
4267 dspfunc(avg, 1, 8);
4268 dspfunc(avg_no_rnd, 1, 8);
4269 dspfunc(avg, 2, 4);
4270 dspfunc(avg, 3, 2);
4271#undef dspfunc
4272
4273 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4274 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4275
4276 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4277 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4278 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4279 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4280 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4281 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4282 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4283 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4284 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4285
4286 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4287 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4288 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4289 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4290 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4291 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4292 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4293 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4294 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4295
4296#define dspfunc(PFX, IDX, NUM) \
4297 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4298 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4299 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4300 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4301 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4302 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4303 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4304 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4305 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4306 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4307 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4308 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4309 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4310 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4311 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4312 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4313
4314 dspfunc(put_qpel, 0, 16);
4315 dspfunc(put_no_rnd_qpel, 0, 16);
4316
4317 dspfunc(avg_qpel, 0, 16);
4318 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4319
4320 dspfunc(put_qpel, 1, 8);
4321 dspfunc(put_no_rnd_qpel, 1, 8);
4322
4323 dspfunc(avg_qpel, 1, 8);
4324 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4325
4326 dspfunc(put_h264_qpel, 0, 16);
4327 dspfunc(put_h264_qpel, 1, 8);
4328 dspfunc(put_h264_qpel, 2, 4);
4329 dspfunc(put_h264_qpel, 3, 2);
4330 dspfunc(avg_h264_qpel, 0, 16);
4331 dspfunc(avg_h264_qpel, 1, 8);
4332 dspfunc(avg_h264_qpel, 2, 4);
4333
4334#undef dspfunc
4335 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4336 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4337 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4338 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4339 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4340 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4341 c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
4342 c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;
4343
4344 c->draw_edges = draw_edges_c;
4345
4346#if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4347 ff_mlp_init(c, avctx);
4348#endif
4349#if CONFIG_VC1_DECODER
4350 ff_vc1dsp_init(c,avctx);
4351#endif
4352#if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4353 ff_intrax8dsp_init(c,avctx);
4354#endif
4355#if CONFIG_RV30_DECODER
4356 ff_rv30dsp_init(c,avctx);
4357#endif
4358#if CONFIG_RV40_DECODER
4359 ff_rv40dsp_init(c,avctx);
4360 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4361 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4362 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4363 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4364#endif
4365
4366 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4367 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4368 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4369 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4370 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4371 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4372 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4373 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4374
4375#define SET_CMP_FUNC(name) \
4376 c->name[0]= name ## 16_c;\
4377 c->name[1]= name ## 8x8_c;
4378
4379 SET_CMP_FUNC(hadamard8_diff)
4380 c->hadamard8_diff[4]= hadamard8_intra16_c;
4381 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4382 SET_CMP_FUNC(dct_sad)
4383 SET_CMP_FUNC(dct_max)
4384#if CONFIG_GPL
4385 SET_CMP_FUNC(dct264_sad)
4386#endif
4387 c->sad[0]= pix_abs16_c;
4388 c->sad[1]= pix_abs8_c;
4389 c->sse[0]= sse16_c;
4390 c->sse[1]= sse8_c;
4391 c->sse[2]= sse4_c;
4392 SET_CMP_FUNC(quant_psnr)
4393 SET_CMP_FUNC(rd)
4394 SET_CMP_FUNC(bit)
4395 c->vsad[0]= vsad16_c;
4396 c->vsad[4]= vsad_intra16_c;
4397 c->vsad[5]= vsad_intra8_c;
4398 c->vsse[0]= vsse16_c;
4399 c->vsse[4]= vsse_intra16_c;
4400 c->vsse[5]= vsse_intra8_c;
4401 c->nsse[0]= nsse16_c;
4402 c->nsse[1]= nsse8_c;
4403#if CONFIG_DWT
4404 ff_dsputil_init_dwt(c);
4405#endif
4406
4407 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4408
4409 c->add_bytes= add_bytes_c;
4410 c->add_bytes_l2= add_bytes_l2_c;
4411 c->diff_bytes= diff_bytes_c;
4412 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4413 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4414 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
4415 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
4416 c->bswap_buf= bswap_buf;
4417#if CONFIG_PNG_DECODER
4418 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4419#endif
4420
4421 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
4422 c->h263_h_loop_filter= h263_h_loop_filter_c;
4423 c->h263_v_loop_filter= h263_v_loop_filter_c;
4424 }
4425
4426 if (CONFIG_VP3_DECODER) {
4427 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4428 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4429 c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
4430 }
4431 if (CONFIG_VP6_DECODER) {
4432 c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
4433 }
4434
4435 c->h261_loop_filter= h261_loop_filter_c;
4436
4437 c->try_8x8basis= try_8x8basis_c;
4438 c->add_8x8basis= add_8x8basis_c;
4439
4440#if CONFIG_VORBIS_DECODER
4441 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4442#endif
4443#if CONFIG_AC3_DECODER
4444 c->ac3_downmix = ff_ac3_downmix_c;
4445#endif
4446#if CONFIG_LPC
4447 c->lpc_compute_autocorr = ff_lpc_compute_autocorr;
4448#endif
4449 c->vector_fmul = vector_fmul_c;
4450 c->vector_fmul_reverse = vector_fmul_reverse_c;
4451 c->vector_fmul_add = vector_fmul_add_c;
4452 c->vector_fmul_window = ff_vector_fmul_window_c;
4453 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4454 c->vector_clipf = vector_clipf_c;
4455 c->float_to_int16 = ff_float_to_int16_c;
4456 c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4457 c->scalarproduct_int16 = scalarproduct_int16_c;
4458 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
4459 c->scalarproduct_float = scalarproduct_float_c;
4460 c->butterflies_float = butterflies_float_c;
4461 c->vector_fmul_scalar = vector_fmul_scalar_c;
4462
4463 c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
4464 c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
4465
4466 c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
4467 c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
4468
4469 c->shrink[0]= ff_img_copy_plane;
4470 c->shrink[1]= ff_shrink22;
4471 c->shrink[2]= ff_shrink44;
4472 c->shrink[3]= ff_shrink88;
4473
4474 c->prefetch= just_return;
4475
4476 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4477 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4478
4479 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
4480 if (ARCH_ARM) dsputil_init_arm (c, avctx);
4481 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
4482 if (HAVE_VIS) dsputil_init_vis (c, avctx);
4483 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
4484 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
4485 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
4486 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
4487 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
4488
4489 for(i=0; i<64; i++){
4490 if(!c->put_2tap_qpel_pixels_tab[0][i])
4491 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4492 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4493 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4494 }
4495
4496 c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4497 c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4498 c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4499 c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
4500
4501 c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4502 c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4503 c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4504 c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
4505
4506 switch(c->idct_permutation_type){
4507 case FF_NO_IDCT_PERM:
4508 for(i=0; i<64; i++)
4509 c->idct_permutation[i]= i;
4510 break;
4511 case FF_LIBMPEG2_IDCT_PERM:
4512 for(i=0; i<64; i++)
4513 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4514 break;
4515 case FF_SIMPLE_IDCT_PERM:
4516 for(i=0; i<64; i++)
4517 c->idct_permutation[i]= simple_mmx_permutation[i];
4518 break;
4519 case FF_TRANSPOSE_IDCT_PERM:
4520 for(i=0; i<64; i++)
4521 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4522 break;
4523 case FF_PARTTRANS_IDCT_PERM:
4524 for(i=0; i<64; i++)
4525 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4526 break;
4527 case FF_SSE2_IDCT_PERM:
4528 for(i=0; i<64; i++)
4529 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4530 break;
4531 default:
4532 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4533 }
4534}
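/* Aside: the permutation maps raster coefficient positions to whatever
 * layout the selected IDCT expects. E.g. FF_TRANSPOSE_IDCT_PERM sends
 * (row,col) to (col,row): i = 10 (row 1, col 2) maps to
 * ((10&7)<<3)|(10>>3) = 17 (row 2, col 1). Scantables are passed through
 * idct_permutation once in ff_init_scantable(), so the coefficient decode
 * loops need no extra per-sample work. */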
4535
diff --git a/apps/codecs/libwmavoice/dsputil.h b/apps/codecs/libwmavoice/dsputil.h
deleted file mode 100644
index 9ef0270ade..0000000000
--- a/apps/codecs/libwmavoice/dsputil.h
+++ /dev/null
@@ -1,800 +0,0 @@
1/*
2 * DSP utils
3 * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23/**
24 * @file
25 * DSP utils.
26 * Note: many functions in here may use MMX, which trashes the FPU state; it is
27 * absolutely necessary to call emms_c() between DSP and float/double code.
28 */
29
30#ifndef AVCODEC_DSPUTIL_H
31#define AVCODEC_DSPUTIL_H
32
33#include "libavutil/intreadwrite.h"
34#include "avcodec.h"
35
36
37//#define DEBUG
38/* dct code */
39typedef short DCTELEM;
40
41void fdct_ifast (DCTELEM *data);
42void fdct_ifast248 (DCTELEM *data);
43void ff_jpeg_fdct_islow (DCTELEM *data);
44void ff_fdct248_islow (DCTELEM *data);
45
46void j_rev_dct (DCTELEM *data);
47void j_rev_dct4 (DCTELEM *data);
48void j_rev_dct2 (DCTELEM *data);
49void j_rev_dct1 (DCTELEM *data);
50void ff_wmv2_idct_c(DCTELEM *data);
51
52void ff_fdct_mmx(DCTELEM *block);
53void ff_fdct_mmx2(DCTELEM *block);
54void ff_fdct_sse2(DCTELEM *block);
55
56void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride);
57void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride);
58void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
59void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
60void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block);
61void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block);
62void ff_h264_idct_add16_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
63void ff_h264_idct_add16intra_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
64void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
65void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
66
67void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1,
68 const float *win, float add_bias, int len);
69void ff_float_to_int16_c(int16_t *dst, const float *src, long len);
70void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels);
71
72/* encoding scans */
73extern const uint8_t ff_alternate_horizontal_scan[64];
74extern const uint8_t ff_alternate_vertical_scan[64];
75extern const uint8_t ff_zigzag_direct[64];
76extern const uint8_t ff_zigzag248_direct[64];
77
78/* pixel operations */
79#define MAX_NEG_CROP 1024
80
81/* temporary */
82extern uint32_t ff_squareTbl[512];
83extern uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP];
84
85/* VP3 DSP functions */
86void ff_vp3_idct_c(DCTELEM *block/* align 16*/);
87void ff_vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
88void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
89void ff_vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/);
90
91void ff_vp3_v_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
92void ff_vp3_h_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
93
94/* VP6 DSP functions */
95void ff_vp6_filter_diag4_c(uint8_t *dst, uint8_t *src, int stride,
96 const int16_t *h_weights, const int16_t *v_weights);
97
98/* Bink functions */
99void ff_bink_idct_c (DCTELEM *block);
100void ff_bink_idct_add_c(uint8_t *dest, int linesize, DCTELEM *block);
101void ff_bink_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
102
103/* CAVS functions */
104void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride);
105void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride);
106void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride);
107void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride);
108
109/* VC1 functions */
110void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd);
111void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd);
112
113/* EA functions */
114void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
115
116/* 1/2^n downscaling functions from imgconvert.c */
117void ff_img_copy_plane(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
118void ff_shrink22(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
119void ff_shrink44(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
120void ff_shrink88(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
121
122void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
123 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
124
125/* Minimum alignment rules ;)
126If you notice errors in the alignment annotations, need more alignment for
127some ASM code on some CPU, or need to use a function with less-aligned data,
128then send a mail to the ffmpeg-devel mailing list, ...
129
130!warning These alignments might not match reality (attribute((aligned))
131annotations may be missing somewhere).
132I (Michael) did not verify them; these are just the alignments which I think
133could be reached easily ...
134
135!future video codecs might need functions with less strict alignment
136*/
137
138/*
139void get_pixels_c(DCTELEM *block, const uint8_t *pixels, int line_size);
140void diff_pixels_c(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride);
141void put_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size);
142void add_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size);
143void clear_blocks_c(DCTELEM *blocks);
144*/
145
146/* add and put pixel (decoding) */
147// blocksizes for op_pixels_func are 8x4, 8x8, 16x8 and 16x16
148//h for op_pixels_func is limited to {width/2, width} but never larger than 16 and never smaller than 4
149typedef void (*op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int h);
150typedef void (*tpel_mc_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int w, int h);
151typedef void (*qpel_mc_func)(uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);
152typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
153
154typedef void (*op_fill_func)(uint8_t *block/*align width (8 or 16)*/, uint8_t value, int line_size, int h);
155
156#define DEF_OLD_QPEL(name)\
157void ff_put_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
158void ff_put_no_rnd_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
159void ff_avg_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);
160
161DEF_OLD_QPEL(qpel16_mc11_old_c)
162DEF_OLD_QPEL(qpel16_mc31_old_c)
163DEF_OLD_QPEL(qpel16_mc12_old_c)
164DEF_OLD_QPEL(qpel16_mc32_old_c)
165DEF_OLD_QPEL(qpel16_mc13_old_c)
166DEF_OLD_QPEL(qpel16_mc33_old_c)
167DEF_OLD_QPEL(qpel8_mc11_old_c)
168DEF_OLD_QPEL(qpel8_mc31_old_c)
169DEF_OLD_QPEL(qpel8_mc12_old_c)
170DEF_OLD_QPEL(qpel8_mc32_old_c)
171DEF_OLD_QPEL(qpel8_mc13_old_c)
172DEF_OLD_QPEL(qpel8_mc33_old_c)
173
174#define CALL_2X_PIXELS(a, b, n)\
175static void a(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
176 b(block , pixels , line_size, h);\
177 b(block+n, pixels+n, line_size, h);\
178}
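/* A usage sketch: dsputil.c builds many of its 16-pixel-wide operations
 * from the 8-wide ones this way, e.g.
 *     CALL_2X_PIXELS(avg_pixels16_x2_c, avg_pixels8_x2_c, 8)
 * calls the 8-wide function twice, offset by 8 pixels horizontally. */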
179
180/* motion estimation */
181// h is limited to {width/2, width, 2*width} but never larger than 16 and never smaller than 2
182// although currently h<4 is not used as functions with width <8 are neither used nor implemented
183typedef int (*me_cmp_func)(void /*MpegEncContext*/ *s, uint8_t *blk1/*align width (8 or 16)*/, uint8_t *blk2/*align 1*/, int line_size, int h)/* __attribute__ ((const))*/;
184
185/**
186 * Scantable.
187 */
188typedef struct ScanTable{
189 const uint8_t *scantable;
190 uint8_t permutated[64];
191 uint8_t raster_end[64];
192#if ARCH_PPC
193 /** Used by dct_quantize_altivec to find last-non-zero */
194 DECLARE_ALIGNED(16, uint8_t, inverse)[64];
195#endif
196} ScanTable;
197
198void ff_init_scantable(uint8_t *, ScanTable *st, const uint8_t *src_scantable);
199
200void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize,
201 int block_w, int block_h,
202 int src_x, int src_y, int w, int h);
203
204/**
205 * DSPContext.
206 */
207typedef struct DSPContext {
208 /* pixel ops : interface with DCT */
209 void (*get_pixels)(DCTELEM *block/*align 16*/, const uint8_t *pixels/*align 8*/, int line_size);
210 void (*diff_pixels)(DCTELEM *block/*align 16*/, const uint8_t *s1/*align 8*/, const uint8_t *s2/*align 8*/, int stride);
211 void (*put_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
212 void (*put_signed_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
213 void (*put_pixels_nonclamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
214 void (*add_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
215 void (*add_pixels8)(uint8_t *pixels, DCTELEM *block, int line_size);
216 void (*add_pixels4)(uint8_t *pixels, DCTELEM *block, int line_size);
217 int (*sum_abs_dctelem)(DCTELEM *block/*align 16*/);
218 /**
219 * translational global motion compensation.
220 */
221 void (*gmc1)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x16, int y16, int rounder);
222 /**
223 * global motion compensation.
224 */
225 void (*gmc )(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int ox, int oy,
226 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
227 void (*clear_block)(DCTELEM *block/*align 16*/);
228 void (*clear_blocks)(DCTELEM *blocks/*align 16*/);
229 int (*pix_sum)(uint8_t * pix, int line_size);
230 int (*pix_norm1)(uint8_t * pix, int line_size);
231// 16x16 8x8 4x4 2x2 16x8 8x4 4x2 8x16 4x8 2x4
232
233 me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */
234 me_cmp_func sse[6];
235 me_cmp_func hadamard8_diff[6];
236 me_cmp_func dct_sad[6];
237 me_cmp_func quant_psnr[6];
238 me_cmp_func bit[6];
239 me_cmp_func rd[6];
240 me_cmp_func vsad[6];
241 me_cmp_func vsse[6];
242 me_cmp_func nsse[6];
243 me_cmp_func w53[6];
244 me_cmp_func w97[6];
245 me_cmp_func dct_max[6];
246 me_cmp_func dct264_sad[6];
247
248 me_cmp_func me_pre_cmp[6];
249 me_cmp_func me_cmp[6];
250 me_cmp_func me_sub_cmp[6];
251 me_cmp_func mb_cmp[6];
252 me_cmp_func ildct_cmp[6]; //only width 16 used
253 me_cmp_func frame_skip_cmp[6]; //only width 8 used
254
255 int (*ssd_int8_vs_int16)(const int8_t *pix1, const int16_t *pix2,
256 int size);
257
258 /**
259 * Halfpel motion compensation with rounding (a+b+1)>>1.
260 * this is an array[4][4] of motion compensation functions for 4
261 * horizontal blocksizes (16,8,4,2) and the 4 halfpel positions<br>
262 * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
263 * @param block destination where the result is stored
264 * @param pixels source
265 * @param line_size number of bytes in a horizontal line of block
266 * @param h height
267 */
268 op_pixels_func put_pixels_tab[4][4];
269
270 /**
271 * Halfpel motion compensation with rounding (a+b+1)>>1.
272 * This is an array[4][4] of motion compensation functions for 4
273 * horizontal blocksizes (16,8,4,2) and the 4 halfpel positions<br>
274 * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
275 * @param block destination into which the result is averaged (a+b+1)>>1
276 * @param pixels source
277 * @param line_size number of bytes in a horizontal line of block
278 * @param h height
279 */
280 op_pixels_func avg_pixels_tab[4][4];
281
282 /**
283 * Halfpel motion compensation with no rounding (a+b)>>1.
284 * This is declared as an array[4][4] of motion compensation functions,
285 * but only the 2 horizontal blocksizes (16,8) and the 4 halfpel positions are used<br>
286 * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
287 * @param block destination where the result is stored
288 * @param pixels source
289 * @param line_size number of bytes in a horizontal line of block
290 * @param h height
291 */
292 op_pixels_func put_no_rnd_pixels_tab[4][4];
293
294 /**
295 * Halfpel motion compensation with no rounding (a+b)>>1.
296 * This is declared as an array[4][4] of motion compensation functions,
297 * but only the 2 horizontal blocksizes (16,8) and the 4 halfpel positions are used<br>
298 * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
299 * @param block destination into which the result is averaged (a+b)>>1
300 * @param pixels source
301 * @param line_size number of bytes in a horizontal line of block
302 * @param h height
303 */
304 op_pixels_func avg_no_rnd_pixels_tab[4][4];
305
306 void (*put_no_rnd_pixels_l2[2])(uint8_t *block/*align width (8 or 16)*/, const uint8_t *a/*align 1*/, const uint8_t *b/*align 1*/, int line_size, int h);
307
308 /**
309 * Thirdpel motion compensation with rounding (a+b+1)>>1.
310 * This is an array[11] of motion compensation functions for the 9 thirdpel
311 * positions<br>
312 * *pixels_tab[ xthirdpel + 4*ythirdpel ]
313 * @param block destination where the result is stored
314 * @param pixels source
315 * @param line_size number of bytes in a horizontal line of block
316 * @param h height
317 */
318 tpel_mc_func put_tpel_pixels_tab[11]; //FIXME individual func ptr per width?
319 tpel_mc_func avg_tpel_pixels_tab[11]; //FIXME individual func ptr per width?
320
321 qpel_mc_func put_qpel_pixels_tab[2][16];
322 qpel_mc_func avg_qpel_pixels_tab[2][16];
323 qpel_mc_func put_no_rnd_qpel_pixels_tab[2][16];
324 qpel_mc_func avg_no_rnd_qpel_pixels_tab[2][16];
325 qpel_mc_func put_mspel_pixels_tab[8];
326
327 /**
328 * h264 Chroma MC
329 */
330 h264_chroma_mc_func put_h264_chroma_pixels_tab[3];
331 h264_chroma_mc_func avg_h264_chroma_pixels_tab[3];
332 /* Really, only one of these functions is used in VC-1 decoding */
333 h264_chroma_mc_func put_no_rnd_vc1_chroma_pixels_tab[3];
334 h264_chroma_mc_func avg_no_rnd_vc1_chroma_pixels_tab[3];
335
336 qpel_mc_func put_h264_qpel_pixels_tab[4][16];
337 qpel_mc_func avg_h264_qpel_pixels_tab[4][16];
338
339 qpel_mc_func put_2tap_qpel_pixels_tab[4][16];
340 qpel_mc_func avg_2tap_qpel_pixels_tab[4][16];
341
342 me_cmp_func pix_abs[2][4];
343
344 /* huffyuv specific */
345 void (*add_bytes)(uint8_t *dst/*align 16*/, uint8_t *src/*align 16*/, int w);
346 void (*add_bytes_l2)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 16*/, int w);
347 void (*diff_bytes)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 1*/,int w);
348 /**
349 * Subtract HuffYUV's variant of median prediction.
350 * Note: this might read from src1[-1], src2[-1].
351 */
352 void (*sub_hfyu_median_prediction)(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top);
353 void (*add_hfyu_median_prediction)(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
354 int (*add_hfyu_left_prediction)(uint8_t *dst, const uint8_t *src, int w, int left);
355 void (*add_hfyu_left_prediction_bgr32)(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha);
356 /* this might write to dst[w] */
357 void (*add_png_paeth_prediction)(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
358 void (*bswap_buf)(uint32_t *dst, const uint32_t *src, int w);
359
360 void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale);
361 void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale);
362
363 void (*h261_loop_filter)(uint8_t *src, int stride);
364
365 void (*x8_v_loop_filter)(uint8_t *src, int stride, int qscale);
366 void (*x8_h_loop_filter)(uint8_t *src, int stride, int qscale);
367
368 void (*vp3_idct_dc_add)(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/);
369 void (*vp3_v_loop_filter)(uint8_t *src, int stride, int *bounding_values);
370 void (*vp3_h_loop_filter)(uint8_t *src, int stride, int *bounding_values);
371
372 void (*vp6_filter_diag4)(uint8_t *dst, uint8_t *src, int stride,
373 const int16_t *h_weights,const int16_t *v_weights);
374
375 /* assume len is a multiple of 4, and arrays are 16-byte aligned */
376 void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize);
377 void (*ac3_downmix)(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
378 /* no alignment needed */
379 void (*lpc_compute_autocorr)(const int32_t *data, int len, int lag, double *autoc);
380 /* assume len is a multiple of 8, and arrays are 16-byte aligned */
381 void (*vector_fmul)(float *dst, const float *src, int len);
382 void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len);
383 /* assume len is a multiple of 8, and src arrays are 16-byte aligned */
384 void (*vector_fmul_add)(float *dst, const float *src0, const float *src1, const float *src2, int len);
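 /* Semantics sketch added for clarity (scalar reference, not the optimized code):
 * vector_fmul: dst[i] *= src[i]
 * vector_fmul_reverse: dst[i] = src0[i] * src1[len-1-i]
 * vector_fmul_add: dst[i] = src0[i]*src1[i] + src2[i] */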
385 /* assume len is a multiple of 4, and arrays are 16-byte aligned */
386 void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len);
387 /* assume len is a multiple of 8, and arrays are 16-byte aligned */
388 void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
389 void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
390 /**
391 * Multiply a vector of floats by a scalar float. Source and
392 * destination vectors must overlap exactly or not at all.
393 * @param dst result vector, 16-byte aligned
394 * @param src input vector, 16-byte aligned
395 * @param mul scalar value
396 * @param len length of vector, multiple of 4
397 */
398 void (*vector_fmul_scalar)(float *dst, const float *src, float mul,
399 int len);
400 /**
401 * Multiply a vector of floats by concatenated short vectors of
402 * floats and by a scalar float. Source and destination vectors
403 * must overlap exactly or not at all.
404 * [0]: short vectors of length 2, 8-byte aligned
405 * [1]: short vectors of length 4, 16-byte aligned
406 * @param dst output vector, 16-byte aligned
407 * @param src input vector, 16-byte aligned
408 * @param sv array of pointers to short vectors
409 * @param mul scalar value
410 * @param len number of elements in src and dst, multiple of 4
411 */
412 void (*vector_fmul_sv_scalar[2])(float *dst, const float *src,
413 const float **sv, float mul, int len);
414 /**
415 * Multiply short vectors of floats by a scalar float, store
416 * concatenated result.
417 * [0]: short vectors of length 2, 8-byte aligned
418 * [1]: short vectors of length 4, 16-byte aligned
419 * @param dst output vector, 16-byte aligned
420 * @param sv array of pointers to short vectors
421 * @param mul scalar value
422 * @param len number of output elements, multiple of 4
423 */
424 void (*sv_fmul_scalar[2])(float *dst, const float **sv,
425 float mul, int len);
426 /**
427 * Calculate the scalar product of two vectors of floats.
428 * @param v1 first vector, 16-byte aligned
429 * @param v2 second vector, 16-byte aligned
430 * @param len length of vectors, multiple of 4
431 */
432 float (*scalarproduct_float)(const float *v1, const float *v2, int len);
433 /**
434 * Calculate the sum and difference of two vectors of floats.
435 * @param v1 first input vector, sum output, 16-byte aligned
436 * @param v2 second input vector, difference output, 16-byte aligned
437 * @param len length of vectors, multiple of 4
438 */
439 void (*butterflies_float)(float *restrict v1, float *restrict v2, int len);
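 /* i.e. per element (semantics sketch): t = v1[i] - v2[i]; v1[i] += v2[i]; v2[i] = t; */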
440
441 /* C version: convert floats from the range [384.0,386.0] to ints in [-32768,32767];
442 * SIMD versions: convert floats from [-32768.0,32767.0] without rescaling; arrays are 16-byte aligned */
443 void (*float_to_int16)(int16_t *dst, const float *src, long len);
444 void (*float_to_int16_interleave)(int16_t *dst, const float **src, long len, int channels);
445
446 /* (I)DCT */
447 void (*fdct)(DCTELEM *block/* align 16*/);
448 void (*fdct248)(DCTELEM *block/* align 16*/);
449
450 /* IDCT really */
451 void (*idct)(DCTELEM *block/* align 16*/);
452
453 /**
454 * block -> idct -> clip to unsigned 8 bit -> dest.
455 * (-1392, 0, 0, ...) -> idct -> (-174, -174, ...) -> put -> (0, 0, ...)
456 * @param line_size size in bytes of a horizontal line of dest
457 */
458 void (*idct_put)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
459
460 /**
461 * block -> idct -> add dest -> clip to unsigned 8 bit -> dest.
462 * @param line_size size in bytes of a horizontal line of dest
463 */
464 void (*idct_add)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
465
466 /**
467 * IDCT input permutation.
468 * Several optimized IDCTs need a permuted input (relative to the normal order of the reference
469 * IDCT); this permutation must be performed before idct_put/idct_add.
470 * Note that it can normally be merged with the zigzag/alternate scan<br>
471 * An example to avoid confusion:
472 * - (->decode coeffs -> zigzag reorder -> dequant -> reference idct ->...)
473 * - (x -> reference dct -> reference idct -> x)
474 * - (x -> reference dct -> simple_mmx_perm = idct_permutation -> simple_idct_mmx -> x)
475 * - (->decode coeffs -> zigzag reorder -> simple_mmx_perm -> dequant -> simple_idct_mmx ->...)
476 * (a usage sketch follows the FF_*_IDCT_PERM defines below)
477 */
478 uint8_t idct_permutation[64];
479 int idct_permutation_type;
480#define FF_NO_IDCT_PERM 1
481#define FF_LIBMPEG2_IDCT_PERM 2
482#define FF_SIMPLE_IDCT_PERM 3
483#define FF_TRANSPOSE_IDCT_PERM 4
484#define FF_PARTTRANS_IDCT_PERM 5
485#define FF_SSE2_IDCT_PERM 6
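 /* Usage sketch referenced above (hypothetical decoder loop; names assumed):
 * j = dsp->idct_permutation[scantable[i]]; // store coefficients pre-permuted...
 * block[j] = level; // ...so idct_put()/idct_add() can consume them directly */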
486
487 int (*try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale);
488 void (*add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale);
489#define BASIS_SHIFT 16
490#define RECON_SHIFT 6
491
492 void (*draw_edges)(uint8_t *buf, int wrap, int width, int height, int w);
493#define EDGE_WIDTH 16
494
495 void (*prefetch)(void *mem, int stride, int h);
496
497 void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
498
499 /* mlp/truehd functions */
500 void (*mlp_filter_channel)(int32_t *state, const int32_t *coeff,
501 int firorder, int iirorder,
502 unsigned int filter_shift, int32_t mask, int blocksize,
503 int32_t *sample_buffer);
504
505 /* vc1 functions */
506 void (*vc1_inv_trans_8x8)(DCTELEM *b);
507 void (*vc1_inv_trans_8x4)(uint8_t *dest, int line_size, DCTELEM *block);
508 void (*vc1_inv_trans_4x8)(uint8_t *dest, int line_size, DCTELEM *block);
509 void (*vc1_inv_trans_4x4)(uint8_t *dest, int line_size, DCTELEM *block);
510 void (*vc1_inv_trans_8x8_dc)(uint8_t *dest, int line_size, DCTELEM *block);
511 void (*vc1_inv_trans_8x4_dc)(uint8_t *dest, int line_size, DCTELEM *block);
512 void (*vc1_inv_trans_4x8_dc)(uint8_t *dest, int line_size, DCTELEM *block);
513 void (*vc1_inv_trans_4x4_dc)(uint8_t *dest, int line_size, DCTELEM *block);
514 void (*vc1_v_overlap)(uint8_t* src, int stride);
515 void (*vc1_h_overlap)(uint8_t* src, int stride);
516 void (*vc1_v_loop_filter4)(uint8_t *src, int stride, int pq);
517 void (*vc1_h_loop_filter4)(uint8_t *src, int stride, int pq);
518 void (*vc1_v_loop_filter8)(uint8_t *src, int stride, int pq);
519 void (*vc1_h_loop_filter8)(uint8_t *src, int stride, int pq);
520 void (*vc1_v_loop_filter16)(uint8_t *src, int stride, int pq);
521 void (*vc1_h_loop_filter16)(uint8_t *src, int stride, int pq);
522 /* put 8x8 block with bicubic interpolation and quarterpel precision;
523 * the last argument is actually the rounder value, not the height
524 */
525 op_pixels_func put_vc1_mspel_pixels_tab[16];
526 op_pixels_func avg_vc1_mspel_pixels_tab[16];
527
528 /* intrax8 functions */
529 void (*x8_spatial_compensation[12])(uint8_t *src , uint8_t *dst, int linesize);
530 void (*x8_setup_spatial_compensation)(uint8_t *src, uint8_t *dst, int linesize,
531 int * range, int * sum, int edges);
532
533 /**
534 * Calculate the scalar product of two vectors.
535 * @param len length of vectors, should be a multiple of 16
536 * @param shift number of bits to discard from the product
537 */
538 int32_t (*scalarproduct_int16)(const int16_t *v1, const int16_t *v2/*align 16*/, int len, int shift);
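 /* Scalar reference sketch: int32_t sum = 0; for (i = 0; i < len; i++) sum += v1[i] * v2[i]; return sum >> shift; */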
539 /* ape functions */
540 /**
541 * Calculate the scalar product of v1 and v2, and simultaneously
542 * update v1: v1[i] += v3[i] * mul.
543 * @param len length of vectors, should be a multiple of 16
544 */
545 int32_t (*scalarproduct_and_madd_int16)(int16_t *v1/*align 16*/, const int16_t *v2, const int16_t *v3, int len, int mul);
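 /* Scalar reference sketch: for (i = 0; i < len; i++) { sum += v1[i] * v2[i]; v1[i] += v3[i] * mul; } return sum; */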
546
547 /* rv30 functions */
548 qpel_mc_func put_rv30_tpel_pixels_tab[4][16];
549 qpel_mc_func avg_rv30_tpel_pixels_tab[4][16];
550
551 /* rv40 functions */
552 qpel_mc_func put_rv40_qpel_pixels_tab[4][16];
553 qpel_mc_func avg_rv40_qpel_pixels_tab[4][16];
554 h264_chroma_mc_func put_rv40_chroma_pixels_tab[3];
555 h264_chroma_mc_func avg_rv40_chroma_pixels_tab[3];
556
557 /* bink functions */
558 op_fill_func fill_block_tab[2];
559 void (*scale_block)(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize);
560} DSPContext;
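As a concrete illustration of the halfpel pixels_tab indexing documented above, here is a minimal dispatch sketch; the helper name mc_block16_sketch and its parameters are assumptions for illustration, not part of dsputil:

static void mc_block16_sketch(DSPContext *dsp, uint8_t *dst, const uint8_t *src,
                              int stride, int mx, int my, int h)
{
    /* the low bit of each motion-vector component selects the halfpel position */
    int dxy = (mx & 1) + 2 * (my & 1);     /* xhalfpel + 2*yhalfpel */
    src += (mx >> 1) + (my >> 1) * stride; /* integer part of the motion vector */
    dsp->put_pixels_tab[0][dxy](dst, src, stride, h); /* row 0 = 16-wide variant */
}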
561
562void dsputil_static_init(void);
563void dsputil_init(DSPContext* p, AVCodecContext *avctx);
564
565int ff_check_alignment(void);
566
567/**
568 * Permute block according to permutation.
569 * @param last last non-zero element in scantable order
570 */
571void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last);
572
573void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type);
574
575#define BYTE_VEC32(c) ((c)*0x01010101UL)
576
577static inline uint32_t rnd_avg32(uint32_t a, uint32_t b)
578{
579 return (a | b) - (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
580}
581
582static inline uint32_t no_rnd_avg32(uint32_t a, uint32_t b)
583{
584 return (a & b) + (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
585}
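/* Worked check of the packed byte-wise (SWAR) averages above (values chosen for
 * illustration): rnd_avg32(0x01020304, 0x05060708) == 0x03040506,
 * e.g. (0x01+0x05+1)>>1 == 0x03 in the top byte. For odd sums the two differ:
 * rnd_avg32(1, 2) == 2 ((1+2+1)>>1), while no_rnd_avg32(1, 2) == 1 ((1+2)>>1). */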
586
587static inline int get_penalty_factor(int lambda, int lambda2, int type){
588 switch(type&0xFF){
589 default:
590 case FF_CMP_SAD:
591 return lambda>>FF_LAMBDA_SHIFT;
592 case FF_CMP_DCT:
593 return (3*lambda)>>(FF_LAMBDA_SHIFT+1);
594 case FF_CMP_W53:
595 return (4*lambda)>>(FF_LAMBDA_SHIFT);
596 case FF_CMP_W97:
597 return (2*lambda)>>(FF_LAMBDA_SHIFT);
598 case FF_CMP_SATD:
599 case FF_CMP_DCT264:
600 return (2*lambda)>>FF_LAMBDA_SHIFT;
601 case FF_CMP_RD:
602 case FF_CMP_PSNR:
603 case FF_CMP_SSE:
604 case FF_CMP_NSSE:
605 return lambda2>>FF_LAMBDA_SHIFT;
606 case FF_CMP_BIT:
607 return 1;
608 }
609}
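/* Worked example (assuming FF_LAMBDA_SHIFT == 7, as defined in avcodec.h):
 * with lambda = 256, FF_CMP_SAD gives 256>>7 = 2, FF_CMP_DCT gives (3*256)>>8 = 3,
 * and FF_CMP_BIT always gives 1. */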
610
611/**
612 * Empty MMX state.
613 * This must be called between any DSP function and float/double code,
614 * for example: sin(); dsp->idct_put(); emms_c(); cos();
615 */
616#define emms_c()
617
618/* should be defined by architectures supporting
619 one or more multimedia extensions */
620int mm_support(void);
621extern int mm_flags;
622
623void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx);
624void dsputil_init_arm(DSPContext* c, AVCodecContext *avctx);
625void dsputil_init_bfin(DSPContext* c, AVCodecContext *avctx);
626void dsputil_init_mlib(DSPContext* c, AVCodecContext *avctx);
627void dsputil_init_mmi(DSPContext* c, AVCodecContext *avctx);
628void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx);
629void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx);
630void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx);
631void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx);
632
633void ff_dsputil_init_dwt(DSPContext *c);
634void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
635void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
636void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
637void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
638void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);
639void ff_mlp_init_x86(DSPContext* c, AVCodecContext *avctx);
640
641#if HAVE_MMX
642
643#undef emms_c
644
645static inline void emms(void)
646{
647 __asm__ volatile ("emms;":::"memory");
648}
649
650
651#define emms_c() \
652{\
653 if (mm_flags & FF_MM_MMX)\
654 emms();\
655}
656
657#elif ARCH_ARM
658
659#if HAVE_NEON
660# define STRIDE_ALIGN 16
661#endif
662
663#elif ARCH_PPC
664
665#define STRIDE_ALIGN 16
666
667#elif HAVE_MMI
668
669#define STRIDE_ALIGN 16
670
671#else
672
673#define mm_flags 0
674#define mm_support() 0
675
676#endif
677
678#ifndef STRIDE_ALIGN
679# define STRIDE_ALIGN 8
680#endif
681
682#define LOCAL_ALIGNED(a, t, v, s, ...) \
683 uint8_t la_##v[sizeof(t s __VA_ARGS__) + (a)]; \
684 t (*v) __VA_ARGS__ = (void *)FFALIGN((uintptr_t)la_##v, a)
685
686#if HAVE_LOCAL_ALIGNED_8
687# define LOCAL_ALIGNED_8(t, v, s, ...) DECLARE_ALIGNED(8, t, v) s __VA_ARGS__
688#else
689# define LOCAL_ALIGNED_8(t, v, s, ...) LOCAL_ALIGNED(8, t, v, s, __VA_ARGS__)
690#endif
691
692#if HAVE_LOCAL_ALIGNED_16
693# define LOCAL_ALIGNED_16(t, v, s, ...) DECLARE_ALIGNED(16, t, v) s __VA_ARGS__
694#else
695# define LOCAL_ALIGNED_16(t, v, s, ...) LOCAL_ALIGNED(16, t, v, s, __VA_ARGS__)
696#endif
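A usage sketch for the LOCAL_ALIGNED macros (the function and variable names here are illustrative, not from dsputil):

static void idct_dc_example(DSPContext *dsp, uint8_t *dest, int line_size)
{
    LOCAL_ALIGNED_16(DCTELEM, block, [64]); /* 16-byte-aligned on-stack block */
    int i;
    for (i = 0; i < 64; i++)
        block[i] = 0;
    block[0] = 1024;                        /* a DC-only block */
    dsp->idct_put(dest, line_size, block);  /* works with either macro variant */
}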
697
698/* PSNR */
699void get_psnr(uint8_t *orig_image[3], uint8_t *coded_image[3],
700 int orig_linesize[3], int coded_linesize,
701 AVCodecContext *avctx);
702
703#define WRAPPER8_16(name8, name16)\
704static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
705 return name8(s, dst , src , stride, h)\
706 +name8(s, dst+8 , src+8 , stride, h);\
707}
708
709#define WRAPPER8_16_SQ(name8, name16)\
710static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
711 int score=0;\
712 score +=name8(s, dst , src , stride, 8);\
713 score +=name8(s, dst+8 , src+8 , stride, 8);\
714 if(h==16){\
715 dst += 8*stride;\
716 src += 8*stride;\
717 score +=name8(s, dst , src , stride, 8);\
718 score +=name8(s, dst+8 , src+8 , stride, 8);\
719 }\
720 return score;\
721}
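These wrappers assemble a 16-pixel-wide comparison from an 8-wide kernel; a hypothetical instantiation in the style dsputil.c uses (sad8x8_c is an assumed 8x8 SAD kernel, not defined here):

static int sad8x8_c(void *s, uint8_t *dst, uint8_t *src, int stride, int h);
WRAPPER8_16_SQ(sad8x8_c, sad16_c)
/* sad16_c now scores the left and right 8x8 halves, and for h==16 also the
 * lower two halves, always calling the kernel with h == 8. */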
722
723
724static inline void copy_block2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
725{
726 int i;
727 for(i=0; i<h; i++)
728 {
729 AV_WN16(dst , AV_RN16(src ));
730 dst+=dstStride;
731 src+=srcStride;
732 }
733}
734
735static inline void copy_block4(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
736{
737 int i;
738 for(i=0; i<h; i++)
739 {
740 AV_WN32(dst , AV_RN32(src ));
741 dst+=dstStride;
742 src+=srcStride;
743 }
744}
745
746static inline void copy_block8(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
747{
748 int i;
749 for(i=0; i<h; i++)
750 {
751 AV_WN32(dst , AV_RN32(src ));
752 AV_WN32(dst+4 , AV_RN32(src+4 ));
753 dst+=dstStride;
754 src+=srcStride;
755 }
756}
757
758static inline void copy_block9(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
759{
760 int i;
761 for(i=0; i<h; i++)
762 {
763 AV_WN32(dst , AV_RN32(src ));
764 AV_WN32(dst+4 , AV_RN32(src+4 ));
765 dst[8]= src[8];
766 dst+=dstStride;
767 src+=srcStride;
768 }
769}
770
771static inline void copy_block16(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
772{
773 int i;
774 for(i=0; i<h; i++)
775 {
776 AV_WN32(dst , AV_RN32(src ));
777 AV_WN32(dst+4 , AV_RN32(src+4 ));
778 AV_WN32(dst+8 , AV_RN32(src+8 ));
779 AV_WN32(dst+12, AV_RN32(src+12));
780 dst+=dstStride;
781 src+=srcStride;
782 }
783}
784
785static inline void copy_block17(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
786{
787 int i;
788 for(i=0; i<h; i++)
789 {
790 AV_WN32(dst , AV_RN32(src ));
791 AV_WN32(dst+4 , AV_RN32(src+4 ));
792 AV_WN32(dst+8 , AV_RN32(src+8 ));
793 AV_WN32(dst+12, AV_RN32(src+12));
794 dst[16]= src[16];
795 dst+=dstStride;
796 src+=srcStride;
797 }
798}
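The odd widths (9 and 17) exist because subpel interpolation of an 8- or 16-wide block reads one extra column (and row) of source; a hypothetical caller sketch:

static void qpel_prefetch_sketch(const uint8_t *src, int srcStride)
{
    uint8_t full[9 * 9]; /* 8x8 block plus the extra row/column the filter reads */
    copy_block9(full, src, 9, srcStride, 9);
    /* ... interpolation over the dense 'full' buffer would follow ... */
}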
799
800#endif /* AVCODEC_DSPUTIL_H */