Diffstat (limited to 'apps/codecs/libatrac/dsputil.c')
-rw-r--r--   apps/codecs/libatrac/dsputil.c   4114
1 files changed, 4114 insertions, 0 deletions
diff --git a/apps/codecs/libatrac/dsputil.c b/apps/codecs/libatrac/dsputil.c
new file mode 100644
index 0000000000..412a934862
--- /dev/null
+++ b/apps/codecs/libatrac/dsputil.c
@@ -0,0 +1,4114 @@
1/*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7 *
8 * This file is part of FFmpeg.
9 *
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25/**
26 * @file libavcodec/dsputil.c
27 * DSP utils
28 */
29
30#include "avcodec.h"
31#include "dsputil.h"
32/*#include "simple_idct.h"
33#include "faandct.h"
34#include "faanidct.h"
35#include "mathops.h"
36#include "h263.h"
37#include "snow.h" */
38
39/* snow.c */
40void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
41
42/* vorbis.c */
43void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
44
45/* ac3dec.c */
46void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
47
48/* flacenc.c */
49void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
50
51/* pngdec.c */
52void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
53
54/* eaidct.c */
55void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
56
57uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
58uint32_t ff_squareTbl[512] = {0, };
59
 60// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f, depending on the CPU's native word size
61#define pb_7f (~0UL/255 * 0x7f)
62#define pb_80 (~0UL/255 * 0x80)
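/* Illustrative note (editorial, not part of the original file): ~0UL/255
 * evaluates to 0x0101...01 (one set byte per byte of unsigned long), so
 * multiplying by a byte value replicates that byte across every lane of the
 * native word:
 *
 *   32-bit long: ~0UL/255 == 0x01010101,         pb_7f == 0x7f7f7f7f
 *   64-bit long: ~0UL/255 == 0x0101010101010101, pb_80 == 0x8080808080808080
 */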
63
64const uint8_t ff_zigzag_direct[64] = {
65 0, 1, 8, 16, 9, 2, 3, 10,
66 17, 24, 32, 25, 18, 11, 4, 5,
67 12, 19, 26, 33, 40, 48, 41, 34,
68 27, 20, 13, 6, 7, 14, 21, 28,
69 35, 42, 49, 56, 57, 50, 43, 36,
70 29, 22, 15, 23, 30, 37, 44, 51,
71 58, 59, 52, 45, 38, 31, 39, 46,
72 53, 60, 61, 54, 47, 55, 62, 63
73};
74
75/* Specific zigzag scan for 248 idct. NOTE that unlike the
76 specification, we interleave the fields */
77const uint8_t ff_zigzag248_direct[64] = {
78 0, 8, 1, 9, 16, 24, 2, 10,
79 17, 25, 32, 40, 48, 56, 33, 41,
80 18, 26, 3, 11, 4, 12, 19, 27,
81 34, 42, 49, 57, 50, 58, 35, 43,
82 20, 28, 5, 13, 6, 14, 21, 29,
83 36, 44, 51, 59, 52, 60, 37, 45,
84 22, 30, 7, 15, 23, 31, 38, 46,
85 53, 61, 54, 62, 39, 47, 55, 63,
86};
87
88/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
89DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
90
91const uint8_t ff_alternate_horizontal_scan[64] = {
92 0, 1, 2, 3, 8, 9, 16, 17,
93 10, 11, 4, 5, 6, 7, 15, 14,
94 13, 12, 19, 18, 24, 25, 32, 33,
95 26, 27, 20, 21, 22, 23, 28, 29,
96 30, 31, 34, 35, 40, 41, 48, 49,
97 42, 43, 36, 37, 38, 39, 44, 45,
98 46, 47, 50, 51, 56, 57, 58, 59,
99 52, 53, 54, 55, 60, 61, 62, 63,
100};
101
102const uint8_t ff_alternate_vertical_scan[64] = {
103 0, 8, 16, 24, 1, 9, 2, 10,
104 17, 25, 32, 40, 48, 56, 57, 49,
105 41, 33, 26, 18, 3, 11, 4, 12,
106 19, 27, 34, 42, 50, 58, 35, 43,
107 51, 59, 20, 28, 5, 13, 6, 14,
108 21, 29, 36, 44, 52, 60, 37, 45,
109 53, 61, 22, 30, 7, 15, 23, 31,
110 38, 46, 54, 62, 39, 47, 55, 63,
111};
112
113/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
114const uint32_t ff_inverse[256]={
115 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
116 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
117 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
118 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
119 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
120 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
121 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
122 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
123 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
124 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
125 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
126 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
127 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
128 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
129 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
130 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
131 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
132 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
133 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
134 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
135 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
136 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
137 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
138 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
139 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
140 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
141 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
142 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
143 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
144 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
145 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
146 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
147};
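/* A minimal sketch (editorial, not part of the original file) of how
 * ff_inverse replaces division with a multiply and shift, per the identity
 * noted above the table.  The product must be computed in 64 bits:
 */
#if 0 /* example only */
static unsigned int div_by_table(unsigned int a, unsigned int b)
{
    /* valid for 0 <= a <= 65536 and 2 <= b <= 255 */
    return (unsigned int)(((uint64_t)a * ff_inverse[b]) >> 32);
}
/* e.g. a = 7, b = 3: 7 * 1431655766 >> 32 == 2 == 7/3 */
#endif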
148
149/* Input permutation for the simple_idct_mmx */
150static const uint8_t simple_mmx_permutation[64]={
151 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
152 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
153 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
154 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
155 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
156 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
157 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
158 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
159};
160
161static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
162
163void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
164 int i;
165 int end;
166
167 st->scantable= src_scantable;
168
169 for(i=0; i<64; i++){
170 int j;
171 j = src_scantable[i];
172 st->permutated[i] = permutation[j];
173#if ARCH_PPC
174 st->inverse[j] = i;
175#endif
176 }
177
178 end=-1;
179 for(i=0; i<64; i++){
180 int j;
181 j = st->permutated[i];
182 if(j>end) end=j;
183 st->raster_end[i]= end;
184 }
185}
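/* Usage sketch (editorial, assuming an identity permutation; not part of the
 * original file): with permutation[i] == i, st->permutated simply mirrors the
 * source scan, and st->raster_end[i] holds the largest raster index reached
 * after i scan steps, which lets an IDCT skip trailing zero coefficients.
 */
#if 0 /* example only */
static void example_init(void)
{
    uint8_t identity[64];
    ScanTable st;
    int i;
    for (i = 0; i < 64; i++)
        identity[i] = i;
    ff_init_scantable(identity, &st, ff_zigzag_direct);
    /* st.permutated[0] == 0, st.permutated[1] == 1, st.permutated[2] == 8 */
}
#endif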
186
187#if CONFIG_SNOW_ENCODER //dwt is in snow.c
188static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
189 int s, i, j;
190 const int dec_count= w==8 ? 3 : 4;
191 int tmp[32*32];
192 int level, ori;
193 static const int scale[2][2][4][4]={
194 {
195 {
196 // 9/7 8x8 dec=3
197 {268, 239, 239, 213},
198 { 0, 224, 224, 152},
199 { 0, 135, 135, 110},
200 },{
201 // 9/7 16x16 or 32x32 dec=4
202 {344, 310, 310, 280},
203 { 0, 320, 320, 228},
204 { 0, 175, 175, 136},
205 { 0, 129, 129, 102},
206 }
207 },{
208 {
209 // 5/3 8x8 dec=3
210 {275, 245, 245, 218},
211 { 0, 230, 230, 156},
212 { 0, 138, 138, 113},
213 },{
214 // 5/3 16x16 or 32x32 dec=4
215 {352, 317, 317, 286},
216 { 0, 328, 328, 233},
217 { 0, 180, 180, 140},
218 { 0, 132, 132, 105},
219 }
220 }
221 };
222
223 for (i = 0; i < h; i++) {
224 for (j = 0; j < w; j+=4) {
225 tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
226 tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
227 tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
228 tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
229 }
230 pix1 += line_size;
231 pix2 += line_size;
232 }
233
234 ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
235
236 s=0;
237 assert(w==h);
238 for(level=0; level<dec_count; level++){
239 for(ori= level ? 1 : 0; ori<4; ori++){
240 int size= w>>(dec_count-level);
241 int sx= (ori&1) ? size : 0;
242 int stride= 32<<(dec_count-level);
243 int sy= (ori&2) ? stride>>1 : 0;
244
245 for(i=0; i<size; i++){
246 for(j=0; j<size; j++){
247 int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
248 s += FFABS(v);
249 }
250 }
251 }
252 }
253 assert(s>=0);
254 return s>>9;
255}
256
257static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
258 return w_c(v, pix1, pix2, line_size, 8, h, 1);
259}
260
261static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
262 return w_c(v, pix1, pix2, line_size, 8, h, 0);
263}
264
265static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
266 return w_c(v, pix1, pix2, line_size, 16, h, 1);
267}
268
269static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
270 return w_c(v, pix1, pix2, line_size, 16, h, 0);
271}
272
273int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
274 return w_c(v, pix1, pix2, line_size, 32, h, 1);
275}
276
277int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
278 return w_c(v, pix1, pix2, line_size, 32, h, 0);
279}
280#endif
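/* Editorial note (not in the original source): the w53_*/w97_* comparators
 * score the difference between two blocks in the wavelet domain -- "53" uses
 * the integer 5/3 wavelet (type 1), "97" the 9/7 wavelet (type 0) -- and
 * weight each subband with the scale[] tables above before summing absolute
 * values.
 */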
281
282/**
283 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
284 * @param buf destination buffer
285 * @param src source buffer
286 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
287 * @param block_w width of block
288 * @param block_h height of block
289 * @param src_x x coordinate of the top left sample of the block in the source buffer
290 * @param src_y y coordinate of the top left sample of the block in the source buffer
291 * @param w width of the source buffer
292 * @param h height of the source buffer
293 */
294void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
295 int src_x, int src_y, int w, int h){
296 int x, y;
297 int start_y, start_x, end_y, end_x;
298
299 if(src_y>= h){
300 src+= (h-1-src_y)*linesize;
301 src_y=h-1;
302 }else if(src_y<=-block_h){
303 src+= (1-block_h-src_y)*linesize;
304 src_y=1-block_h;
305 }
306 if(src_x>= w){
307 src+= (w-1-src_x);
308 src_x=w-1;
309 }else if(src_x<=-block_w){
310 src+= (1-block_w-src_x);
311 src_x=1-block_w;
312 }
313
314 start_y= FFMAX(0, -src_y);
315 start_x= FFMAX(0, -src_x);
316 end_y= FFMIN(block_h, h-src_y);
317 end_x= FFMIN(block_w, w-src_x);
318
319 // copy existing part
320 for(y=start_y; y<end_y; y++){
321 for(x=start_x; x<end_x; x++){
322 buf[x + y*linesize]= src[x + y*linesize];
323 }
324 }
325
326 //top
327 for(y=0; y<start_y; y++){
328 for(x=start_x; x<end_x; x++){
329 buf[x + y*linesize]= buf[x + start_y*linesize];
330 }
331 }
332
333 //bottom
334 for(y=end_y; y<block_h; y++){
335 for(x=start_x; x<end_x; x++){
336 buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
337 }
338 }
339
340 for(y=0; y<block_h; y++){
341 //left
342 for(x=0; x<start_x; x++){
343 buf[x + y*linesize]= buf[start_x + y*linesize];
344 }
345
346 //right
347 for(x=end_x; x<block_w; x++){
348 buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
349 }
350 }
351}
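/* Usage sketch (hypothetical names 'ebuf', 'pic', 'w', 'h'; not part of the
 * original file): fetch an 8x8 block whose top-left corner is at (-3, -2),
 * i.e. partly above and left of the picture.  'src' must already point at
 * the (out-of-bounds) block start, and 'ebuf' is written with the same
 * 'linesize' spacing as the source:
 */
#if 0 /* example only */
    ff_emulated_edge_mc(ebuf, pic + (-2) * linesize + (-3), linesize,
                        8, 8, -3, -2, w, h);
    /* ebuf now holds the 8x8 block with missing samples replicated
       from the nearest picture edge */
#endif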
352
353#if 0
354static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
355{
356 int i;
357
358 /* read the pixels */
359 for(i=0;i<8;i++) {
360 block[0] = pixels[0];
361 block[1] = pixels[1];
362 block[2] = pixels[2];
363 block[3] = pixels[3];
364 block[4] = pixels[4];
365 block[5] = pixels[5];
366 block[6] = pixels[6];
367 block[7] = pixels[7];
368 pixels += line_size;
369 block += 8;
370 }
371}
372
373static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
374 const uint8_t *s2, int stride){
375 int i;
376
377 /* read the pixels */
378 for(i=0;i<8;i++) {
379 block[0] = s1[0] - s2[0];
380 block[1] = s1[1] - s2[1];
381 block[2] = s1[2] - s2[2];
382 block[3] = s1[3] - s2[3];
383 block[4] = s1[4] - s2[4];
384 block[5] = s1[5] - s2[5];
385 block[6] = s1[6] - s2[6];
386 block[7] = s1[7] - s2[7];
387 s1 += stride;
388 s2 += stride;
389 block += 8;
390 }
391}
392
393
394static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
395 int line_size)
396{
397 int i;
398 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
399
400 /* read the pixels */
401 for(i=0;i<8;i++) {
402 pixels[0] = cm[block[0]];
403 pixels[1] = cm[block[1]];
404 pixels[2] = cm[block[2]];
405 pixels[3] = cm[block[3]];
406 pixels[4] = cm[block[4]];
407 pixels[5] = cm[block[5]];
408 pixels[6] = cm[block[6]];
409 pixels[7] = cm[block[7]];
410
411 pixels += line_size;
412 block += 8;
413 }
414}
415
416static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
417 int line_size)
418{
419 int i;
420 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
421
422 /* read the pixels */
423 for(i=0;i<4;i++) {
424 pixels[0] = cm[block[0]];
425 pixels[1] = cm[block[1]];
426 pixels[2] = cm[block[2]];
427 pixels[3] = cm[block[3]];
428
429 pixels += line_size;
430 block += 8;
431 }
432}
433
434static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
435 int line_size)
436{
437 int i;
438 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
439
440 /* read the pixels */
441 for(i=0;i<2;i++) {
442 pixels[0] = cm[block[0]];
443 pixels[1] = cm[block[1]];
444
445 pixels += line_size;
446 block += 8;
447 }
448}
449
450static void put_signed_pixels_clamped_c(const DCTELEM *block,
451 uint8_t *restrict pixels,
452 int line_size)
453{
454 int i, j;
455
456 for (i = 0; i < 8; i++) {
457 for (j = 0; j < 8; j++) {
458 if (*block < -128)
459 *pixels = 0;
460 else if (*block > 127)
461 *pixels = 255;
462 else
463 *pixels = (uint8_t)(*block + 128);
464 block++;
465 pixels++;
466 }
467 pixels += (line_size - 8);
468 }
469}
470
471static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
472 int line_size)
473{
474 int i;
475 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
476
477 /* read the pixels */
478 for(i=0;i<8;i++) {
479 pixels[0] = cm[pixels[0] + block[0]];
480 pixels[1] = cm[pixels[1] + block[1]];
481 pixels[2] = cm[pixels[2] + block[2]];
482 pixels[3] = cm[pixels[3] + block[3]];
483 pixels[4] = cm[pixels[4] + block[4]];
484 pixels[5] = cm[pixels[5] + block[5]];
485 pixels[6] = cm[pixels[6] + block[6]];
486 pixels[7] = cm[pixels[7] + block[7]];
487 pixels += line_size;
488 block += 8;
489 }
490}
491
492static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
493 int line_size)
494{
495 int i;
496 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
497
498 /* read the pixels */
499 for(i=0;i<4;i++) {
500 pixels[0] = cm[pixels[0] + block[0]];
501 pixels[1] = cm[pixels[1] + block[1]];
502 pixels[2] = cm[pixels[2] + block[2]];
503 pixels[3] = cm[pixels[3] + block[3]];
504 pixels += line_size;
505 block += 8;
506 }
507}
508
509static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
510 int line_size)
511{
512 int i;
513 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
514
515 /* read the pixels */
516 for(i=0;i<2;i++) {
517 pixels[0] = cm[pixels[0] + block[0]];
518 pixels[1] = cm[pixels[1] + block[1]];
519 pixels += line_size;
520 block += 8;
521 }
522}
523
524static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
525{
526 int i;
527 for(i=0;i<8;i++) {
528 pixels[0] += block[0];
529 pixels[1] += block[1];
530 pixels[2] += block[2];
531 pixels[3] += block[3];
532 pixels[4] += block[4];
533 pixels[5] += block[5];
534 pixels[6] += block[6];
535 pixels[7] += block[7];
536 pixels += line_size;
537 block += 8;
538 }
539}
540
541static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
542{
543 int i;
544 for(i=0;i<4;i++) {
545 pixels[0] += block[0];
546 pixels[1] += block[1];
547 pixels[2] += block[2];
548 pixels[3] += block[3];
549 pixels += line_size;
550 block += 4;
551 }
552}
553
554static int sum_abs_dctelem_c(DCTELEM *block)
555{
556 int sum=0, i;
557 for(i=0; i<64; i++)
558 sum+= FFABS(block[i]);
559 return sum;
560}
561
562#if 0
563
564#define PIXOP2(OPNAME, OP) \
565static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
566{\
567 int i;\
568 for(i=0; i<h; i++){\
569 OP(*((uint64_t*)block), AV_RN64(pixels));\
570 pixels+=line_size;\
571 block +=line_size;\
572 }\
573}\
574\
575static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
576{\
577 int i;\
578 for(i=0; i<h; i++){\
579 const uint64_t a= AV_RN64(pixels );\
580 const uint64_t b= AV_RN64(pixels+1);\
581 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
582 pixels+=line_size;\
583 block +=line_size;\
584 }\
585}\
586\
587static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
588{\
589 int i;\
590 for(i=0; i<h; i++){\
591 const uint64_t a= AV_RN64(pixels );\
592 const uint64_t b= AV_RN64(pixels+1);\
593 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
594 pixels+=line_size;\
595 block +=line_size;\
596 }\
597}\
598\
599static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
600{\
601 int i;\
602 for(i=0; i<h; i++){\
603 const uint64_t a= AV_RN64(pixels );\
604 const uint64_t b= AV_RN64(pixels+line_size);\
605 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
606 pixels+=line_size;\
607 block +=line_size;\
608 }\
609}\
610\
611static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
612{\
613 int i;\
614 for(i=0; i<h; i++){\
615 const uint64_t a= AV_RN64(pixels );\
616 const uint64_t b= AV_RN64(pixels+line_size);\
617 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
618 pixels+=line_size;\
619 block +=line_size;\
620 }\
621}\
622\
623static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
624{\
625 int i;\
626 const uint64_t a= AV_RN64(pixels );\
627 const uint64_t b= AV_RN64(pixels+1);\
628 uint64_t l0= (a&0x0303030303030303ULL)\
629 + (b&0x0303030303030303ULL)\
630 + 0x0202020202020202ULL;\
631 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
632 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
633 uint64_t l1,h1;\
634\
635 pixels+=line_size;\
636 for(i=0; i<h; i+=2){\
637 uint64_t a= AV_RN64(pixels );\
638 uint64_t b= AV_RN64(pixels+1);\
639 l1= (a&0x0303030303030303ULL)\
640 + (b&0x0303030303030303ULL);\
641 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
642 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
643 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
644 pixels+=line_size;\
645 block +=line_size;\
646 a= AV_RN64(pixels );\
647 b= AV_RN64(pixels+1);\
648 l0= (a&0x0303030303030303ULL)\
649 + (b&0x0303030303030303ULL)\
650 + 0x0202020202020202ULL;\
651 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
652 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
653 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
654 pixels+=line_size;\
655 block +=line_size;\
656 }\
657}\
658\
659static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
660{\
661 int i;\
662 const uint64_t a= AV_RN64(pixels );\
663 const uint64_t b= AV_RN64(pixels+1);\
664 uint64_t l0= (a&0x0303030303030303ULL)\
665 + (b&0x0303030303030303ULL)\
666 + 0x0101010101010101ULL;\
667 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
668 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
669 uint64_t l1,h1;\
670\
671 pixels+=line_size;\
672 for(i=0; i<h; i+=2){\
673 uint64_t a= AV_RN64(pixels );\
674 uint64_t b= AV_RN64(pixels+1);\
675 l1= (a&0x0303030303030303ULL)\
676 + (b&0x0303030303030303ULL);\
677 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
678 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
679 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
680 pixels+=line_size;\
681 block +=line_size;\
682 a= AV_RN64(pixels );\
683 b= AV_RN64(pixels+1);\
684 l0= (a&0x0303030303030303ULL)\
685 + (b&0x0303030303030303ULL)\
686 + 0x0101010101010101ULL;\
687 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
688 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
689 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
690 pixels+=line_size;\
691 block +=line_size;\
692 }\
693}\
694\
695CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
696CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
697CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
698CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
699CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
700CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
701CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
702
703#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
704#else // 64 bit variant
705
706#define PIXOP2(OPNAME, OP) \
707static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
708 int i;\
709 for(i=0; i<h; i++){\
710 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
711 pixels+=line_size;\
712 block +=line_size;\
713 }\
714}\
715static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
716 int i;\
717 for(i=0; i<h; i++){\
718 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
719 pixels+=line_size;\
720 block +=line_size;\
721 }\
722}\
723static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
724 int i;\
725 for(i=0; i<h; i++){\
726 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
727 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
728 pixels+=line_size;\
729 block +=line_size;\
730 }\
731}\
732static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
733 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
734}\
735\
736static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
737 int src_stride1, int src_stride2, int h){\
738 int i;\
739 for(i=0; i<h; i++){\
740 uint32_t a,b;\
741 a= AV_RN32(&src1[i*src_stride1 ]);\
742 b= AV_RN32(&src2[i*src_stride2 ]);\
743 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
744 a= AV_RN32(&src1[i*src_stride1+4]);\
745 b= AV_RN32(&src2[i*src_stride2+4]);\
746 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
747 }\
748}\
749\
750static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
751 int src_stride1, int src_stride2, int h){\
752 int i;\
753 for(i=0; i<h; i++){\
754 uint32_t a,b;\
755 a= AV_RN32(&src1[i*src_stride1 ]);\
756 b= AV_RN32(&src2[i*src_stride2 ]);\
757 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
758 a= AV_RN32(&src1[i*src_stride1+4]);\
759 b= AV_RN32(&src2[i*src_stride2+4]);\
760 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
761 }\
762}\
763\
764static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
765 int src_stride1, int src_stride2, int h){\
766 int i;\
767 for(i=0; i<h; i++){\
768 uint32_t a,b;\
769 a= AV_RN32(&src1[i*src_stride1 ]);\
770 b= AV_RN32(&src2[i*src_stride2 ]);\
771 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
772 }\
773}\
774\
775static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
776 int src_stride1, int src_stride2, int h){\
777 int i;\
778 for(i=0; i<h; i++){\
779 uint32_t a,b;\
780 a= AV_RN16(&src1[i*src_stride1 ]);\
781 b= AV_RN16(&src2[i*src_stride2 ]);\
782 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
783 }\
784}\
785\
786static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
787 int src_stride1, int src_stride2, int h){\
788 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
789 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
790}\
791\
792static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
793 int src_stride1, int src_stride2, int h){\
794 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
795 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
796}\
797\
798static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
799 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
800}\
801\
802static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
803 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
804}\
805\
806static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
807 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
808}\
809\
810static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
811 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
812}\
813\
814static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
815 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
816 int i;\
817 for(i=0; i<h; i++){\
818 uint32_t a, b, c, d, l0, l1, h0, h1;\
819 a= AV_RN32(&src1[i*src_stride1]);\
820 b= AV_RN32(&src2[i*src_stride2]);\
821 c= AV_RN32(&src3[i*src_stride3]);\
822 d= AV_RN32(&src4[i*src_stride4]);\
823 l0= (a&0x03030303UL)\
824 + (b&0x03030303UL)\
825 + 0x02020202UL;\
826 h0= ((a&0xFCFCFCFCUL)>>2)\
827 + ((b&0xFCFCFCFCUL)>>2);\
828 l1= (c&0x03030303UL)\
829 + (d&0x03030303UL);\
830 h1= ((c&0xFCFCFCFCUL)>>2)\
831 + ((d&0xFCFCFCFCUL)>>2);\
832 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
833 a= AV_RN32(&src1[i*src_stride1+4]);\
834 b= AV_RN32(&src2[i*src_stride2+4]);\
835 c= AV_RN32(&src3[i*src_stride3+4]);\
836 d= AV_RN32(&src4[i*src_stride4+4]);\
837 l0= (a&0x03030303UL)\
838 + (b&0x03030303UL)\
839 + 0x02020202UL;\
840 h0= ((a&0xFCFCFCFCUL)>>2)\
841 + ((b&0xFCFCFCFCUL)>>2);\
842 l1= (c&0x03030303UL)\
843 + (d&0x03030303UL);\
844 h1= ((c&0xFCFCFCFCUL)>>2)\
845 + ((d&0xFCFCFCFCUL)>>2);\
846 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
847 }\
848}\
849\
850static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
851 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
852}\
853\
854static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
855 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
856}\
857\
858static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
859 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
860}\
861\
862static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
863 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
864}\
865\
866static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
867 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
868 int i;\
869 for(i=0; i<h; i++){\
870 uint32_t a, b, c, d, l0, l1, h0, h1;\
871 a= AV_RN32(&src1[i*src_stride1]);\
872 b= AV_RN32(&src2[i*src_stride2]);\
873 c= AV_RN32(&src3[i*src_stride3]);\
874 d= AV_RN32(&src4[i*src_stride4]);\
875 l0= (a&0x03030303UL)\
876 + (b&0x03030303UL)\
877 + 0x01010101UL;\
878 h0= ((a&0xFCFCFCFCUL)>>2)\
879 + ((b&0xFCFCFCFCUL)>>2);\
880 l1= (c&0x03030303UL)\
881 + (d&0x03030303UL);\
882 h1= ((c&0xFCFCFCFCUL)>>2)\
883 + ((d&0xFCFCFCFCUL)>>2);\
884 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
885 a= AV_RN32(&src1[i*src_stride1+4]);\
886 b= AV_RN32(&src2[i*src_stride2+4]);\
887 c= AV_RN32(&src3[i*src_stride3+4]);\
888 d= AV_RN32(&src4[i*src_stride4+4]);\
889 l0= (a&0x03030303UL)\
890 + (b&0x03030303UL)\
891 + 0x01010101UL;\
892 h0= ((a&0xFCFCFCFCUL)>>2)\
893 + ((b&0xFCFCFCFCUL)>>2);\
894 l1= (c&0x03030303UL)\
895 + (d&0x03030303UL);\
896 h1= ((c&0xFCFCFCFCUL)>>2)\
897 + ((d&0xFCFCFCFCUL)>>2);\
898 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
899 }\
900}\
901static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
902 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
903 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
904 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
905}\
906static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
907 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
908 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
909 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
910}\
911\
912static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
913{\
914 int i, a0, b0, a1, b1;\
915 a0= pixels[0];\
916 b0= pixels[1] + 2;\
917 a0 += b0;\
918 b0 += pixels[2];\
919\
920 pixels+=line_size;\
921 for(i=0; i<h; i+=2){\
922 a1= pixels[0];\
923 b1= pixels[1];\
924 a1 += b1;\
925 b1 += pixels[2];\
926\
927 block[0]= (a1+a0)>>2; /* FIXME non put */\
928 block[1]= (b1+b0)>>2;\
929\
930 pixels+=line_size;\
931 block +=line_size;\
932\
933 a0= pixels[0];\
934 b0= pixels[1] + 2;\
935 a0 += b0;\
936 b0 += pixels[2];\
937\
938 block[0]= (a1+a0)>>2;\
939 block[1]= (b1+b0)>>2;\
940 pixels+=line_size;\
941 block +=line_size;\
942 }\
943}\
944\
945static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
946{\
947 int i;\
948 const uint32_t a= AV_RN32(pixels );\
949 const uint32_t b= AV_RN32(pixels+1);\
950 uint32_t l0= (a&0x03030303UL)\
951 + (b&0x03030303UL)\
952 + 0x02020202UL;\
953 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
954 + ((b&0xFCFCFCFCUL)>>2);\
955 uint32_t l1,h1;\
956\
957 pixels+=line_size;\
958 for(i=0; i<h; i+=2){\
959 uint32_t a= AV_RN32(pixels );\
960 uint32_t b= AV_RN32(pixels+1);\
961 l1= (a&0x03030303UL)\
962 + (b&0x03030303UL);\
963 h1= ((a&0xFCFCFCFCUL)>>2)\
964 + ((b&0xFCFCFCFCUL)>>2);\
965 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
966 pixels+=line_size;\
967 block +=line_size;\
968 a= AV_RN32(pixels );\
969 b= AV_RN32(pixels+1);\
970 l0= (a&0x03030303UL)\
971 + (b&0x03030303UL)\
972 + 0x02020202UL;\
973 h0= ((a&0xFCFCFCFCUL)>>2)\
974 + ((b&0xFCFCFCFCUL)>>2);\
975 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
976 pixels+=line_size;\
977 block +=line_size;\
978 }\
979}\
980\
981static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
982{\
983 int j;\
984 for(j=0; j<2; j++){\
985 int i;\
986 const uint32_t a= AV_RN32(pixels );\
987 const uint32_t b= AV_RN32(pixels+1);\
988 uint32_t l0= (a&0x03030303UL)\
989 + (b&0x03030303UL)\
990 + 0x02020202UL;\
991 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
992 + ((b&0xFCFCFCFCUL)>>2);\
993 uint32_t l1,h1;\
994\
995 pixels+=line_size;\
996 for(i=0; i<h; i+=2){\
997 uint32_t a= AV_RN32(pixels );\
998 uint32_t b= AV_RN32(pixels+1);\
999 l1= (a&0x03030303UL)\
1000 + (b&0x03030303UL);\
1001 h1= ((a&0xFCFCFCFCUL)>>2)\
1002 + ((b&0xFCFCFCFCUL)>>2);\
1003 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1004 pixels+=line_size;\
1005 block +=line_size;\
1006 a= AV_RN32(pixels );\
1007 b= AV_RN32(pixels+1);\
1008 l0= (a&0x03030303UL)\
1009 + (b&0x03030303UL)\
1010 + 0x02020202UL;\
1011 h0= ((a&0xFCFCFCFCUL)>>2)\
1012 + ((b&0xFCFCFCFCUL)>>2);\
1013 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1014 pixels+=line_size;\
1015 block +=line_size;\
1016 }\
1017 pixels+=4-line_size*(h+1);\
1018 block +=4-line_size*h;\
1019 }\
1020}\
1021\
1022static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1023{\
1024 int j;\
1025 for(j=0; j<2; j++){\
1026 int i;\
1027 const uint32_t a= AV_RN32(pixels );\
1028 const uint32_t b= AV_RN32(pixels+1);\
1029 uint32_t l0= (a&0x03030303UL)\
1030 + (b&0x03030303UL)\
1031 + 0x01010101UL;\
1032 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1033 + ((b&0xFCFCFCFCUL)>>2);\
1034 uint32_t l1,h1;\
1035\
1036 pixels+=line_size;\
1037 for(i=0; i<h; i+=2){\
1038 uint32_t a= AV_RN32(pixels );\
1039 uint32_t b= AV_RN32(pixels+1);\
1040 l1= (a&0x03030303UL)\
1041 + (b&0x03030303UL);\
1042 h1= ((a&0xFCFCFCFCUL)>>2)\
1043 + ((b&0xFCFCFCFCUL)>>2);\
1044 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1045 pixels+=line_size;\
1046 block +=line_size;\
1047 a= AV_RN32(pixels );\
1048 b= AV_RN32(pixels+1);\
1049 l0= (a&0x03030303UL)\
1050 + (b&0x03030303UL)\
1051 + 0x01010101UL;\
1052 h0= ((a&0xFCFCFCFCUL)>>2)\
1053 + ((b&0xFCFCFCFCUL)>>2);\
1054 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1055 pixels+=line_size;\
1056 block +=line_size;\
1057 }\
1058 pixels+=4-line_size*(h+1);\
1059 block +=4-line_size*h;\
1060 }\
1061}\
1062\
1063CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1064CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1065CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1066CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1067CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1068CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1069CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1070CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1071
1072#define op_avg(a, b) a = rnd_avg32(a, b)
1073#endif
1074#define op_put(a, b) a = b
1075
1076PIXOP2(avg, op_avg)
1077PIXOP2(put, op_put)
1078#undef op_avg
1079#undef op_put
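/* Editorial note (not in the original source): the pixel-averaging
 * operations above rely on carry-free SWAR identities applied to four (or
 * eight) bytes at once:
 *
 *   (x + y) >> 1     == (x & y) + ((x ^ y) >> 1)   -- rounds down (no_rnd)
 *   (x + y + 1) >> 1 == (x | y) - ((x ^ y) >> 1)   -- rounds up
 *
 * Masking x^y with 0xFEFEFEFE before the shift keeps each byte's low bit
 * from leaking into its neighbour.
 */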
1080
1081#define avg2(a,b) ((a+b+1)>>1)
1082#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1083
1084static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1085 put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
1086}
1087
1088static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1089 put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
1090}
1091
1092static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1093{
1094 const int A=(16-x16)*(16-y16);
1095 const int B=( x16)*(16-y16);
1096 const int C=(16-x16)*( y16);
1097 const int D=( x16)*( y16);
1098 int i;
1099
1100 for(i=0; i<h; i++)
1101 {
1102 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1103 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1104 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1105 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1106 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1107 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1108 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1109 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
1110 dst+= stride;
1111 src+= stride;
1112 }
1113}
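/* Editorial note (not in the original source): gmc1_c is 2-D bilinear
 * interpolation at 1/16-pel resolution.  The weights satisfy
 * A + B + C + D == (16 - x16 + x16) * (16 - y16 + y16) == 256, so the >> 8
 * after adding 'rounder' renormalizes the weighted sum to one sample.
 */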
1114
1115void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1116 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1117{
1118 int y, vx, vy;
1119 const int s= 1<<shift;
1120
1121 width--;
1122 height--;
1123
1124 for(y=0; y<h; y++){
1125 int x;
1126
1127 vx= ox;
1128 vy= oy;
1129 for(x=0; x<8; x++){ //XXX FIXME optimize
1130 int src_x, src_y, frac_x, frac_y, index;
1131
1132 src_x= vx>>16;
1133 src_y= vy>>16;
1134 frac_x= src_x&(s-1);
1135 frac_y= src_y&(s-1);
1136 src_x>>=shift;
1137 src_y>>=shift;
1138
1139 if((unsigned)src_x < width){
1140 if((unsigned)src_y < height){
1141 index= src_x + src_y*stride;
1142 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1143 + src[index +1]* frac_x )*(s-frac_y)
1144 + ( src[index+stride ]*(s-frac_x)
1145 + src[index+stride+1]* frac_x )* frac_y
1146 + r)>>(shift*2);
1147 }else{
1148 index= src_x + av_clip(src_y, 0, height)*stride;
1149 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1150 + src[index +1]* frac_x )*s
1151 + r)>>(shift*2);
1152 }
1153 }else{
1154 if((unsigned)src_y < height){
1155 index= av_clip(src_x, 0, width) + src_y*stride;
1156 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
1157 + src[index+stride ]* frac_y )*s
1158 + r)>>(shift*2);
1159 }else{
1160 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
1161 dst[y*stride + x]= src[index ];
1162 }
1163 }
1164
1165 vx+= dxx;
1166 vy+= dyx;
1167 }
1168 ox += dxy;
1169 oy += dyy;
1170 }
1171}
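/* Editorial note (not in the original source): ff_gmc_c walks an affine
 * mapping in fixed point -- (vx,vy) advance by (dxx,dyx) per output sample
 * and (ox,oy) by (dxy,dyy) per output line -- and bilinearly interpolates
 * each output sample from its four source neighbours, clipping coordinates
 * that fall outside the source picture.
 */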
1172
1173static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1174 switch(width){
1175 case 2: put_pixels2_c (dst, src, stride, height); break;
1176 case 4: put_pixels4_c (dst, src, stride, height); break;
1177 case 8: put_pixels8_c (dst, src, stride, height); break;
1178 case 16:put_pixels16_c(dst, src, stride, height); break;
1179 }
1180}
1181
1182static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1183 int i,j;
1184 for (i=0; i < height; i++) {
1185 for (j=0; j < width; j++) {
1186 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
1187 }
1188 src += stride;
1189 dst += stride;
1190 }
1191}
1192
1193static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1194 int i,j;
1195 for (i=0; i < height; i++) {
1196 for (j=0; j < width; j++) {
1197 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
1198 }
1199 src += stride;
1200 dst += stride;
1201 }
1202}
1203
1204static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1205 int i,j;
1206 for (i=0; i < height; i++) {
1207 for (j=0; j < width; j++) {
1208 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
1209 }
1210 src += stride;
1211 dst += stride;
1212 }
1213}
1214
1215static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1216 int i,j;
1217 for (i=0; i < height; i++) {
1218 for (j=0; j < width; j++) {
1219 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
1220 }
1221 src += stride;
1222 dst += stride;
1223 }
1224}
1225
1226static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1227 int i,j;
1228 for (i=0; i < height; i++) {
1229 for (j=0; j < width; j++) {
1230 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1231 }
1232 src += stride;
1233 dst += stride;
1234 }
1235}
1236
1237static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1238 int i,j;
1239 for (i=0; i < height; i++) {
1240 for (j=0; j < width; j++) {
1241 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
1242 }
1243 src += stride;
1244 dst += stride;
1245 }
1246}
1247
1248static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1249 int i,j;
1250 for (i=0; i < height; i++) {
1251 for (j=0; j < width; j++) {
1252 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1253 }
1254 src += stride;
1255 dst += stride;
1256 }
1257}
1258
1259static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1260 int i,j;
1261 for (i=0; i < height; i++) {
1262 for (j=0; j < width; j++) {
1263 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
1264 }
1265 src += stride;
1266 dst += stride;
1267 }
1268}
1269
1270static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1271 switch(width){
1272 case 2: avg_pixels2_c (dst, src, stride, height); break;
1273 case 4: avg_pixels4_c (dst, src, stride, height); break;
1274 case 8: avg_pixels8_c (dst, src, stride, height); break;
1275 case 16:avg_pixels16_c(dst, src, stride, height); break;
1276 }
1277}
1278
1279static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1280 int i,j;
1281 for (i=0; i < height; i++) {
1282 for (j=0; j < width; j++) {
1283 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
1284 }
1285 src += stride;
1286 dst += stride;
1287 }
1288}
1289
1290static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1291 int i,j;
1292 for (i=0; i < height; i++) {
1293 for (j=0; j < width; j++) {
1294 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
1295 }
1296 src += stride;
1297 dst += stride;
1298 }
1299}
1300
1301static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1302 int i,j;
1303 for (i=0; i < height; i++) {
1304 for (j=0; j < width; j++) {
1305 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
1306 }
1307 src += stride;
1308 dst += stride;
1309 }
1310}
1311
1312static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1313 int i,j;
1314 for (i=0; i < height; i++) {
1315 for (j=0; j < width; j++) {
1316 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1317 }
1318 src += stride;
1319 dst += stride;
1320 }
1321}
1322
1323static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1324 int i,j;
1325 for (i=0; i < height; i++) {
1326 for (j=0; j < width; j++) {
1327 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1328 }
1329 src += stride;
1330 dst += stride;
1331 }
1332}
1333
1334static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1335 int i,j;
1336 for (i=0; i < height; i++) {
1337 for (j=0; j < width; j++) {
1338 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
1339 }
1340 src += stride;
1341 dst += stride;
1342 }
1343}
1344
1345static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1346 int i,j;
1347 for (i=0; i < height; i++) {
1348 for (j=0; j < width; j++) {
1349 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1350 }
1351 src += stride;
1352 dst += stride;
1353 }
1354}
1355
1356static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1357 int i,j;
1358 for (i=0; i < height; i++) {
1359 for (j=0; j < width; j++) {
1360 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1361 }
1362 src += stride;
1363 dst += stride;
1364 }
1365}
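/* Editorial note (not in the original source): the "tpel" (third-pel)
 * interpolators divide by 3 and 12 without a division: 683 ~= 2^11/3 and
 * 2731 ~= 2^15/12, so for example
 *
 *   (683 * (2*src[j] + src[j+1] + 1)) >> 11
 *
 * equals (2*src[j] + src[j+1] + 1) / 3 (integer division) for all 8-bit
 * sample values.  The mcXY suffix encodes the fractional position: X thirds
 * horizontally, Y thirds vertically.
 */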
1366#if 0
1367#define TPEL_WIDTH(width)\
1368static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1369 void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1370static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1371 void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1372static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1373 void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1374static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1375 void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1376static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1377 void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1378static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1379 void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1380static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1381 void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1382static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1383 void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1384static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1385 void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1386#endif
1387
1388#define H264_CHROMA_MC(OPNAME, OP)\
1389static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1390 const int A=(8-x)*(8-y);\
1391 const int B=( x)*(8-y);\
1392 const int C=(8-x)*( y);\
1393 const int D=( x)*( y);\
1394 int i;\
1395 \
1396 assert(x<8 && y<8 && x>=0 && y>=0);\
1397\
1398 if(D){\
1399 for(i=0; i<h; i++){\
1400 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1401 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1402 dst+= stride;\
1403 src+= stride;\
1404 }\
1405 }else{\
1406 const int E= B+C;\
1407 const int step= C ? stride : 1;\
1408 for(i=0; i<h; i++){\
1409 OP(dst[0], (A*src[0] + E*src[step+0]));\
1410 OP(dst[1], (A*src[1] + E*src[step+1]));\
1411 dst+= stride;\
1412 src+= stride;\
1413 }\
1414 }\
1415}\
1416\
1417static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1418 const int A=(8-x)*(8-y);\
1419 const int B=( x)*(8-y);\
1420 const int C=(8-x)*( y);\
1421 const int D=( x)*( y);\
1422 int i;\
1423 \
1424 assert(x<8 && y<8 && x>=0 && y>=0);\
1425\
1426 if(D){\
1427 for(i=0; i<h; i++){\
1428 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1429 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1430 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1431 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1432 dst+= stride;\
1433 src+= stride;\
1434 }\
1435 }else{\
1436 const int E= B+C;\
1437 const int step= C ? stride : 1;\
1438 for(i=0; i<h; i++){\
1439 OP(dst[0], (A*src[0] + E*src[step+0]));\
1440 OP(dst[1], (A*src[1] + E*src[step+1]));\
1441 OP(dst[2], (A*src[2] + E*src[step+2]));\
1442 OP(dst[3], (A*src[3] + E*src[step+3]));\
1443 dst+= stride;\
1444 src+= stride;\
1445 }\
1446 }\
1447}\
1448\
1449static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1450 const int A=(8-x)*(8-y);\
1451 const int B=( x)*(8-y);\
1452 const int C=(8-x)*( y);\
1453 const int D=( x)*( y);\
1454 int i;\
1455 \
1456 assert(x<8 && y<8 && x>=0 && y>=0);\
1457\
1458 if(D){\
1459 for(i=0; i<h; i++){\
1460 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1461 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1462 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1463 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1464 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1465 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1466 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1467 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
1468 dst+= stride;\
1469 src+= stride;\
1470 }\
1471 }else{\
1472 const int E= B+C;\
1473 const int step= C ? stride : 1;\
1474 for(i=0; i<h; i++){\
1475 OP(dst[0], (A*src[0] + E*src[step+0]));\
1476 OP(dst[1], (A*src[1] + E*src[step+1]));\
1477 OP(dst[2], (A*src[2] + E*src[step+2]));\
1478 OP(dst[3], (A*src[3] + E*src[step+3]));\
1479 OP(dst[4], (A*src[4] + E*src[step+4]));\
1480 OP(dst[5], (A*src[5] + E*src[step+5]));\
1481 OP(dst[6], (A*src[6] + E*src[step+6]));\
1482 OP(dst[7], (A*src[7] + E*src[step+7]));\
1483 dst+= stride;\
1484 src+= stride;\
1485 }\
1486 }\
1487}
1488
1489#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1490#define op_put(a, b) a = (((b) + 32)>>6)
1491
1492H264_CHROMA_MC(put_ , op_put)
1493H264_CHROMA_MC(avg_ , op_avg)
1494#undef op_avg
1495#undef op_put
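/* Editorial note (not in the original source): the H.264 chroma filter is
 * bilinear at 1/8-pel resolution; the weights satisfy A + B + C + D == 64,
 * so op_put's (b + 32) >> 6 renormalizes with round-to-nearest, and op_avg
 * then averages that result with the existing destination.  The no_rnd
 * variant below uses a bias of 32 - 4 instead, rounding slightly downward.
 */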
1496
1497static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1498 const int A=(8-x)*(8-y);
1499 const int B=( x)*(8-y);
1500 const int C=(8-x)*( y);
1501 const int D=( x)*( y);
1502 int i;
1503
1504 assert(x<8 && y<8 && x>=0 && y>=0);
1505
1506 for(i=0; i<h; i++)
1507 {
1508 dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
1509 dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
1510 dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
1511 dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
1512 dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
1513 dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
1514 dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
1515 dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
1516 dst+= stride;
1517 src+= stride;
1518 }
1519}
1520
1521#define QPEL_MC(r, OPNAME, RND, OP) \
1522static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1523 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1524 int i;\
1525 for(i=0; i<h; i++)\
1526 {\
1527 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1528 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1529 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1530 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1531 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1532 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1533 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1534 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1535 dst+=dstStride;\
1536 src+=srcStride;\
1537 }\
1538}\
1539\
1540static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1541 const int w=8;\
1542 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1543 int i;\
1544 for(i=0; i<w; i++)\
1545 {\
1546 const int src0= src[0*srcStride];\
1547 const int src1= src[1*srcStride];\
1548 const int src2= src[2*srcStride];\
1549 const int src3= src[3*srcStride];\
1550 const int src4= src[4*srcStride];\
1551 const int src5= src[5*srcStride];\
1552 const int src6= src[6*srcStride];\
1553 const int src7= src[7*srcStride];\
1554 const int src8= src[8*srcStride];\
1555 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1556 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1557 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1558 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1559 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1560 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1561 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1562 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1563 dst++;\
1564 src++;\
1565 }\
1566}\
1567\
1568static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1569 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1570 int i;\
1571 \
1572 for(i=0; i<h; i++)\
1573 {\
1574 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1575 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1576 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1577 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1578 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1579 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1580 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1581 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1582 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1583 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1584 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1585 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1586 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1587 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1588 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1589 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1590 dst+=dstStride;\
1591 src+=srcStride;\
1592 }\
1593}\
1594\
1595static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1596 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1597 int i;\
1598 const int w=16;\
1599 for(i=0; i<w; i++)\
1600 {\
1601 const int src0= src[0*srcStride];\
1602 const int src1= src[1*srcStride];\
1603 const int src2= src[2*srcStride];\
1604 const int src3= src[3*srcStride];\
1605 const int src4= src[4*srcStride];\
1606 const int src5= src[5*srcStride];\
1607 const int src6= src[6*srcStride];\
1608 const int src7= src[7*srcStride];\
1609 const int src8= src[8*srcStride];\
1610 const int src9= src[9*srcStride];\
1611 const int src10= src[10*srcStride];\
1612 const int src11= src[11*srcStride];\
1613 const int src12= src[12*srcStride];\
1614 const int src13= src[13*srcStride];\
1615 const int src14= src[14*srcStride];\
1616 const int src15= src[15*srcStride];\
1617 const int src16= src[16*srcStride];\
1618 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1619 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1620 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1621 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1622 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1623 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1624 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1625 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1626 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1627 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1628 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1629 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1630 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1631 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1632 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1633 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1634 dst++;\
1635 src++;\
1636 }\
1637}\
1638\
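 /* Quarter-pel motion compensation entry points. The two-digit suffix is\
    the fractional position in quarter-pel units, horizontal then vertical:\
    mc00 is a plain copy, mc20 a horizontal half-pel filter, mc02 a vertical\
    one, and the remaining positions blend filtered planes with pixels*_l2().\
    The ff_*_old_c variants instead average four planes with pixels*_l4();\
    they appear to be kept for bit-exact compatibility with older encoders. */\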
1639static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1640 OPNAME ## pixels8_c(dst, src, stride, 8);\
1641}\
1642\
1643static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1644 uint8_t half[64];\
1645 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1646 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1647}\
1648\
1649static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1650 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1651}\
1652\
1653static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1654 uint8_t half[64];\
1655 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1656 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1657}\
1658\
1659static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1660 uint8_t full[16*9];\
1661 uint8_t half[64];\
1662 copy_block9(full, src, 16, stride, 9);\
1663 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1664 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1665}\
1666\
1667static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1668 uint8_t full[16*9];\
1669 copy_block9(full, src, 16, stride, 9);\
1670 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1671}\
1672\
1673static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1674 uint8_t full[16*9];\
1675 uint8_t half[64];\
1676 copy_block9(full, src, 16, stride, 9);\
1677 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1678 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1679}\
1680void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1681 uint8_t full[16*9];\
1682 uint8_t halfH[72];\
1683 uint8_t halfV[64];\
1684 uint8_t halfHV[64];\
1685 copy_block9(full, src, 16, stride, 9);\
1686 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1687 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1688 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1689 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1690}\
1691static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1692 uint8_t full[16*9];\
1693 uint8_t halfH[72];\
1694 uint8_t halfHV[64];\
1695 copy_block9(full, src, 16, stride, 9);\
1696 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1697 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1698 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1699 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1700}\
1701void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1702 uint8_t full[16*9];\
1703 uint8_t halfH[72];\
1704 uint8_t halfV[64];\
1705 uint8_t halfHV[64];\
1706 copy_block9(full, src, 16, stride, 9);\
1707 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1708 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1709 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1710 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1711}\
1712static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1713 uint8_t full[16*9];\
1714 uint8_t halfH[72];\
1715 uint8_t halfHV[64];\
1716 copy_block9(full, src, 16, stride, 9);\
1717 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1718 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1719 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1720 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1721}\
1722void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1723 uint8_t full[16*9];\
1724 uint8_t halfH[72];\
1725 uint8_t halfV[64];\
1726 uint8_t halfHV[64];\
1727 copy_block9(full, src, 16, stride, 9);\
1728 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1729 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1730 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1731 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1732}\
1733static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1734 uint8_t full[16*9];\
1735 uint8_t halfH[72];\
1736 uint8_t halfHV[64];\
1737 copy_block9(full, src, 16, stride, 9);\
1738 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1739 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1740 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1741 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1742}\
1743void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1744 uint8_t full[16*9];\
1745 uint8_t halfH[72];\
1746 uint8_t halfV[64];\
1747 uint8_t halfHV[64];\
1748 copy_block9(full, src, 16, stride, 9);\
1749 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1750 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1751 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1752 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1753}\
1754static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1755 uint8_t full[16*9];\
1756 uint8_t halfH[72];\
1757 uint8_t halfHV[64];\
1758 copy_block9(full, src, 16, stride, 9);\
1759 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1760 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1761 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1762 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1763}\
1764static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1765 uint8_t halfH[72];\
1766 uint8_t halfHV[64];\
1767 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1768 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1769 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1770}\
1771static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1772 uint8_t halfH[72];\
1773 uint8_t halfHV[64];\
1774 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1775 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1776 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1777}\
1778void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1779 uint8_t full[16*9];\
1780 uint8_t halfH[72];\
1781 uint8_t halfV[64];\
1782 uint8_t halfHV[64];\
1783 copy_block9(full, src, 16, stride, 9);\
1784 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1785 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1786 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1787 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1788}\
1789static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1790 uint8_t full[16*9];\
1791 uint8_t halfH[72];\
1792 copy_block9(full, src, 16, stride, 9);\
1793 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1794 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1795 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1796}\
1797void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1798 uint8_t full[16*9];\
1799 uint8_t halfH[72];\
1800 uint8_t halfV[64];\
1801 uint8_t halfHV[64];\
1802 copy_block9(full, src, 16, stride, 9);\
1803 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1804 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1805 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1806 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1807}\
1808static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1809 uint8_t full[16*9];\
1810 uint8_t halfH[72];\
1811 copy_block9(full, src, 16, stride, 9);\
1812 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1813 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1814 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1815}\
1816static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1817 uint8_t halfH[72];\
1818 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1819 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1820}\
1821static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1822 OPNAME ## pixels16_c(dst, src, stride, 16);\
1823}\
1824\
1825static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1826 uint8_t half[256];\
1827 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1828 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1829}\
1830\
1831static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1832 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1833}\
1834\
1835static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1836 uint8_t half[256];\
1837 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1838 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1839}\
1840\
1841static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1842 uint8_t full[24*17];\
1843 uint8_t half[256];\
1844 copy_block17(full, src, 24, stride, 17);\
1845 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1846 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1847}\
1848\
1849static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1850 uint8_t full[24*17];\
1851 copy_block17(full, src, 24, stride, 17);\
1852 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1853}\
1854\
1855static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1856 uint8_t full[24*17];\
1857 uint8_t half[256];\
1858 copy_block17(full, src, 24, stride, 17);\
1859 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1860 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1861}\
1862void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1863 uint8_t full[24*17];\
1864 uint8_t halfH[272];\
1865 uint8_t halfV[256];\
1866 uint8_t halfHV[256];\
1867 copy_block17(full, src, 24, stride, 17);\
1868 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1869 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1870 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1871 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1872}\
1873static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1874 uint8_t full[24*17];\
1875 uint8_t halfH[272];\
1876 uint8_t halfHV[256];\
1877 copy_block17(full, src, 24, stride, 17);\
1878 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1879 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1880 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1881 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1882}\
1883void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1884 uint8_t full[24*17];\
1885 uint8_t halfH[272];\
1886 uint8_t halfV[256];\
1887 uint8_t halfHV[256];\
1888 copy_block17(full, src, 24, stride, 17);\
1889 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1890 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1891 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1892 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1893}\
1894static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1895 uint8_t full[24*17];\
1896 uint8_t halfH[272];\
1897 uint8_t halfHV[256];\
1898 copy_block17(full, src, 24, stride, 17);\
1899 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1900 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1901 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1902 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1903}\
1904void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1905 uint8_t full[24*17];\
1906 uint8_t halfH[272];\
1907 uint8_t halfV[256];\
1908 uint8_t halfHV[256];\
1909 copy_block17(full, src, 24, stride, 17);\
1910 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1911 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1912 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1913 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1914}\
1915static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1916 uint8_t full[24*17];\
1917 uint8_t halfH[272];\
1918 uint8_t halfHV[256];\
1919 copy_block17(full, src, 24, stride, 17);\
1920 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1921 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1922 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1923 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1924}\
1925void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1926 uint8_t full[24*17];\
1927 uint8_t halfH[272];\
1928 uint8_t halfV[256];\
1929 uint8_t halfHV[256];\
1930 copy_block17(full, src, 24, stride, 17);\
1931 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1932 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1933 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1934 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1935}\
1936static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1937 uint8_t full[24*17];\
1938 uint8_t halfH[272];\
1939 uint8_t halfHV[256];\
1940 copy_block17(full, src, 24, stride, 17);\
1941 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1942 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1943 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1944 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1945}\
1946static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1947 uint8_t halfH[272];\
1948 uint8_t halfHV[256];\
1949 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1950 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1951 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1952}\
1953static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1954 uint8_t halfH[272];\
1955 uint8_t halfHV[256];\
1956 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1957 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1958 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1959}\
1960void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1961 uint8_t full[24*17];\
1962 uint8_t halfH[272];\
1963 uint8_t halfV[256];\
1964 uint8_t halfHV[256];\
1965 copy_block17(full, src, 24, stride, 17);\
1966 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1967 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1968 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1969 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1970}\
1971static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1972 uint8_t full[24*17];\
1973 uint8_t halfH[272];\
1974 copy_block17(full, src, 24, stride, 17);\
1975 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1976 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1977 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1978}\
1979void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1980 uint8_t full[24*17];\
1981 uint8_t halfH[272];\
1982 uint8_t halfV[256];\
1983 uint8_t halfHV[256];\
1984 copy_block17(full, src, 24, stride, 17);\
1985 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1986 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1987 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1988 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1989}\
1990static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1991 uint8_t full[24*17];\
1992 uint8_t halfH[272];\
1993 copy_block17(full, src, 24, stride, 17);\
1994 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1995 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1996 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1997}\
1998static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1999 uint8_t halfH[272];\
2000 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2001 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2002}
2003
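/* Rounding/store operators plugged into QPEL_MC. The 8-tap kernel
 * (-1,3,-6,20,20,-6,3,-1) sums to 32, so the raw filter output is 32x the
 * pixel value: "+16 >> 5" rounds to nearest and cm[] (the crop table)
 * clamps the result to 0..255. The no_rnd forms add only 15, i.e. they
 * round down, and the avg forms additionally average with the pixels
 * already in dst. For example, a flat area of value 100 filters to 3200,
 * and (3200 + 16) >> 5 == 100 again. */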
2004#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2005#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2006#define op_put(a, b) a = cm[((b) + 16)>>5]
2007#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2008
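/* Instantiate put_, put_no_rnd_ and avg_ versions of every qpel function
 * above; the avg_no_rnd_ set is left commented out below. */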
2009QPEL_MC(0, put_ , _ , op_put)
2010QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2011QPEL_MC(0, avg_ , _ , op_avg)
2012//QPEL_MC(1, avg_no_rnd , _ , op_avg)
2013#undef op_avg
2014#undef op_avg_no_rnd
2015#undef op_put
2016#undef op_put_no_rnd
2017
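/* H.264 6-tap half-pel lowpass (taps 1,-5,20,20,-5,1, sum 32). The hv
 * variants keep the horizontally filtered rows unnormalized in an int16_t
 * tmp[] buffer and run the vertical pass on that, so their output is
 * scaled by 32*32 = 1024; that is what the separate OP2 store macro
 * ("+512 >> 10") is for. */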
2018#if 1
2019#define H264_LOWPASS(OPNAME, OP, OP2) \
2020static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2021 const int h=2;\
2022 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2023 int i;\
2024 for(i=0; i<h; i++)\
2025 {\
2026 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2027 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2028 dst+=dstStride;\
2029 src+=srcStride;\
2030 }\
2031}\
2032\
2033static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2034 const int w=2;\
2035 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2036 int i;\
2037 for(i=0; i<w; i++)\
2038 {\
2039 const int srcB= src[-2*srcStride];\
2040 const int srcA= src[-1*srcStride];\
2041 const int src0= src[0 *srcStride];\
2042 const int src1= src[1 *srcStride];\
2043 const int src2= src[2 *srcStride];\
2044 const int src3= src[3 *srcStride];\
2045 const int src4= src[4 *srcStride];\
2046 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2047 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2048 dst++;\
2049 src++;\
2050 }\
2051}\
2052\
2053static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2054 const int h=2;\
2055 const int w=2;\
2056 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2057 int i;\
2058 src -= 2*srcStride;\
2059 for(i=0; i<h+5; i++)\
2060 {\
2061 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2062 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2063 tmp+=tmpStride;\
2064 src+=srcStride;\
2065 }\
2066 tmp -= tmpStride*(h+5-2);\
2067 for(i=0; i<w; i++)\
2068 {\
2069 const int tmpB= tmp[-2*tmpStride];\
2070 const int tmpA= tmp[-1*tmpStride];\
2071 const int tmp0= tmp[0 *tmpStride];\
2072 const int tmp1= tmp[1 *tmpStride];\
2073 const int tmp2= tmp[2 *tmpStride];\
2074 const int tmp3= tmp[3 *tmpStride];\
2075 const int tmp4= tmp[4 *tmpStride];\
2076 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2077 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2078 dst++;\
2079 tmp++;\
2080 }\
2081}\
2082static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2083 const int h=4;\
2084 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2085 int i;\
2086 for(i=0; i<h; i++)\
2087 {\
2088 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2089 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2090 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2091 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2092 dst+=dstStride;\
2093 src+=srcStride;\
2094 }\
2095}\
2096\
2097static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2098 const int w=4;\
2099 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2100 int i;\
2101 for(i=0; i<w; i++)\
2102 {\
2103 const int srcB= src[-2*srcStride];\
2104 const int srcA= src[-1*srcStride];\
2105 const int src0= src[0 *srcStride];\
2106 const int src1= src[1 *srcStride];\
2107 const int src2= src[2 *srcStride];\
2108 const int src3= src[3 *srcStride];\
2109 const int src4= src[4 *srcStride];\
2110 const int src5= src[5 *srcStride];\
2111 const int src6= src[6 *srcStride];\
2112 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2113 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2114 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2115 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2116 dst++;\
2117 src++;\
2118 }\
2119}\
2120\
2121static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2122 const int h=4;\
2123 const int w=4;\
2124 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2125 int i;\
2126 src -= 2*srcStride;\
2127 for(i=0; i<h+5; i++)\
2128 {\
2129 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2130 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2131 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2132 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2133 tmp+=tmpStride;\
2134 src+=srcStride;\
2135 }\
2136 tmp -= tmpStride*(h+5-2);\
2137 for(i=0; i<w; i++)\
2138 {\
2139 const int tmpB= tmp[-2*tmpStride];\
2140 const int tmpA= tmp[-1*tmpStride];\
2141 const int tmp0= tmp[0 *tmpStride];\
2142 const int tmp1= tmp[1 *tmpStride];\
2143 const int tmp2= tmp[2 *tmpStride];\
2144 const int tmp3= tmp[3 *tmpStride];\
2145 const int tmp4= tmp[4 *tmpStride];\
2146 const int tmp5= tmp[5 *tmpStride];\
2147 const int tmp6= tmp[6 *tmpStride];\
2148 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2149 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2150 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2151 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2152 dst++;\
2153 tmp++;\
2154 }\
2155}\
2156\
2157static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2158 const int h=8;\
2159 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2160 int i;\
2161 for(i=0; i<h; i++)\
2162 {\
2163 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2164 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2165 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2166 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2167 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2168 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2169 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2170 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2171 dst+=dstStride;\
2172 src+=srcStride;\
2173 }\
2174}\
2175\
2176static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2177 const int w=8;\
2178 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2179 int i;\
2180 for(i=0; i<w; i++)\
2181 {\
2182 const int srcB= src[-2*srcStride];\
2183 const int srcA= src[-1*srcStride];\
2184 const int src0= src[0 *srcStride];\
2185 const int src1= src[1 *srcStride];\
2186 const int src2= src[2 *srcStride];\
2187 const int src3= src[3 *srcStride];\
2188 const int src4= src[4 *srcStride];\
2189 const int src5= src[5 *srcStride];\
2190 const int src6= src[6 *srcStride];\
2191 const int src7= src[7 *srcStride];\
2192 const int src8= src[8 *srcStride];\
2193 const int src9= src[9 *srcStride];\
2194 const int src10=src[10*srcStride];\
2195 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2196 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2197 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2198 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2199 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2200 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2201 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2202 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2203 dst++;\
2204 src++;\
2205 }\
2206}\
2207\
2208static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2209 const int h=8;\
2210 const int w=8;\
2211 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2212 int i;\
2213 src -= 2*srcStride;\
2214 for(i=0; i<h+5; i++)\
2215 {\
2216 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2217 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2218 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2219 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2220 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2221 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2222 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2223 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2224 tmp+=tmpStride;\
2225 src+=srcStride;\
2226 }\
2227 tmp -= tmpStride*(h+5-2);\
2228 for(i=0; i<w; i++)\
2229 {\
2230 const int tmpB= tmp[-2*tmpStride];\
2231 const int tmpA= tmp[-1*tmpStride];\
2232 const int tmp0= tmp[0 *tmpStride];\
2233 const int tmp1= tmp[1 *tmpStride];\
2234 const int tmp2= tmp[2 *tmpStride];\
2235 const int tmp3= tmp[3 *tmpStride];\
2236 const int tmp4= tmp[4 *tmpStride];\
2237 const int tmp5= tmp[5 *tmpStride];\
2238 const int tmp6= tmp[6 *tmpStride];\
2239 const int tmp7= tmp[7 *tmpStride];\
2240 const int tmp8= tmp[8 *tmpStride];\
2241 const int tmp9= tmp[9 *tmpStride];\
2242 const int tmp10=tmp[10*tmpStride];\
2243 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2244 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2245 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2246 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2247 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2248 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2249 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2250 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2251 dst++;\
2252 tmp++;\
2253 }\
2254}\
2255\
2256static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2257 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2258 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2259 src += 8*srcStride;\
2260 dst += 8*dstStride;\
2261 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2262 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2263}\
2264\
2265static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2266 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2267 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2268 src += 8*srcStride;\
2269 dst += 8*dstStride;\
2270 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2271 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2272}\
2273\
2274static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2275 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2276 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2277 src += 8*srcStride;\
2278 dst += 8*dstStride;\
2279 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2280 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2281}\
2282
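/* Generate the 16 quarter-pel position functions for one block size. As in
 * the MPEG-4 code above, mcXY means a fractional offset of X/4 horizontally
 * and Y/4 vertically: the half-pel positions come straight from the h/v/hv
 * lowpass filters, and the quarter-pel positions average two neighbouring
 * half-pel or integer planes with pixels*_l2(), matching the two-stage
 * interpolation of the H.264 spec. */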
2283#define H264_MC(OPNAME, SIZE) \
2284static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2285 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2286}\
2287\
2288static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2289 uint8_t half[SIZE*SIZE];\
2290 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2291 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2292}\
2293\
2294static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2295 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2296}\
2297\
2298static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2299 uint8_t half[SIZE*SIZE];\
2300 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2301 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2302}\
2303\
2304static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2305 uint8_t full[SIZE*(SIZE+5)];\
2306 uint8_t * const full_mid= full + SIZE*2;\
2307 uint8_t half[SIZE*SIZE];\
2308 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2309 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2310 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2311}\
2312\
2313static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2314 uint8_t full[SIZE*(SIZE+5)];\
2315 uint8_t * const full_mid= full + SIZE*2;\
2316 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2317 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2318}\
2319\
2320static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2321 uint8_t full[SIZE*(SIZE+5)];\
2322 uint8_t * const full_mid= full + SIZE*2;\
2323 uint8_t half[SIZE*SIZE];\
2324 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2325 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2326 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2327}\
2328\
2329static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2330 uint8_t full[SIZE*(SIZE+5)];\
2331 uint8_t * const full_mid= full + SIZE*2;\
2332 uint8_t halfH[SIZE*SIZE];\
2333 uint8_t halfV[SIZE*SIZE];\
2334 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2335 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2336 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2337 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2338}\
2339\
2340static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2341 uint8_t full[SIZE*(SIZE+5)];\
2342 uint8_t * const full_mid= full + SIZE*2;\
2343 uint8_t halfH[SIZE*SIZE];\
2344 uint8_t halfV[SIZE*SIZE];\
2345 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2346 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2347 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2348 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2349}\
2350\
2351static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2352 uint8_t full[SIZE*(SIZE+5)];\
2353 uint8_t * const full_mid= full + SIZE*2;\
2354 uint8_t halfH[SIZE*SIZE];\
2355 uint8_t halfV[SIZE*SIZE];\
2356 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2357 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2358 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2359 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2360}\
2361\
2362static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2363 uint8_t full[SIZE*(SIZE+5)];\
2364 uint8_t * const full_mid= full + SIZE*2;\
2365 uint8_t halfH[SIZE*SIZE];\
2366 uint8_t halfV[SIZE*SIZE];\
2367 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2368 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2369 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2370 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2371}\
2372\
2373static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2374 int16_t tmp[SIZE*(SIZE+5)];\
2375 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2376}\
2377\
2378static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2379 int16_t tmp[SIZE*(SIZE+5)];\
2380 uint8_t halfH[SIZE*SIZE];\
2381 uint8_t halfHV[SIZE*SIZE];\
2382 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2383 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2384 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2385}\
2386\
2387static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2388 int16_t tmp[SIZE*(SIZE+5)];\
2389 uint8_t halfH[SIZE*SIZE];\
2390 uint8_t halfHV[SIZE*SIZE];\
2391 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2392 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2393 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2394}\
2395\
2396static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2397 uint8_t full[SIZE*(SIZE+5)];\
2398 uint8_t * const full_mid= full + SIZE*2;\
2399 int16_t tmp[SIZE*(SIZE+5)];\
2400 uint8_t halfV[SIZE*SIZE];\
2401 uint8_t halfHV[SIZE*SIZE];\
2402 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2403 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2404 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2405 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2406}\
2407\
2408static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2409 uint8_t full[SIZE*(SIZE+5)];\
2410 uint8_t * const full_mid= full + SIZE*2;\
2411 int16_t tmp[SIZE*(SIZE+5)];\
2412 uint8_t halfV[SIZE*SIZE];\
2413 uint8_t halfHV[SIZE*SIZE];\
2414 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2415 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2416 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2417 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2418}\
2419
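/* Store macros for the H.264 filters: op_* normalizes the 1/32-scaled
 * single-pass output ("+16 >> 5"), op2_* the 1/1024-scaled hv output
 * ("+512 >> 10"); the avg forms then average with the existing dst. Note
 * that no avg_ 2x2 set is instantiated below. */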
2420#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2421//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2422#define op_put(a, b) a = cm[((b) + 16)>>5]
2423#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2424#define op2_put(a, b) a = cm[((b) + 512)>>10]
2425
2426H264_LOWPASS(put_ , op_put, op2_put)
2427H264_LOWPASS(avg_ , op_avg, op2_avg)
2428H264_MC(put_, 2)
2429H264_MC(put_, 4)
2430H264_MC(put_, 8)
2431H264_MC(put_, 16)
2432H264_MC(avg_, 4)
2433H264_MC(avg_, 8)
2434H264_MC(avg_, 16)
2435
2436#undef op_avg
2437#undef op_put
2438#undef op2_avg
2439#undef op2_put
2440#endif
2441
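/* H.264 explicit weighted prediction. op_scale1 is the unidirectional form
 *   block[x] = clip(((block[x]*weight + 2^(log2_denom-1)) >> log2_denom) + offset)
 * with the offset pre-scaled and the rounding term folded in, so the
 * per-pixel op is a single multiply-add-shift. op_scale2 is the
 * bidirectional (biweight) form over two sources with a (log2_denom+1)-bit
 * shift. Worked example: weight=3, offset=1, log2_denom=1 maps a pixel of
 * 100 to clip(((100*3 + 1) >> 1) + 1) = 151. */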
2442#define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2443#define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2444#define H264_WEIGHT(W,H) \
2445static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2446 int y; \
2447 offset <<= log2_denom; \
2448 if(log2_denom) offset += 1<<(log2_denom-1); \
2449 for(y=0; y<H; y++, block += stride){ \
2450 op_scale1(0); \
2451 op_scale1(1); \
2452 if(W==2) continue; \
2453 op_scale1(2); \
2454 op_scale1(3); \
2455 if(W==4) continue; \
2456 op_scale1(4); \
2457 op_scale1(5); \
2458 op_scale1(6); \
2459 op_scale1(7); \
2460 if(W==8) continue; \
2461 op_scale1(8); \
2462 op_scale1(9); \
2463 op_scale1(10); \
2464 op_scale1(11); \
2465 op_scale1(12); \
2466 op_scale1(13); \
2467 op_scale1(14); \
2468 op_scale1(15); \
2469 } \
2470} \
2471static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2472 int y; \
2473 offset = ((offset + 1) | 1) << log2_denom; \
2474 for(y=0; y<H; y++, dst += stride, src += stride){ \
2475 op_scale2(0); \
2476 op_scale2(1); \
2477 if(W==2) continue; \
2478 op_scale2(2); \
2479 op_scale2(3); \
2480 if(W==4) continue; \
2481 op_scale2(4); \
2482 op_scale2(5); \
2483 op_scale2(6); \
2484 op_scale2(7); \
2485 if(W==8) continue; \
2486 op_scale2(8); \
2487 op_scale2(9); \
2488 op_scale2(10); \
2489 op_scale2(11); \
2490 op_scale2(12); \
2491 op_scale2(13); \
2492 op_scale2(14); \
2493 op_scale2(15); \
2494 } \
2495}
2496
2497H264_WEIGHT(16,16)
2498H264_WEIGHT(16,8)
2499H264_WEIGHT(8,16)
2500H264_WEIGHT(8,8)
2501H264_WEIGHT(8,4)
2502H264_WEIGHT(4,8)
2503H264_WEIGHT(4,4)
2504H264_WEIGHT(4,2)
2505H264_WEIGHT(2,4)
2506H264_WEIGHT(2,2)
2507
2508#undef op_scale1
2509#undef op_scale2
2510#undef H264_WEIGHT
2511
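/* WMV2 half-pel interpolation: a 4-tap (-1,9,9,-1)/16 kernel, i.e.
 * cm[(9*(a+b) - (c+d) + 8) >> 4], with +8 for round-to-nearest and the
 * crop table clamping the result to 0..255. */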
2512static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2513 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2514 int i;
2515
2516 for(i=0; i<h; i++){
2517 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2518 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2519 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2520 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2521 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2522 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2523 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2524 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2525 dst+=dstStride;
2526 src+=srcStride;
2527 }
2528}
2529
2530#if CONFIG_CAVS_DECODER
2531/* CAVS (AVS video) specific */
2532void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2533
2534void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2535 put_pixels8_c(dst, src, stride, 8);
2536}
2537void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2538 avg_pixels8_c(dst, src, stride, 8);
2539}
2540void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2541 put_pixels16_c(dst, src, stride, 16);
2542}
2543void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2544 avg_pixels16_c(dst, src, stride, 16);
2545}
2546#endif /* CONFIG_CAVS_DECODER */
2547
2548#if CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER
2549/* VC-1 specific */
2550void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2551
2552void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2553 put_pixels8_c(dst, src, stride, 8);
2554}
2555#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2556
2557void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2558
2559/* H.264 specific */
2560void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2561
2562#if CONFIG_RV30_DECODER
2563void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
2564#endif /* CONFIG_RV30_DECODER */
2565
2566#if CONFIG_RV40_DECODER
2567static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2568 put_pixels16_xy2_c(dst, src, stride, 16);
2569}
2570static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2571 avg_pixels16_xy2_c(dst, src, stride, 16);
2572}
2573static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2574 put_pixels8_xy2_c(dst, src, stride, 8);
2575}
2576static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2577 avg_pixels8_xy2_c(dst, src, stride, 8);
2578}
2579
2580void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
2581#endif /* CONFIG_RV40_DECODER */
2582
2583static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2584 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2585 int i;
2586
2587 for(i=0; i<w; i++){
2588 const int src_1= src[ -srcStride];
2589 const int src0 = src[0 ];
2590 const int src1 = src[ srcStride];
2591 const int src2 = src[2*srcStride];
2592 const int src3 = src[3*srcStride];
2593 const int src4 = src[4*srcStride];
2594 const int src5 = src[5*srcStride];
2595 const int src6 = src[6*srcStride];
2596 const int src7 = src[7*srcStride];
2597 const int src8 = src[8*srcStride];
2598 const int src9 = src[9*srcStride];
2599 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2600 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2601 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2602 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2603 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2604 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2605 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2606 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2607 src++;
2608 dst++;
2609 }
2610}
2611
2612static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2613 put_pixels8_c(dst, src, stride, 8);
2614}
2615
2616static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2617 uint8_t half[64];
2618 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2619 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2620}
2621
2622static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2623 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2624}
2625
2626static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2627 uint8_t half[64];
2628 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2629 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2630}
2631
2632static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2633 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2634}
2635
2636static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2637 uint8_t halfH[88];
2638 uint8_t halfV[64];
2639 uint8_t halfHV[64];
2640 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2641 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2642 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2643 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2644}
2645static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2646 uint8_t halfH[88];
2647 uint8_t halfV[64];
2648 uint8_t halfHV[64];
2649 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2650 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2651 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2652 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2653}
2654static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2655 uint8_t halfH[88];
2656 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2657 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2658}
2659
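/* H.263 in-loop deblocking (Annex J style). d measures the step across the
 * block edge; d1 follows a ramp that peaks at |d| == strength and falls
 * back to zero for large |d|, so genuine image edges pass through
 * unfiltered. The "p&256" test is a branchless clamp: for values in
 * -256..511, ~(p>>31) yields 0 for negatives and 255 (as a uint8_t) for
 * overflows. */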
2660static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2661 if(CONFIG_ANY_H263) {
2662 int x;
2663 const int strength= ff_h263_loop_filter_strength[qscale];
2664
2665 for(x=0; x<8; x++){
2666 int d1, d2, ad1;
2667 int p0= src[x-2*stride];
2668 int p1= src[x-1*stride];
2669 int p2= src[x+0*stride];
2670 int p3= src[x+1*stride];
2671 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2672
2673 if (d<-2*strength) d1= 0;
2674 else if(d<- strength) d1=-2*strength - d;
2675 else if(d< strength) d1= d;
2676 else if(d< 2*strength) d1= 2*strength - d;
2677 else d1= 0;
2678
2679 p1 += d1;
2680 p2 -= d1;
2681 if(p1&256) p1= ~(p1>>31);
2682 if(p2&256) p2= ~(p2>>31);
2683
2684 src[x-1*stride] = p1;
2685 src[x+0*stride] = p2;
2686
2687 ad1= FFABS(d1)>>1;
2688
2689 d2= av_clip((p0-p3)/4, -ad1, ad1);
2690
2691 src[x-2*stride] = p0 - d2;
2692 src[x+ stride] = p3 + d2;
2693 }
2694 }
2695}
2696
2697static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2698 if(CONFIG_ANY_H263) {
2699 int y;
2700 const int strength= ff_h263_loop_filter_strength[qscale];
2701
2702 for(y=0; y<8; y++){
2703 int d1, d2, ad1;
2704 int p0= src[y*stride-2];
2705 int p1= src[y*stride-1];
2706 int p2= src[y*stride+0];
2707 int p3= src[y*stride+1];
2708 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2709
2710 if (d<-2*strength) d1= 0;
2711 else if(d<- strength) d1=-2*strength - d;
2712 else if(d< strength) d1= d;
2713 else if(d< 2*strength) d1= 2*strength - d;
2714 else d1= 0;
2715
2716 p1 += d1;
2717 p2 -= d1;
2718 if(p1&256) p1= ~(p1>>31);
2719 if(p2&256) p2= ~(p2>>31);
2720
2721 src[y*stride-1] = p1;
2722 src[y*stride+0] = p2;
2723
2724 ad1= FFABS(d1)>>1;
2725
2726 d2= av_clip((p0-p3)/4, -ad1, ad1);
2727
2728 src[y*stride-2] = p0 - d2;
2729 src[y*stride+1] = p3 + d2;
2730 }
2731 }
2732}
2733
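/* H.261 in-loop filter: a separable [1 2 1]/4 smoother over the 8x8 block.
 * The vertical pass writes 4x-scaled values into temp[] (rows 0 and 7 are
 * copied with the same 4x gain), then the horizontal pass normalizes:
 * border columns divide by 4, interior pixels by 16. */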
2734static void h261_loop_filter_c(uint8_t *src, int stride){
2735 int x,y,xy,yz;
2736 int temp[64];
2737
2738 for(x=0; x<8; x++){
2739 temp[x ] = 4*src[x ];
2740 temp[x + 7*8] = 4*src[x + 7*stride];
2741 }
2742 for(y=1; y<7; y++){
2743 for(x=0; x<8; x++){
2744 xy = y * stride + x;
2745 yz = y * 8 + x;
2746 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2747 }
2748 }
2749
2750 for(y=0; y<8; y++){
2751 src[ y*stride] = (temp[ y*8] + 2)>>2;
2752 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2753 for(x=1; x<7; x++){
2754 xy = y * stride + x;
2755 yz = y * 8 + x;
2756 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
2757 }
2758 }
2759}
2760
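/* H.264 normative luma deblocking for boundary strength < 4. tc0[] carries
 * one clipping strength per group of four edge pixels (negative = skip the
 * group); a line is filtered only if |p0-q0| < alpha and both second-pixel
 * gaps are below beta. When p2 or q2 is also close to p0/q0, that side's
 * p1/q1 is adjusted too and the clipping range tc is widened by one. */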
2761static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2762{
2763 int i, d;
2764 for( i = 0; i < 4; i++ ) {
2765 if( tc0[i] < 0 ) {
2766 pix += 4*ystride;
2767 continue;
2768 }
2769 for( d = 0; d < 4; d++ ) {
2770 const int p0 = pix[-1*xstride];
2771 const int p1 = pix[-2*xstride];
2772 const int p2 = pix[-3*xstride];
2773 const int q0 = pix[0];
2774 const int q1 = pix[1*xstride];
2775 const int q2 = pix[2*xstride];
2776
2777 if( FFABS( p0 - q0 ) < alpha &&
2778 FFABS( p1 - p0 ) < beta &&
2779 FFABS( q1 - q0 ) < beta ) {
2780
2781 int tc = tc0[i];
2782 int i_delta;
2783
2784 if( FFABS( p2 - p0 ) < beta ) {
2785 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2786 tc++;
2787 }
2788 if( FFABS( q2 - q0 ) < beta ) {
2789 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2790 tc++;
2791 }
2792
2793 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2794 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
2795 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
2796 }
2797 pix += ystride;
2798 }
2799 }
2800}
2801static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2802{
2803 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2804}
2805static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2806{
2807 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
2808}
2809
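/* Strong (intra, boundary strength 4) luma deblocking: no tc clipping. If
 * the edge step is small enough (|p0-q0| < (alpha>>2)+2) and a side is
 * smooth, up to three pixels on that side are replaced by 4/5-tap smoothed
 * values; otherwise only p0/q0 get the weak 3-tap filter. */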
2810static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2811{
2812 int d;
2813 for( d = 0; d < 16; d++ ) {
2814 const int p2 = pix[-3*xstride];
2815 const int p1 = pix[-2*xstride];
2816 const int p0 = pix[-1*xstride];
2817
2818 const int q0 = pix[ 0*xstride];
2819 const int q1 = pix[ 1*xstride];
2820 const int q2 = pix[ 2*xstride];
2821
2822 if( FFABS( p0 - q0 ) < alpha &&
2823 FFABS( p1 - p0 ) < beta &&
2824 FFABS( q1 - q0 ) < beta ) {
2825
2826 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
2827 if( FFABS( p2 - p0 ) < beta)
2828 {
2829 const int p3 = pix[-4*xstride];
2830 /* p0', p1', p2' */
2831 pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
2832 pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
2833 pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
2834 } else {
2835 /* p0' */
2836 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
2837 }
2838 if( FFABS( q2 - q0 ) < beta)
2839 {
2840 const int q3 = pix[3*xstride];
2841 /* q0', q1', q2' */
2842 pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
2843 pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
2844 pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
2845 } else {
2846 /* q0' */
2847 pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
2848 }
2849 }else{
2850 /* p0', q0' */
2851 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
2852 pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
2853 }
2854 }
2855 pix += ystride;
2856 }
2857}
2858static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2859{
2860 h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
2861}
2862static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2863{
2864 h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
2865}
2866
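/* Chroma deblocking: same alpha/beta gating as luma, but only p0/q0 are
 * ever modified and each tc0 entry covers two pixels instead of four. */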
2867static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2868{
2869 int i, d;
2870 for( i = 0; i < 4; i++ ) {
2871 const int tc = tc0[i];
2872 if( tc <= 0 ) {
2873 pix += 2*ystride;
2874 continue;
2875 }
2876 for( d = 0; d < 2; d++ ) {
2877 const int p0 = pix[-1*xstride];
2878 const int p1 = pix[-2*xstride];
2879 const int q0 = pix[0];
2880 const int q1 = pix[1*xstride];
2881
2882 if( FFABS( p0 - q0 ) < alpha &&
2883 FFABS( p1 - p0 ) < beta &&
2884 FFABS( q1 - q0 ) < beta ) {
2885
2886 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2887
2888 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */
2889 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */
2890 }
2891 pix += ystride;
2892 }
2893 }
2894}
2895static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2896{
2897 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
2898}
2899static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2900{
2901 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
2902}
2903
2904static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2905{
2906 int d;
2907 for( d = 0; d < 8; d++ ) {
2908 const int p0 = pix[-1*xstride];
2909 const int p1 = pix[-2*xstride];
2910 const int q0 = pix[0];
2911 const int q1 = pix[1*xstride];
2912
2913 if( FFABS( p0 - q0 ) < alpha &&
2914 FFABS( p1 - p0 ) < beta &&
2915 FFABS( q1 - q0 ) < beta ) {
2916
2917 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
2918 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
2919 }
2920 pix += ystride;
2921 }
2922}
2923static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2924{
2925 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
2926}
2927static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2928{
2929 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
2930}
2931
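/* Sum-of-absolute-differences comparators used by motion estimation. The
 * plain versions compare against the reference directly; the _x2/_y2/_xy2
 * variants compare against a half-pel interpolated reference built on the
 * fly with the rounding averages avg2()/avg4() defined earlier in this
 * file. */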
2932static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2933{
2934 int s, i;
2935
2936 s = 0;
2937 for(i=0;i<h;i++) {
2938 s += abs(pix1[0] - pix2[0]);
2939 s += abs(pix1[1] - pix2[1]);
2940 s += abs(pix1[2] - pix2[2]);
2941 s += abs(pix1[3] - pix2[3]);
2942 s += abs(pix1[4] - pix2[4]);
2943 s += abs(pix1[5] - pix2[5]);
2944 s += abs(pix1[6] - pix2[6]);
2945 s += abs(pix1[7] - pix2[7]);
2946 s += abs(pix1[8] - pix2[8]);
2947 s += abs(pix1[9] - pix2[9]);
2948 s += abs(pix1[10] - pix2[10]);
2949 s += abs(pix1[11] - pix2[11]);
2950 s += abs(pix1[12] - pix2[12]);
2951 s += abs(pix1[13] - pix2[13]);
2952 s += abs(pix1[14] - pix2[14]);
2953 s += abs(pix1[15] - pix2[15]);
2954 pix1 += line_size;
2955 pix2 += line_size;
2956 }
2957 return s;
2958}
2959
2960static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2961{
2962 int s, i;
2963
2964 s = 0;
2965 for(i=0;i<h;i++) {
2966 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2967 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2968 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2969 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2970 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2971 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2972 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2973 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2974 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2975 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2976 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2977 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2978 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2979 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2980 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2981 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
2982 pix1 += line_size;
2983 pix2 += line_size;
2984 }
2985 return s;
2986}
2987
2988static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2989{
2990 int s, i;
2991 uint8_t *pix3 = pix2 + line_size;
2992
2993 s = 0;
2994 for(i=0;i<h;i++) {
2995 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2996 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2997 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2998 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2999 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3000 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3001 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3002 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3003 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
3004 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
3005 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
3006 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
3007 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
3008 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
3009 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
3010 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
3011 pix1 += line_size;
3012 pix2 += line_size;
3013 pix3 += line_size;
3014 }
3015 return s;
3016}
3017
3018static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3019{
3020 int s, i;
3021 uint8_t *pix3 = pix2 + line_size;
3022
3023 s = 0;
3024 for(i=0;i<h;i++) {
3025 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3026 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3027 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3028 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3029 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3030 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3031 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3032 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3033 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
3034 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
3035 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
3036 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
3037 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
3038 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
3039 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
3040 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
3041 pix1 += line_size;
3042 pix2 += line_size;
3043 pix3 += line_size;
3044 }
3045 return s;
3046}
3047
3048static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3049{
3050 int s, i;
3051
3052 s = 0;
3053 for(i=0;i<h;i++) {
3054 s += abs(pix1[0] - pix2[0]);
3055 s += abs(pix1[1] - pix2[1]);
3056 s += abs(pix1[2] - pix2[2]);
3057 s += abs(pix1[3] - pix2[3]);
3058 s += abs(pix1[4] - pix2[4]);
3059 s += abs(pix1[5] - pix2[5]);
3060 s += abs(pix1[6] - pix2[6]);
3061 s += abs(pix1[7] - pix2[7]);
3062 pix1 += line_size;
3063 pix2 += line_size;
3064 }
3065 return s;
3066}
3067
3068static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3069{
3070 int s, i;
3071
3072 s = 0;
3073 for(i=0;i<h;i++) {
3074 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3075 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3076 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3077 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3078 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3079 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3080 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3081 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3082 pix1 += line_size;
3083 pix2 += line_size;
3084 }
3085 return s;
3086}
3087
3088static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3089{
3090 int s, i;
3091 uint8_t *pix3 = pix2 + line_size;
3092
3093 s = 0;
3094 for(i=0;i<h;i++) {
3095 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3096 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3097 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3098 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3099 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3100 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3101 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3102 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3103 pix1 += line_size;
3104 pix2 += line_size;
3105 pix3 += line_size;
3106 }
3107 return s;
3108}
3109
3110static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3111{
3112 int s, i;
3113 uint8_t *pix3 = pix2 + line_size;
3114
3115 s = 0;
3116 for(i=0;i<h;i++) {
3117 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3118 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3119 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3120 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3121 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3122 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3123 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3124 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3125 pix1 += line_size;
3126 pix2 += line_size;
3127 pix3 += line_size;
3128 }
3129 return s;
3130}
3131
3132static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3133 MpegEncContext *c = v;
3134 int score1=0;
3135 int score2=0;
3136 int x,y;
3137
3138 for(y=0; y<h; y++){
3139 for(x=0; x<16; x++){
3140 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3141 }
3142 if(y+1<h){
3143 for(x=0; x<15; x++){
3144 score2+= FFABS( s1[x ] - s1[x +stride]
3145 - s1[x+1] + s1[x+1+stride])
3146 -FFABS( s2[x ] - s2[x +stride]
3147 - s2[x+1] + s2[x+1+stride]);
3148 }
3149 }
3150 s1+= stride;
3151 s2+= stride;
3152 }
3153
3154 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3155 else return score1 + FFABS(score2)*8;
3156}
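/* NSSE ("noise shaping" SSE): score1 accumulates the plain sum of squared
 * errors, while score2 compares the 2x2 second-order gradients of the two
 * images, so a reconstruction that keeps the original's local texture is
 * penalized less. The gradient term is weighted by avctx->nsse_weight,
 * falling back to 8 when no context is available; nsse8_c below is the
 * 8-pixel-wide twin. */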
3157
3158static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3159 MpegEncContext *c = v;
3160 int score1=0;
3161 int score2=0;
3162 int x,y;
3163
3164 for(y=0; y<h; y++){
3165 for(x=0; x<8; x++){
3166 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3167 }
3168 if(y+1<h){
3169 for(x=0; x<7; x++){
3170 score2+= FFABS( s1[x ] - s1[x +stride]
3171 - s1[x+1] + s1[x+1+stride])
3172 -FFABS( s2[x ] - s2[x +stride]
3173 - s2[x+1] + s2[x+1+stride]);
3174 }
3175 }
3176 s1+= stride;
3177 s2+= stride;
3178 }
3179
3180 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3181 else return score1 + FFABS(score2)*8;
3182}
3183
3184static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3185 int i;
3186 unsigned int sum=0;
3187
3188 for(i=0; i<8*8; i++){
3189 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3190 int w= weight[i];
3191 b>>= RECON_SHIFT;
3192 assert(-512<b && b<512);
3193
3194 sum += (w*b)*(w*b)>>4;
3195 }
3196 return sum>>2;
3197}
3198
3199static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3200 int i;
3201
3202 for(i=0; i<8*8; i++){
3203 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3204 }
3205}
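/* try_8x8basis_c/add_8x8basis_c above serve the quantization noise
 * shaping (QNS) code: a basis function scaled by `scale` is rescaled from
 * BASIS_SHIFT down to RECON_SHIFT precision with round-to-nearest,
 *   (basis[i]*scale + (1 << (BASIS_SHIFT-RECON_SHIFT-1))) >> (BASIS_SHIFT-RECON_SHIFT),
 * which try_8x8basis_c uses to return a weighted squared error for the
 * would-be residual, and add_8x8basis_c to actually apply the change. */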
3206
3207/**
3208 * Permutes an 8x8 block.
3209 * @param block the block which will be permuted according to the given permutation vector
3210 * @param permutation the permutation vector
3211 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
3212 * @param scantable the scantable in use; it is only needed to speed the permutation up, the block is NOT
3213 *                  (inverse) permuted to scantable order!
3214 */
3215void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3216{
3217 int i;
3218 DCTELEM temp[64];
3219
3220 if(last<=0) return;
3221 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3222
3223 for(i=0; i<=last; i++){
3224 const int j= scantable[i];
3225 temp[j]= block[j];
3226 block[j]=0;
3227 }
3228
3229 for(i=0; i<=last; i++){
3230 const int j= scantable[i];
3231 const int perm_j= permutation[j];
3232 block[perm_j]= temp[j];
3233 }
3234}
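/* The permutation above is done in two passes on purpose: all touched
 * coefficients are first saved into temp[] and cleared, and only then
 * scattered to their permuted positions. A single pass could overwrite a
 * coefficient that has not been read yet when permutation[] maps one
 * scanned position onto another. */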
3235
3236static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
3237 return 0;
3238}
3239
3240void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3241 int i;
3242
3243 memset(cmp, 0, sizeof(void*)*6);
3244
3245 for(i=0; i<6; i++){
3246 switch(type&0xFF){
3247 case FF_CMP_SAD:
3248 cmp[i]= c->sad[i];
3249 break;
3250 case FF_CMP_SATD:
3251 cmp[i]= c->hadamard8_diff[i];
3252 break;
3253 case FF_CMP_SSE:
3254 cmp[i]= c->sse[i];
3255 break;
3256 case FF_CMP_DCT:
3257 cmp[i]= c->dct_sad[i];
3258 break;
3259 case FF_CMP_DCT264:
3260 cmp[i]= c->dct264_sad[i];
3261 break;
3262 case FF_CMP_DCTMAX:
3263 cmp[i]= c->dct_max[i];
3264 break;
3265 case FF_CMP_PSNR:
3266 cmp[i]= c->quant_psnr[i];
3267 break;
3268 case FF_CMP_BIT:
3269 cmp[i]= c->bit[i];
3270 break;
3271 case FF_CMP_RD:
3272 cmp[i]= c->rd[i];
3273 break;
3274 case FF_CMP_VSAD:
3275 cmp[i]= c->vsad[i];
3276 break;
3277 case FF_CMP_VSSE:
3278 cmp[i]= c->vsse[i];
3279 break;
3280 case FF_CMP_ZERO:
3281 cmp[i]= zero_cmp;
3282 break;
3283 case FF_CMP_NSSE:
3284 cmp[i]= c->nsse[i];
3285 break;
3286#if CONFIG_SNOW_ENCODER
3287 case FF_CMP_W53:
3288 cmp[i]= c->w53[i];
3289 break;
3290 case FF_CMP_W97:
3291 cmp[i]= c->w97[i];
3292 break;
3293#endif
3294 default:
3295            av_log(NULL, AV_LOG_ERROR, "internal error in cmp function selection\n");
3296 }
3297 }
3298}
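/* A minimal usage sketch (illustrative only, not part of the original
 * file; example_select_sad is a hypothetical caller): an encoder fills a
 * table per comparison purpose and calls through it, index 0 being the
 * 16x16 variant and index 1 the 8x8 one. */
#if 0
static void example_select_sad(DSPContext *dsp)
{
    me_cmp_func me_cmp[6];

    ff_set_cmp(dsp, me_cmp, FF_CMP_SAD);
    /* me_cmp[0](ctx, cur, ref, stride, 16) now returns a 16x16 SAD */
}
#endif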
3299
3300static void clear_block_c(DCTELEM *block)
3301{
3302 memset(block, 0, sizeof(DCTELEM)*64);
3303}
3304
3305/**
3306 * Clears the 6 blocks of a macroblock, i.e. memset(blocks, 0, sizeof(DCTELEM)*6*64).
3307 */
3308static void clear_blocks_c(DCTELEM *blocks)
3309{
3310 memset(blocks, 0, sizeof(DCTELEM)*6*64);
3311}
3312
3313static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3314 long i;
3315 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3316 long a = *(long*)(src+i);
3317 long b = *(long*)(dst+i);
3318 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3319 }
3320 for(; i<w; i++)
3321 dst[i+0] += src[i+0];
3322}
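/* add_bytes_c above (and add_bytes_l2_c/diff_bytes_c below) process one
 * machine word per iteration. The expression
 *   ((a & pb_7f) + (b & pb_7f)) ^ ((a ^ b) & pb_80)
 * adds the low 7 bits of every byte independently -- a carry can reach
 * bit 7 of its own byte but never spill into the next byte -- and the XOR
 * with (a^b) & pb_80 then restores the correct top bit of each byte. The
 * subtraction in diff_bytes_c uses the analogous borrow-free form. */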
3323
3324static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3325 long i;
3326 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3327 long a = *(long*)(src1+i);
3328 long b = *(long*)(src2+i);
3329 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3330 }
3331 for(; i<w; i++)
3332 dst[i] = src1[i]+src2[i];
3333}
3334
3335static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3336 long i;
3337#if !HAVE_FAST_UNALIGNED
3338 if((long)src2 & (sizeof(long)-1)){
3339 for(i=0; i+7<w; i+=8){
3340 dst[i+0] = src1[i+0]-src2[i+0];
3341 dst[i+1] = src1[i+1]-src2[i+1];
3342 dst[i+2] = src1[i+2]-src2[i+2];
3343 dst[i+3] = src1[i+3]-src2[i+3];
3344 dst[i+4] = src1[i+4]-src2[i+4];
3345 dst[i+5] = src1[i+5]-src2[i+5];
3346 dst[i+6] = src1[i+6]-src2[i+6];
3347 dst[i+7] = src1[i+7]-src2[i+7];
3348 }
3349 }else
3350#endif
3351 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3352 long a = *(long*)(src1+i);
3353 long b = *(long*)(src2+i);
3354 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3355 }
3356 for(; i<w; i++)
3357 dst[i+0] = src1[i+0]-src2[i+0];
3358}
3359
3360static void add_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *diff, int w, int *left, int *left_top){
3361 int i;
3362 uint8_t l, lt;
3363
3364 l= *left;
3365 lt= *left_top;
3366
3367 for(i=0; i<w; i++){
3368 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
3369 lt= src1[i];
3370 dst[i]= l;
3371 }
3372
3373 *left= l;
3374 *left_top= lt;
3375}
3376
3377static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3378 int i;
3379 uint8_t l, lt;
3380
3381 l= *left;
3382 lt= *left_top;
3383
3384 for(i=0; i<w; i++){
3385 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
3386 lt= src1[i];
3387 l= src2[i];
3388 dst[i]= l - pred;
3389 }
3390
3391 *left= l;
3392 *left_top= lt;
3393}
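/* The pair above implements the HuffYUV "median" predictor: each sample
 * is predicted as mid_pred(left, top, left + top - topleft), i.e. the
 * median of the left neighbour, the top neighbour and the gradient
 * estimate. Example: left = 10, top = 20, topleft = 5 gives the gradient
 * 10 + 20 - 5 = 25, and the prediction median(10, 20, 25) = 20. */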
3394
3395#define BUTTERFLY2(o1,o2,i1,i2) \
3396o1= (i1)+(i2);\
3397o2= (i1)-(i2);
3398
3399#define BUTTERFLY1(x,y) \
3400{\
3401 int a,b;\
3402 a= x;\
3403 b= y;\
3404 x= a+b;\
3405 y= a-b;\
3406}
3407
3408#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3409
3410static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3411 int i;
3412 int temp[64];
3413 int sum=0;
3414
3415 assert(h==8);
3416
3417 for(i=0; i<8; i++){
3418 //FIXME try pointer walks
3419 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3420 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3421 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3422 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3423
3424 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3425 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3426 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3427 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3428
3429 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3430 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3431 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3432 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3433 }
3434
3435 for(i=0; i<8; i++){
3436 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3437 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3438 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3439 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3440
3441 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3442 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3443 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3444 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3445
3446 sum +=
3447 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3448 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3449 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3450 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3451 }
3452#if 0
3453static int maxi=0;
3454if(sum>maxi){
3455 maxi=sum;
3456 printf("MAX:%d\n", maxi);
3457}
3458#endif
3459 return sum;
3460}
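/* hadamard8_diff8x8_c is the classic SATD metric: the two butterfly
 * passes above apply an 8-point Walsh-Hadamard transform to the rows and
 * columns of the difference block, and the sum of absolute transform
 * coefficients is returned, which typically tracks the residual's coding
 * cost better than plain SAD. */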
3461
3462static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3463 int i;
3464 int temp[64];
3465 int sum=0;
3466
3467 assert(h==8);
3468
3469 for(i=0; i<8; i++){
3470 //FIXME try pointer walks
3471 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3472 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3473 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3474 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3475
3476 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3477 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3478 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3479 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3480
3481 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3482 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3483 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3484 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3485 }
3486
3487 for(i=0; i<8; i++){
3488 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3489 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3490 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3491 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3492
3493 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3494 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3495 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3496 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3497
3498 sum +=
3499 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3500 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3501 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3502 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3503 }
3504
3505    sum -= FFABS(temp[8*0] + temp[8*4]); // remove the DC term (the block mean), so only AC energy is scored
3506
3507 return sum;
3508}
3509
3510static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3511 MpegEncContext * const s= (MpegEncContext *)c;
3512 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3513 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3514
3515 assert(h==8);
3516
3517 s->dsp.diff_pixels(temp, src1, src2, stride);
3518 s->dsp.fdct(temp);
3519 return s->dsp.sum_abs_dctelem(temp);
3520}
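/* dct_sad8x8_c scores the residual in the codec's own transform domain:
 * the difference block is run through the configured fdct and the
 * absolute coefficients are summed, trading extra computation for a
 * metric still closer to the actual bit cost. */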
3521
3522#if CONFIG_GPL
3523#define DCT8_1D {\
3524 const int s07 = SRC(0) + SRC(7);\
3525 const int s16 = SRC(1) + SRC(6);\
3526 const int s25 = SRC(2) + SRC(5);\
3527 const int s34 = SRC(3) + SRC(4);\
3528 const int a0 = s07 + s34;\
3529 const int a1 = s16 + s25;\
3530 const int a2 = s07 - s34;\
3531 const int a3 = s16 - s25;\
3532 const int d07 = SRC(0) - SRC(7);\
3533 const int d16 = SRC(1) - SRC(6);\
3534 const int d25 = SRC(2) - SRC(5);\
3535 const int d34 = SRC(3) - SRC(4);\
3536 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3537 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3538 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3539 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3540 DST(0, a0 + a1 ) ;\
3541 DST(1, a4 + (a7>>2)) ;\
3542 DST(2, a2 + (a3>>1)) ;\
3543 DST(3, a5 + (a6>>2)) ;\
3544 DST(4, a0 - a1 ) ;\
3545 DST(5, a6 - (a5>>2)) ;\
3546 DST(6, (a2>>1) - a3 ) ;\
3547 DST(7, (a4>>2) - a7 ) ;\
3548}
3549
3550static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3551 MpegEncContext * const s= (MpegEncContext *)c;
3552 DCTELEM dct[8][8];
3553 int i;
3554 int sum=0;
3555
3556 s->dsp.diff_pixels(dct[0], src1, src2, stride);
3557
3558#define SRC(x) dct[i][x]
3559#define DST(x,v) dct[i][x]= v
3560 for( i = 0; i < 8; i++ )
3561 DCT8_1D
3562#undef SRC
3563#undef DST
3564
3565#define SRC(x) dct[x][i]
3566#define DST(x,v) sum += FFABS(v)
3567 for( i = 0; i < 8; i++ )
3568 DCT8_1D
3569#undef SRC
3570#undef DST
3571 return sum;
3572}
3573#endif
3574
3575static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3576 MpegEncContext * const s= (MpegEncContext *)c;
3577 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3578 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3579 int sum=0, i;
3580
3581 assert(h==8);
3582
3583 s->dsp.diff_pixels(temp, src1, src2, stride);
3584 s->dsp.fdct(temp);
3585
3586 for(i=0; i<64; i++)
3587 sum= FFMAX(sum, FFABS(temp[i]));
3588
3589 return sum;
3590}
3591
3592static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3593 MpegEncContext * const s= (MpegEncContext *)c;
3594 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3595 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3596 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3597 int sum=0, i;
3598
3599 assert(h==8);
3600 s->mb_intra=0;
3601
3602 s->dsp.diff_pixels(temp, src1, src2, stride);
3603
3604 memcpy(bak, temp, 64*sizeof(DCTELEM));
3605
3606 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3607 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3608 ff_simple_idct(temp); //FIXME
3609
3610 for(i=0; i<64; i++)
3611 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3612
3613 return sum;
3614}
3615
3616static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3617 MpegEncContext * const s= (MpegEncContext *)c;
3618 const uint8_t *scantable= s->intra_scantable.permutated;
3619 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3620 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3621 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3622 uint8_t * const bak= (uint8_t*)aligned_bak;
3623 int i, last, run, bits, level, distortion, start_i;
3624 const int esc_length= s->ac_esc_length;
3625 uint8_t * length;
3626 uint8_t * last_length;
3627
3628 assert(h==8);
3629
3630 for(i=0; i<8; i++){
3631 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3632 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3633 }
3634
3635 s->dsp.diff_pixels(temp, src1, src2, stride);
3636
3637 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3638
3639 bits=0;
3640
3641 if (s->mb_intra) {
3642 start_i = 1;
3643 length = s->intra_ac_vlc_length;
3644 last_length= s->intra_ac_vlc_last_length;
3645 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3646 } else {
3647 start_i = 0;
3648 length = s->inter_ac_vlc_length;
3649 last_length= s->inter_ac_vlc_last_length;
3650 }
3651
3652 if(last>=start_i){
3653 run=0;
3654 for(i=start_i; i<last; i++){
3655 int j= scantable[i];
3656 level= temp[j];
3657
3658 if(level){
3659 level+=64;
3660 if((level&(~127)) == 0){
3661 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3662 }else
3663 bits+= esc_length;
3664 run=0;
3665 }else
3666 run++;
3667 }
3668 i= scantable[last];
3669
3670 level= temp[i] + 64;
3671
3672 assert(level - 64);
3673
3674 if((level&(~127)) == 0){
3675 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3676 }else
3677 bits+= esc_length;
3678
3679 }
3680
3681 if(last>=0){
3682 if(s->mb_intra)
3683 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3684 else
3685 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3686 }
3687
3688 s->dsp.idct_add(bak, stride, temp);
3689
3690 distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3691
3692 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3693}
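/* rd8x8_c returns a real rate-distortion cost: the residual is quantized,
 * its bits are counted with the VLC length tables, and it is dequantized
 * and reconstructed; the result is
 *   distortion + bits * qscale^2 * 109/128,
 * i.e. SSE distortion plus a rate term weighted by lambda ~= 0.85*qscale^2. */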
3694
3695static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3696 MpegEncContext * const s= (MpegEncContext *)c;
3697 const uint8_t *scantable= s->intra_scantable.permutated;
3698 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3699 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3700 int i, last, run, bits, level, start_i;
3701 const int esc_length= s->ac_esc_length;
3702 uint8_t * length;
3703 uint8_t * last_length;
3704
3705 assert(h==8);
3706
3707 s->dsp.diff_pixels(temp, src1, src2, stride);
3708
3709 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3710
3711 bits=0;
3712
3713 if (s->mb_intra) {
3714 start_i = 1;
3715 length = s->intra_ac_vlc_length;
3716 last_length= s->intra_ac_vlc_last_length;
3717 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3718 } else {
3719 start_i = 0;
3720 length = s->inter_ac_vlc_length;
3721 last_length= s->inter_ac_vlc_last_length;
3722 }
3723
3724 if(last>=start_i){
3725 run=0;
3726 for(i=start_i; i<last; i++){
3727 int j= scantable[i];
3728 level= temp[j];
3729
3730 if(level){
3731 level+=64;
3732 if((level&(~127)) == 0){
3733 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3734 }else
3735 bits+= esc_length;
3736 run=0;
3737 }else
3738 run++;
3739 }
3740 i= scantable[last];
3741
3742 level= temp[i] + 64;
3743
3744 assert(level - 64);
3745
3746 if((level&(~127)) == 0){
3747 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3748 }else
3749 bits+= esc_length;
3750 }
3751
3752 return bits;
3753}
3754
3755#define VSAD_INTRA(size) \
3756static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3757 int score=0; \
3758 int x,y; \
3759 \
3760 for(y=1; y<h; y++){ \
3761 for(x=0; x<size; x+=4){ \
3762 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
3763 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
3764 } \
3765 s+= stride; \
3766 } \
3767 \
3768 return score; \
3769}
3770VSAD_INTRA(8)
3771VSAD_INTRA(16)
3772
3773static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3774 int score=0;
3775 int x,y;
3776
3777 for(y=1; y<h; y++){
3778 for(x=0; x<16; x++){
3779 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3780 }
3781 s1+= stride;
3782 s2+= stride;
3783 }
3784
3785 return score;
3786}
3787
3788#define SQ(a) ((a)*(a))
3789#define VSSE_INTRA(size) \
3790static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3791 int score=0; \
3792 int x,y; \
3793 \
3794 for(y=1; y<h; y++){ \
3795 for(x=0; x<size; x+=4){ \
3796 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
3797 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
3798 } \
3799 s+= stride; \
3800 } \
3801 \
3802 return score; \
3803}
3804VSSE_INTRA(8)
3805VSSE_INTRA(16)
3806
3807static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3808 int score=0;
3809 int x,y;
3810
3811 for(y=1; y<h; y++){
3812 for(x=0; x<16; x++){
3813 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3814 }
3815 s1+= stride;
3816 s2+= stride;
3817 }
3818
3819 return score;
3820}
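/* The VSAD/VSSE family above measures vertical (line-to-line) activity:
 * the intra variants sum an image's own vertical gradients, the two-image
 * variants the vertical gradient of the difference signal. This gives a
 * cheap measure of vertical detail, e.g. for frame/field decisions. */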
3821
3822static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
3823 int size){
3824 int score=0;
3825 int i;
3826 for(i=0; i<size; i++)
3827 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
3828 return score;
3829}
3830
3831WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3832WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3833WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3834#if CONFIG_GPL
3835WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3836#endif
3837WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3838WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3839WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3840WRAPPER8_16_SQ(bit8x8_c, bit16_c)
3841#endif
3842static void vector_fmul_c(float *dst, const float *src, int len){
3843 int i;
3844 for(i=0; i<len; i++)
3845 dst[i] *= src[i];
3846}
3847
3848static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3849 int i;
3850 src1 += len-1;
3851 for(i=0; i<len; i++)
3852 dst[i] = src0[i] * src1[-i];
3853}
3854
3855void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
3856 int i;
3857 for(i=0; i<len; i++)
3858 dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
3859}
3860
3861void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
3862 int i,j;
3863 dst += len;
3864 win += len;
3865 src0+= len;
3866 for(i=-len, j=len-1; i<0; i++, j--) {
3867 float s0 = src0[i];
3868 float s1 = src1[j];
3869 float wi = win[i];
3870 float wj = win[j];
3871 dst[i] = s0*wj - s1*wi + add_bias;
3872 dst[j] = s0*wi + s1*wj + add_bias;
3873 }
3874}
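/* ff_vector_fmul_window_c is the usual (I)MDCT overlap-add butterfly:
 * typically src0 holds the saved second half of the previous block and
 * src1 the first half of the current one. Each iteration reads the window
 * forwards (wi) and backwards (wj) and produces one sample in each half
 * of dst, plus an optional DC bias. */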
3875#if 0
3876static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
3877 int i;
3878 for(i=0; i<len; i++)
3879 dst[i] = src[i] * mul;
3880}
3881
3882static av_always_inline int float_to_int16_one(const float *src){
3883 int_fast32_t tmp = *(const int32_t*)src;
3884 if(tmp & 0xf0000){
3885 tmp = (0x43c0ffff - tmp)>>31;
3886 // is this faster on some gcc/cpu combinations?
3887// if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3888// else tmp = 0;
3889 }
3890 return tmp - 0x8000;
3891}
3892
3893void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
3894 int i;
3895 for(i=0; i<len; i++)
3896 dst[i] = float_to_int16_one(src+i);
3897}
3898
3899void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
3900 int i,j,c;
3901 if(channels==2){
3902 for(i=0; i<len; i++){
3903 dst[2*i] = float_to_int16_one(src[0]+i);
3904 dst[2*i+1] = float_to_int16_one(src[1]+i);
3905 }
3906 }else{
3907 for(c=0; c<channels; c++)
3908 for(i=0, j=c; i<len; i++, j+=channels)
3909 dst[j] = float_to_int16_one(src[c]+i);
3910 }
3911}
3912
3913static void add_int16_c(int16_t * v1, int16_t * v2, int order)
3914{
3915 while (order--)
3916 *v1++ += *v2++;
3917}
3918
3919static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
3920{
3921 while (order--)
3922 *v1++ -= *v2++;
3923}
3924
3925static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
3926{
3927 int res = 0;
3928
3929 while (order--)
3930 res += (*v1++ * *v2++) >> shift;
3931
3932 return res;
3933}
3934
3935#define W0 2048
3936#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
3937#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
3938#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
3939#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
3940#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
3941#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
3942#define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
3943
3944static void wmv2_idct_row(short * b)
3945{
3946 int s1,s2;
3947 int a0,a1,a2,a3,a4,a5,a6,a7;
3948 /*step 1*/
3949 a1 = W1*b[1]+W7*b[7];
3950 a7 = W7*b[1]-W1*b[7];
3951 a5 = W5*b[5]+W3*b[3];
3952 a3 = W3*b[5]-W5*b[3];
3953 a2 = W2*b[2]+W6*b[6];
3954 a6 = W6*b[2]-W2*b[6];
3955 a0 = W0*b[0]+W0*b[4];
3956 a4 = W0*b[0]-W0*b[4];
3957 /*step 2*/
3958 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
3959 s2 = (181*(a1-a5-a7+a3)+128)>>8;
3960 /*step 3*/
3961 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
3962 b[1] = (a4+a6 +s1 + (1<<7))>>8;
3963 b[2] = (a4-a6 +s2 + (1<<7))>>8;
3964 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
3965 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
3966 b[5] = (a4-a6 -s2 + (1<<7))>>8;
3967 b[6] = (a4+a6 -s1 + (1<<7))>>8;
3968 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
3969}
3970static void wmv2_idct_col(short * b)
3971{
3972 int s1,s2;
3973 int a0,a1,a2,a3,a4,a5,a6,a7;
3974 /*step 1, with extended precision*/
3975 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
3976 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
3977 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
3978 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
3979 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
3980 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
3981 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
3982 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
3983 /*step 2*/
3984 s1 = (181*(a1-a5+a7-a3)+128)>>8;
3985 s2 = (181*(a1-a5-a7+a3)+128)>>8;
3986 /*step 3*/
3987 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
3988 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
3989 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
3990 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
3991
3992 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
3993 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
3994 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
3995 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
3996}
3997void ff_wmv2_idct_c(short * block){
3998 int i;
3999
4000 for(i=0;i<64;i+=8){
4001 wmv2_idct_row(block+i);
4002 }
4003 for(i=0;i<8;i++){
4004 wmv2_idct_col(block+i);
4005 }
4006}
4007/* XXX: these functions should be removed as soon as all IDCTs have been
4008   converted */
4009static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4010{
4011 ff_wmv2_idct_c(block);
4012 put_pixels_clamped_c(block, dest, line_size);
4013}
4014static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4015{
4016 ff_wmv2_idct_c(block);
4017 add_pixels_clamped_c(block, dest, line_size);
4018}
4019static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4020{
4021 j_rev_dct (block);
4022 put_pixels_clamped_c(block, dest, line_size);
4023}
4024static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4025{
4026 j_rev_dct (block);
4027 add_pixels_clamped_c(block, dest, line_size);
4028}
4029
4030static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4031{
4032 j_rev_dct4 (block);
4033 put_pixels_clamped4_c(block, dest, line_size);
4034}
4035static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4036{
4037 j_rev_dct4 (block);
4038 add_pixels_clamped4_c(block, dest, line_size);
4039}
4040
4041static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4042{
4043 j_rev_dct2 (block);
4044 put_pixels_clamped2_c(block, dest, line_size);
4045}
4046static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4047{
4048 j_rev_dct2 (block);
4049 add_pixels_clamped2_c(block, dest, line_size);
4050}
4051
4052static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4053{
4054 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4055
4056 dest[0] = cm[(block[0] + 4)>>3];
4057}
4058static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4059{
4060 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4061
4062 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4063}
4064
4065static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4066#endif
4067/* init static data */
4068void dsputil_static_init(void)
4069{
4070 int i;
4071
4072 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4073 for(i=0;i<MAX_NEG_CROP;i++) {
4074 ff_cropTbl[i] = 0;
4075 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4076 }
4077
4078 for(i=0;i<512;i++) {
4079 ff_squareTbl[i] = (i - 256) * (i - 256);
4080 }
4081
4082 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4083}
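/* The table built above makes clamping branch-free: with
 * cm = ff_cropTbl + MAX_NEG_CROP, any index in
 * [-MAX_NEG_CROP, 255 + MAX_NEG_CROP) maps into [0,255], e.g.
 * cm[-5] == 0, cm[77] == 77, cm[300] == 255. ff_squareTbl similarly
 * caches (i-256)^2, so that sq = ff_squareTbl + 256 gives sq[d] == d*d
 * for differences d in [-256,255]. */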
4084
4085int ff_check_alignment(void){
4086 static int did_fail=0;
4087 DECLARE_ALIGNED_16(int, aligned);
4088
4089 if((long)&aligned & 15){
4090 if(!did_fail){
4091#if HAVE_MMX || HAVE_ALTIVEC
4092 av_log(NULL, AV_LOG_ERROR,
4093 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4094 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4095 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4096 "Do not report crashes to FFmpeg developers.\n");
4097#endif
4098 did_fail=1;
4099 }
4100 return -1;
4101 }
4102 return 0;
4103}
4104
4105void dsputil_init(DSPContext* c)
4106{
4107 ff_check_alignment();
4108
4109 c->vector_fmul = vector_fmul_c;
4110 c->vector_fmul_reverse = vector_fmul_reverse_c;
4111 c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4112 c->vector_fmul_window = ff_vector_fmul_window_c;
4113}
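/* A minimal usage sketch (illustrative only, not part of the original
 * file; example_init/example_apply_window are hypothetical): callers
 * build the static tables once, initialize a DSPContext, and then go
 * through its function pointers. */
#if 0
static DSPContext dsp;

static void example_init(void)
{
    dsputil_static_init();  /* build the crop/square tables        */
    dsputil_init(&dsp);     /* install the C float implementations */
}

static void example_apply_window(float *frame, const float *window, int n)
{
    dsp.vector_fmul(frame, window, n);  /* frame[i] *= window[i] */
}
#endif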
4114