author	Mohamed Tarek <mt@rockbox.org>	2010-04-30 11:11:56 +0000
committer	Mohamed Tarek <mt@rockbox.org>	2010-04-30 11:11:56 +0000
commit	cf43e5083b9e0f87de262ea31fd8067225ebfcda (patch)
tree	073e6f4cd9561564d85e410a35432e1f4ead5b11 /apps/codecs/libwmapro/dsputil.h
parent	bc3c5c16571487bf71fed8c22b30ee40481e156e (diff)
downloadrockbox-cf43e5083b9e0f87de262ea31fd8067225ebfcda.tar.gz
rockbox-cf43e5083b9e0f87de262ea31fd8067225ebfcda.zip
Add libwmapro to apps/codecs. These files comprise a set of unmodified files needed from ffmpeg's libavcodec and libavutil to compile and use the wma pro decoder standalone. The files were taken from ffmpeg's svn r22886 dated 15 April 2010.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@25763 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/codecs/libwmapro/dsputil.h')
-rw-r--r--  apps/codecs/libwmapro/dsputil.h | 808
1 files changed, 808 insertions, 0 deletions
diff --git a/apps/codecs/libwmapro/dsputil.h b/apps/codecs/libwmapro/dsputil.h
new file mode 100644
index 0000000000..d1816e66ba
--- /dev/null
+++ b/apps/codecs/libwmapro/dsputil.h
@@ -0,0 +1,808 @@
/*
 * DSP utils
 * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file libavcodec/dsputil.h
 * DSP utils.
 * Note: many functions in here may use MMX, which trashes the FPU state; it is
 * absolutely necessary to call emms_c() between DSP and float/double code.
 */

#ifndef AVCODEC_DSPUTIL_H
#define AVCODEC_DSPUTIL_H

#include "libavutil/intreadwrite.h"
#include "avcodec.h"


//#define DEBUG
/* dct code */
typedef short DCTELEM;

void fdct_ifast (DCTELEM *data);
void fdct_ifast248 (DCTELEM *data);
void ff_jpeg_fdct_islow (DCTELEM *data);
void ff_fdct248_islow (DCTELEM *data);

void j_rev_dct (DCTELEM *data);
void j_rev_dct4 (DCTELEM *data);
void j_rev_dct2 (DCTELEM *data);
void j_rev_dct1 (DCTELEM *data);
void ff_wmv2_idct_c(DCTELEM *data);

void ff_fdct_mmx(DCTELEM *block);
void ff_fdct_mmx2(DCTELEM *block);
void ff_fdct_sse2(DCTELEM *block);

void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block);
void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block);
void ff_h264_idct_add16_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct_add16intra_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);

void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1,
                             const float *win, float add_bias, int len);
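
/* For illustration (not part of the original header): ff_vector_fmul_window_c
 * performs the windowed overlap-add used by MDCT-based codecs such as WMA Pro.
 * A sketch of its behavior, based on the C version in dsputil.c of this era
 * (treat the exact indexing as an assumption; dsputil.c is authoritative):
 *
 *     dst += len; win += len; src0 += len;
 *     for (i = -len, j = len-1; i < 0; i++, j--) {
 *         float s0 = src0[i], s1 = src1[j], wi = win[i], wj = win[j];
 *         dst[i] = s0*wj - s1*wi + add_bias;
 *         dst[j] = s0*wi + s1*wj + add_bias;
 *     }
 */
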
void ff_float_to_int16_c(int16_t *dst, const float *src, long len);
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels);

/* encoding scans */
extern const uint8_t ff_alternate_horizontal_scan[64];
extern const uint8_t ff_alternate_vertical_scan[64];
extern const uint8_t ff_zigzag_direct[64];
extern const uint8_t ff_zigzag248_direct[64];

/* pixel operations */
#define MAX_NEG_CROP 1024

/* temporary */
extern uint32_t ff_squareTbl[512];
extern uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP];
/* VP3 DSP functions */
void ff_vp3_idct_c(DCTELEM *block/* align 16*/);
void ff_vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);

void ff_vp3_v_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
void ff_vp3_h_loop_filter_c(uint8_t *src, int stride, int *bounding_values);

/* VP6 DSP functions */
void ff_vp6_filter_diag4_c(uint8_t *dst, uint8_t *src, int stride,
                           const int16_t *h_weights, const int16_t *v_weights);

/* Bink functions */
void ff_bink_idct_c    (DCTELEM *block);
void ff_bink_idct_add_c(uint8_t *dest, int linesize, DCTELEM *block);
void ff_bink_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);

/* CAVS functions */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride);
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride);
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride);
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride);

/* VC1 functions */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd);
void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd);

/* EA functions */
void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);

/* 1/2^n downscaling functions from imgconvert.c */
void ff_img_copy_plane(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
void ff_shrink22(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
void ff_shrink44(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
void ff_shrink88(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);

void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);

/* minimum alignment rules ;)
If you notice errors in the alignment requirements, need more alignment for
some ASM code on some CPU, or need to use a function with less aligned data,
then send a mail to the ffmpeg-devel mailing list, ...

!warning These alignments might not match reality (missing
attribute((aligned)) somewhere is possible).
I (Michael) did not check them; these are just the alignments which I think
could be reached easily ...

!future video codecs might need functions with less strict alignment
*/

/*
void get_pixels_c(DCTELEM *block, const uint8_t *pixels, int line_size);
void diff_pixels_c(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride);
void put_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size);
void add_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size);
void clear_blocks_c(DCTELEM *blocks);
*/

/* add and put pixel (decoding) */
// blocksizes for op_pixels_func are 8x4, 8x8, 16x8 and 16x16
// h for op_pixels_func is limited to {width/2, width}, but is never larger than 16 and never smaller than 4
typedef void (*op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int h);
typedef void (*tpel_mc_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int w, int h);
typedef void (*qpel_mc_func)(uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);
typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);

typedef void (*op_fill_func)(uint8_t *block/*align width (8 or 16)*/, uint8_t value, int line_size, int h);

#define DEF_OLD_QPEL(name)\
void ff_put_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
void ff_put_no_rnd_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
void ff_avg_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);

DEF_OLD_QPEL(qpel16_mc11_old_c)
DEF_OLD_QPEL(qpel16_mc31_old_c)
DEF_OLD_QPEL(qpel16_mc12_old_c)
DEF_OLD_QPEL(qpel16_mc32_old_c)
DEF_OLD_QPEL(qpel16_mc13_old_c)
DEF_OLD_QPEL(qpel16_mc33_old_c)
DEF_OLD_QPEL(qpel8_mc11_old_c)
DEF_OLD_QPEL(qpel8_mc31_old_c)
DEF_OLD_QPEL(qpel8_mc12_old_c)
DEF_OLD_QPEL(qpel8_mc32_old_c)
DEF_OLD_QPEL(qpel8_mc13_old_c)
DEF_OLD_QPEL(qpel8_mc33_old_c)

#define CALL_2X_PIXELS(a, b, n)\
static void a(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    b(block  , pixels  , line_size, h);\
    b(block+n, pixels+n, line_size, h);\
}
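
/* For illustration (not part of the original header): CALL_2X_PIXELS builds a
 * double-width operation out of a narrower one by calling it twice, n bytes
 * apart. A hypothetical 8-pixel-wide copy could be widened to 16 pixels like
 * this (put_pixels8_c lives in dsputil.c; the wrapper name here is made up):
 *
 *     CALL_2X_PIXELS(put_pixels16_example, put_pixels8_c, 8)
 *
 * which expands to a static function that processes pixels 0..7 and then
 * pixels 8..15 of each line.
 */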

/* motion estimation */
// h is limited to {width/2, width, 2*width}, but is never larger than 16 and never smaller than 2
// although currently h<4 is not used as functions with width <8 are neither used nor implemented
typedef int (*me_cmp_func)(void /*MpegEncContext*/ *s, uint8_t *blk1/*align width (8 or 16)*/, uint8_t *blk2/*align 1*/, int line_size, int h)/* __attribute__ ((const))*/;

/**
 * Scantable.
 */
typedef struct ScanTable{
    const uint8_t *scantable;
    uint8_t permutated[64];
    uint8_t raster_end[64];
#if ARCH_PPC
    /** Used by dct_quantize_altivec to find last-non-zero */
    DECLARE_ALIGNED(16, uint8_t, inverse)[64];
#endif
} ScanTable;

void ff_init_scantable(uint8_t *, ScanTable *st, const uint8_t *src_scantable);
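
/* For illustration (not part of the original header): a decoder typically
 * owns a ScanTable and fills it from one of the scan patterns above, using
 * the IDCT permutation selected in its DSPContext. A minimal sketch, assuming
 * an already-initialized DSPContext dsp:
 *
 *     ScanTable zz;
 *     ff_init_scantable(dsp.idct_permutation, &zz, ff_zigzag_direct);
 *     // zz.permutated[] now maps scan positions to permuted block indices
 */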

void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize,
                         int block_w, int block_h,
                         int src_x, int src_y, int w, int h);

/**
 * DSPContext.
 */
typedef struct DSPContext {
    /* pixel ops : interface with DCT */
    void (*get_pixels)(DCTELEM *block/*align 16*/, const uint8_t *pixels/*align 8*/, int line_size);
    void (*diff_pixels)(DCTELEM *block/*align 16*/, const uint8_t *s1/*align 8*/, const uint8_t *s2/*align 8*/, int stride);
    void (*put_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
    void (*put_signed_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
    void (*put_pixels_nonclamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
    void (*add_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
    void (*add_pixels8)(uint8_t *pixels, DCTELEM *block, int line_size);
    void (*add_pixels4)(uint8_t *pixels, DCTELEM *block, int line_size);
    int (*sum_abs_dctelem)(DCTELEM *block/*align 16*/);
    /**
     * translational global motion compensation.
     */
    void (*gmc1)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x16, int y16, int rounder);
    /**
     * global motion compensation.
     */
    void (*gmc )(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int ox, int oy,
                 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
    void (*clear_block)(DCTELEM *block/*align 16*/);
    void (*clear_blocks)(DCTELEM *blocks/*align 16*/);
    int (*pix_sum)(uint8_t * pix, int line_size);
    int (*pix_norm1)(uint8_t * pix, int line_size);
// 16x16 8x8 4x4 2x2 16x8 8x4 4x2 8x16 4x8 2x4

    me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */
    me_cmp_func sse[6];
    me_cmp_func hadamard8_diff[6];
    me_cmp_func dct_sad[6];
    me_cmp_func quant_psnr[6];
    me_cmp_func bit[6];
    me_cmp_func rd[6];
    me_cmp_func vsad[6];
    me_cmp_func vsse[6];
    me_cmp_func nsse[6];
    me_cmp_func w53[6];
    me_cmp_func w97[6];
    me_cmp_func dct_max[6];
    me_cmp_func dct264_sad[6];

    me_cmp_func me_pre_cmp[6];
    me_cmp_func me_cmp[6];
    me_cmp_func me_sub_cmp[6];
    me_cmp_func mb_cmp[6];
    me_cmp_func ildct_cmp[6]; //only width 16 used
    me_cmp_func frame_skip_cmp[6]; //only width 8 used

    int (*ssd_int8_vs_int16)(const int8_t *pix1, const int16_t *pix2,
                             int size);

    /**
     * Halfpel motion compensation with rounding (a+b+1)>>1.
     * This is an array[4][4] of motion compensation functions for 4
     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
     * @param block destination where the result is stored
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
    op_pixels_func put_pixels_tab[4][4];
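
    /* For illustration (not part of the original header): the second index
     * encodes the halfpel position, so copying a 16xH block at a position
     * halfpel-shifted in x only would look like this (sketch, assuming an
     * initialized DSPContext c):
     *
     *     int dxy = 1 + 2*0;   // xhalfpel = 1, yhalfpel = 0
     *     c.put_pixels_tab[0][dxy](dst, src, stride, 16);
     */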

    /**
     * Halfpel motion compensation with rounding (a+b+1)>>1.
     * This is an array[4][4] of motion compensation functions for 4
     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
     * @param block destination into which the result is averaged (a+b+1)>>1
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
    op_pixels_func avg_pixels_tab[4][4];

    /**
     * Halfpel motion compensation with no rounding (a+b)>>1.
     * This is an array[2][4] of motion compensation functions for 2
     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
     * @param block destination where the result is stored
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
    op_pixels_func put_no_rnd_pixels_tab[4][4];

    /**
     * Halfpel motion compensation with no rounding (a+b)>>1.
     * This is an array[2][4] of motion compensation functions for 2
     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
     * @param block destination into which the result is averaged (a+b)>>1
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
    op_pixels_func avg_no_rnd_pixels_tab[4][4];

    void (*put_no_rnd_pixels_l2[2])(uint8_t *block/*align width (8 or 16)*/, const uint8_t *a/*align 1*/, const uint8_t *b/*align 1*/, int line_size, int h);

    /**
     * Thirdpel motion compensation with rounding (a+b+1)>>1.
     * This is an array[12] of motion compensation functions for the 9 thirdpel
     * positions<br>
     * *pixels_tab[ xthirdpel + 4*ythirdpel ]
     * @param block destination where the result is stored
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
    tpel_mc_func put_tpel_pixels_tab[11]; //FIXME individual func ptr per width?
    tpel_mc_func avg_tpel_pixels_tab[11]; //FIXME individual func ptr per width?

    qpel_mc_func put_qpel_pixels_tab[2][16];
    qpel_mc_func avg_qpel_pixels_tab[2][16];
    qpel_mc_func put_no_rnd_qpel_pixels_tab[2][16];
    qpel_mc_func avg_no_rnd_qpel_pixels_tab[2][16];
    qpel_mc_func put_mspel_pixels_tab[8];

    /**
     * h264 Chroma MC
     */
    h264_chroma_mc_func put_h264_chroma_pixels_tab[3];
    h264_chroma_mc_func avg_h264_chroma_pixels_tab[3];
    /* This is really one func used in VC-1 decoding */
    h264_chroma_mc_func put_no_rnd_vc1_chroma_pixels_tab[3];
    h264_chroma_mc_func avg_no_rnd_vc1_chroma_pixels_tab[3];

    qpel_mc_func put_h264_qpel_pixels_tab[4][16];
    qpel_mc_func avg_h264_qpel_pixels_tab[4][16];

    qpel_mc_func put_2tap_qpel_pixels_tab[4][16];
    qpel_mc_func avg_2tap_qpel_pixels_tab[4][16];

    /* AVS specific */
    qpel_mc_func put_cavs_qpel_pixels_tab[2][16];
    qpel_mc_func avg_cavs_qpel_pixels_tab[2][16];
    void (*cavs_filter_lv)(uint8_t *pix, int stride, int alpha, int beta, int tc, int bs1, int bs2);
    void (*cavs_filter_lh)(uint8_t *pix, int stride, int alpha, int beta, int tc, int bs1, int bs2);
    void (*cavs_filter_cv)(uint8_t *pix, int stride, int alpha, int beta, int tc, int bs1, int bs2);
    void (*cavs_filter_ch)(uint8_t *pix, int stride, int alpha, int beta, int tc, int bs1, int bs2);
    void (*cavs_idct8_add)(uint8_t *dst, DCTELEM *block, int stride);

    me_cmp_func pix_abs[2][4];

    /* huffyuv specific */
    void (*add_bytes)(uint8_t *dst/*align 16*/, uint8_t *src/*align 16*/, int w);
    void (*add_bytes_l2)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 16*/, int w);
    void (*diff_bytes)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 1*/,int w);
    /**
     * subtract huffyuv's variant of median prediction
     * note, this might read from src1[-1], src2[-1]
     */
    void (*sub_hfyu_median_prediction)(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top);
    void (*add_hfyu_median_prediction)(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
    int (*add_hfyu_left_prediction)(uint8_t *dst, const uint8_t *src, int w, int left);
    void (*add_hfyu_left_prediction_bgr32)(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha);
    /* this might write to dst[w] */
    void (*add_png_paeth_prediction)(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
    void (*bswap_buf)(uint32_t *dst, const uint32_t *src, int w);

    void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale);
    void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale);

    void (*h261_loop_filter)(uint8_t *src, int stride);

    void (*x8_v_loop_filter)(uint8_t *src, int stride, int qscale);
    void (*x8_h_loop_filter)(uint8_t *src, int stride, int qscale);

    void (*vp3_v_loop_filter)(uint8_t *src, int stride, int *bounding_values);
    void (*vp3_h_loop_filter)(uint8_t *src, int stride, int *bounding_values);

    void (*vp6_filter_diag4)(uint8_t *dst, uint8_t *src, int stride,
                             const int16_t *h_weights, const int16_t *v_weights);

    /* assume len is a multiple of 4, and arrays are 16-byte aligned */
    void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize);
    void (*ac3_downmix)(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
    /* no alignment needed */
    void (*lpc_compute_autocorr)(const int32_t *data, int len, int lag, double *autoc);
    /* assume len is a multiple of 8, and arrays are 16-byte aligned */
    void (*vector_fmul)(float *dst, const float *src, int len);
    void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len);
    /* assume len is a multiple of 8, and src arrays are 16-byte aligned */
    void (*vector_fmul_add)(float *dst, const float *src0, const float *src1, const float *src2, int len);
    /* assume len is a multiple of 4, and arrays are 16-byte aligned */
    void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len);
    /* assume len is a multiple of 8, and arrays are 16-byte aligned */
    void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
    void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
    /**
     * Multiply a vector of floats by a scalar float. Source and
     * destination vectors must overlap exactly or not at all.
     * @param dst result vector, 16-byte aligned
     * @param src input vector, 16-byte aligned
     * @param mul scalar value
     * @param len length of vector, multiple of 4
     */
    void (*vector_fmul_scalar)(float *dst, const float *src, float mul,
                               int len);
    /**
     * Multiply a vector of floats by concatenated short vectors of
     * floats and by a scalar float. Source and destination vectors
     * must overlap exactly or not at all.
     * [0]: short vectors of length 2, 8-byte aligned
     * [1]: short vectors of length 4, 16-byte aligned
     * @param dst output vector, 16-byte aligned
     * @param src input vector, 16-byte aligned
     * @param sv  array of pointers to short vectors
     * @param mul scalar value
     * @param len number of elements in src and dst, multiple of 4
     */
    void (*vector_fmul_sv_scalar[2])(float *dst, const float *src,
                                     const float **sv, float mul, int len);
    /**
     * Multiply short vectors of floats by a scalar float, store
     * concatenated result.
     * [0]: short vectors of length 2, 8-byte aligned
     * [1]: short vectors of length 4, 16-byte aligned
     * @param dst output vector, 16-byte aligned
     * @param sv  array of pointers to short vectors
     * @param mul scalar value
     * @param len number of output elements, multiple of 4
     */
    void (*sv_fmul_scalar[2])(float *dst, const float **sv,
                              float mul, int len);
    /**
     * Calculate the scalar product of two vectors of floats.
     * @param v1  first vector, 16-byte aligned
     * @param v2  second vector, 16-byte aligned
     * @param len length of vectors, multiple of 4
     */
    float (*scalarproduct_float)(const float *v1, const float *v2, int len);
    /**
     * Calculate the sum and difference of two vectors of floats.
     * @param v1  first input vector, sum output, 16-byte aligned
     * @param v2  second input vector, difference output, 16-byte aligned
     * @param len length of vectors, multiple of 4
     */
    void (*butterflies_float)(float *restrict v1, float *restrict v2, int len);

    /* C version: convert floats from the range [384.0,386.0] to ints in [-32768,32767]
     * SIMD versions: convert floats from [-32768.0,32767.0] without rescaling; arrays are 16-byte aligned */
    void (*float_to_int16)(int16_t *dst, const float *src, long len);
    void (*float_to_int16_interleave)(int16_t *dst, const float **src, long len, int channels);

    /* (I)DCT */
    void (*fdct)(DCTELEM *block/* align 16*/);
    void (*fdct248)(DCTELEM *block/* align 16*/);

    /* IDCT, really */
    void (*idct)(DCTELEM *block/* align 16*/);

    /**
     * block -> idct -> clip to unsigned 8 bit -> dest.
     * (-1392, 0, 0, ...) -> idct -> (-174, -174, ...) -> put -> (0, 0, ...)
     * @param line_size size in bytes of a horizontal line of dest
     */
    void (*idct_put)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);

    /**
     * block -> idct -> add dest -> clip to unsigned 8 bit -> dest.
     * @param line_size size in bytes of a horizontal line of dest
     */
    void (*idct_add)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);

    /**
     * IDCT input permutation.
     * Several optimized IDCTs need a permuted input (relative to the normal
     * order of the reference IDCT).
     * This permutation must be performed before the idct_put/add; note that
     * normally it can be merged with the zigzag/alternate scan.<br>
     * An example to avoid confusion:
     * - (->decode coeffs -> zigzag reorder -> dequant -> reference idct ->...)
     * - (x -> reference dct -> reference idct -> x)
     * - (x -> reference dct -> simple_mmx_perm = idct_permutation -> simple_idct_mmx -> x)
     * - (->decode coeffs -> zigzag reorder -> simple_mmx_perm -> dequant -> simple_idct_mmx ->...)
     */
    uint8_t idct_permutation[64];
    int idct_permutation_type;
#define FF_NO_IDCT_PERM 1
#define FF_LIBMPEG2_IDCT_PERM 2
#define FF_SIMPLE_IDCT_PERM 3
#define FF_TRANSPOSE_IDCT_PERM 4
#define FF_PARTTRANS_IDCT_PERM 5
#define FF_SSE2_IDCT_PERM 6
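
    /* For illustration (not part of the original header): a decoder that
     * stores coefficients directly in permuted order combines the scan table
     * and the IDCT permutation roughly like this (sketch; names other than
     * idct_permutation and ff_zigzag_direct are hypothetical):
     *
     *     int idx = ff_zigzag_direct[pos];        // pos is the scan position
     *     block[c->idct_permutation[idx]] = coeff;
     *
     * which is exactly what ScanTable.permutated precomputes.
     */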

    int (*try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale);
    void (*add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale);
#define BASIS_SHIFT 16
#define RECON_SHIFT 6

    void (*draw_edges)(uint8_t *buf, int wrap, int width, int height, int w);
#define EDGE_WIDTH 16

    void (*prefetch)(void *mem, int stride, int h);

    void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);

    /* mlp/truehd functions */
    void (*mlp_filter_channel)(int32_t *state, const int32_t *coeff,
                               int firorder, int iirorder,
                               unsigned int filter_shift, int32_t mask, int blocksize,
                               int32_t *sample_buffer);

    /* vc1 functions */
    void (*vc1_inv_trans_8x8)(DCTELEM *b);
    void (*vc1_inv_trans_8x4)(uint8_t *dest, int line_size, DCTELEM *block);
    void (*vc1_inv_trans_4x8)(uint8_t *dest, int line_size, DCTELEM *block);
    void (*vc1_inv_trans_4x4)(uint8_t *dest, int line_size, DCTELEM *block);
    void (*vc1_inv_trans_8x8_dc)(uint8_t *dest, int line_size, DCTELEM *block);
    void (*vc1_inv_trans_8x4_dc)(uint8_t *dest, int line_size, DCTELEM *block);
    void (*vc1_inv_trans_4x8_dc)(uint8_t *dest, int line_size, DCTELEM *block);
    void (*vc1_inv_trans_4x4_dc)(uint8_t *dest, int line_size, DCTELEM *block);
    void (*vc1_v_overlap)(uint8_t* src, int stride);
    void (*vc1_h_overlap)(uint8_t* src, int stride);
    void (*vc1_v_loop_filter4)(uint8_t *src, int stride, int pq);
    void (*vc1_h_loop_filter4)(uint8_t *src, int stride, int pq);
    void (*vc1_v_loop_filter8)(uint8_t *src, int stride, int pq);
    void (*vc1_h_loop_filter8)(uint8_t *src, int stride, int pq);
    void (*vc1_v_loop_filter16)(uint8_t *src, int stride, int pq);
    void (*vc1_h_loop_filter16)(uint8_t *src, int stride, int pq);
    /* put 8x8 block with bicubic interpolation and quarterpel precision
     * last argument is actually round value instead of height
     */
    op_pixels_func put_vc1_mspel_pixels_tab[16];
    op_pixels_func avg_vc1_mspel_pixels_tab[16];

    /* intrax8 functions */
    void (*x8_spatial_compensation[12])(uint8_t *src , uint8_t *dst, int linesize);
    void (*x8_setup_spatial_compensation)(uint8_t *src, uint8_t *dst, int linesize,
                                          int * range, int * sum, int edges);

    /**
     * Calculate scalar product of two vectors.
     * @param len length of vectors, should be multiple of 16
     * @param shift number of bits to discard from product
     */
    int32_t (*scalarproduct_int16)(int16_t *v1, int16_t *v2/*align 16*/, int len, int shift);
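    /* For illustration (not part of the original header): the reference C
     * implementation in dsputil.c computes, in essence (sketch; treat the
     * per-product placement of the shift as an assumption):
     *
     *     int32_t res = 0;
     *     for (i = 0; i < len; i++)
     *         res += (v1[i] * v2[i]) >> shift;
     *     return res;
     */
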
    /* ape functions */
    /**
     * Calculate scalar product of v1 and v2,
     * and v1[i] += v3[i] * mul
     * @param len length of vectors, should be multiple of 16
     */
    int32_t (*scalarproduct_and_madd_int16)(int16_t *v1/*align 16*/, int16_t *v2, int16_t *v3, int len, int mul);

    /* rv30 functions */
    qpel_mc_func put_rv30_tpel_pixels_tab[4][16];
    qpel_mc_func avg_rv30_tpel_pixels_tab[4][16];

    /* rv40 functions */
    qpel_mc_func put_rv40_qpel_pixels_tab[4][16];
    qpel_mc_func avg_rv40_qpel_pixels_tab[4][16];
    h264_chroma_mc_func put_rv40_chroma_pixels_tab[3];
    h264_chroma_mc_func avg_rv40_chroma_pixels_tab[3];

    /* bink functions */
    op_fill_func fill_block_tab[2];
    void (*scale_block)(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize);
} DSPContext;

void dsputil_static_init(void);
void dsputil_init(DSPContext* p, AVCodecContext *avctx);

int ff_check_alignment(void);
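
/* For illustration (not part of the original header): a codec typically fills
 * a DSPContext once at open time and then calls through it. A minimal sketch,
 * assuming an AVCodecContext *avctx from the caller:
 *
 *     DSPContext dsp;
 *     dsputil_init(&dsp, avctx);       // picks C or platform-optimized versions
 *     dsp.vector_fmul(dst, src, 256);  // len a multiple of 8, buffers 16-byte aligned
 */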

/**
 * Permute block according to permutation.
 * @param last last non-zero element in scantable order
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last);

void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type);

#define BYTE_VEC32(c) ((c)*0x01010101UL)

static inline uint32_t rnd_avg32(uint32_t a, uint32_t b)
{
    return (a | b) - (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
}

static inline uint32_t no_rnd_avg32(uint32_t a, uint32_t b)
{
    return (a & b) + (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
}
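
/* For illustration (not part of the original header): both helpers average
 * four packed bytes at once without unpacking, using the identities
 * (a+b)>>1 == (a&b) + ((a^b)>>1) (round down) and
 * (a+b+1)>>1 == (a|b) - ((a^b)>>1) (round up); clearing the low bit of each
 * byte in a^b before the shift keeps one byte's bits from leaking into its
 * neighbor. Worked per-byte example:
 *
 *     rnd_avg32(0x01030507, 0x03050709) == 0x02040608
 */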

static inline int get_penalty_factor(int lambda, int lambda2, int type){
    switch(type&0xFF){
    default:
    case FF_CMP_SAD:
        return lambda>>FF_LAMBDA_SHIFT;
    case FF_CMP_DCT:
        return (3*lambda)>>(FF_LAMBDA_SHIFT+1);
    case FF_CMP_W53:
        return (4*lambda)>>(FF_LAMBDA_SHIFT);
    case FF_CMP_W97:
        return (2*lambda)>>(FF_LAMBDA_SHIFT);
    case FF_CMP_SATD:
    case FF_CMP_DCT264:
        return (2*lambda)>>FF_LAMBDA_SHIFT;
    case FF_CMP_RD:
    case FF_CMP_PSNR:
    case FF_CMP_SSE:
    case FF_CMP_NSSE:
        return lambda2>>FF_LAMBDA_SHIFT;
    case FF_CMP_BIT:
        return 1;
    }
}

/**
 * Empty MMX state.
 * This must be called between any dsp function and float/double code,
 * for example sin(); dsp->idct_put(); emms_c(); cos()
 */
#define emms_c()

/* should be defined by architectures supporting
   one or more MultiMedia extensions */
int mm_support(void);
extern int mm_flags;

void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_arm(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_bfin(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_mlib(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_mmi(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx);

void ff_dsputil_init_dwt(DSPContext *c);
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);
void ff_mlp_init_x86(DSPContext* c, AVCodecContext *avctx);

#if HAVE_MMX

#undef emms_c

static inline void emms(void)
{
    __asm__ volatile ("emms;":::"memory");
}


#define emms_c() \
{\
    if (mm_flags & FF_MM_MMX)\
        emms();\
}

#elif ARCH_ARM

#if HAVE_NEON
# define STRIDE_ALIGN 16
#endif

#elif ARCH_PPC

#define STRIDE_ALIGN 16

#elif HAVE_MMI

#define STRIDE_ALIGN 16

#else

#define mm_flags 0
#define mm_support() 0

#endif

#ifndef STRIDE_ALIGN
# define STRIDE_ALIGN 8
#endif

#define LOCAL_ALIGNED(a, t, v, s, ...) \
    uint8_t la_##v[sizeof(t s __VA_ARGS__) + (a)]; \
    t (*v) __VA_ARGS__ = (void *)FFALIGN((uintptr_t)la_##v, a)

#if HAVE_LOCAL_ALIGNED_8
# define LOCAL_ALIGNED_8(t, v, s, ...) DECLARE_ALIGNED(8, t, v) s __VA_ARGS__
#else
# define LOCAL_ALIGNED_8(t, v, s, ...) LOCAL_ALIGNED(8, t, v, s, __VA_ARGS__)
#endif

#if HAVE_LOCAL_ALIGNED_16
# define LOCAL_ALIGNED_16(t, v, s, ...) DECLARE_ALIGNED(16, t, v) s __VA_ARGS__
#else
# define LOCAL_ALIGNED_16(t, v, s, ...) LOCAL_ALIGNED(16, t, v, s, __VA_ARGS__)
#endif
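
/* For illustration (not part of the original header): LOCAL_ALIGNED_16
 * declares an on-stack array whose storage is 16-byte aligned even when the
 * compiler cannot guarantee stack alignment, e.g. a scratch DCT block:
 *
 *     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
 *     dsp.clear_block(temp);   // 'dsp' assumed initialized via dsputil_init()
 */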

/* PSNR */
void get_psnr(uint8_t *orig_image[3], uint8_t *coded_image[3],
              int orig_linesize[3], int coded_linesize,
              AVCodecContext *avctx);

#define WRAPPER8_16(name8, name16)\
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    return name8(s, dst  , src  , stride, h)\
          +name8(s, dst+8, src+8, stride, h);\
}

#define WRAPPER8_16_SQ(name8, name16)\
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    int score=0;\
    score +=name8(s, dst  , src  , stride, 8);\
    score +=name8(s, dst+8, src+8, stride, 8);\
    if(h==16){\
        dst += 8*stride;\
        src += 8*stride;\
        score +=name8(s, dst  , src  , stride, 8);\
        score +=name8(s, dst+8, src+8, stride, 8);\
    }\
    return score;\
}
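
/* For illustration (not part of the original header): WRAPPER8_16_SQ builds a
 * 16x16 comparison function from an 8x8 one by summing the four quadrant
 * scores; the names below are hypothetical:
 *
 *     WRAPPER8_16_SQ(my_sad8x8_c, my_sad16x16_c)
 *
 * defines my_sad16x16_c(), usable wherever an me_cmp_func for width 16 is
 * expected.
 */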


/* The copy_blockN helpers below copy an N-pixel-wide block of height h, using
 * the unaligned-safe AV_RNxx/AV_WNxx macros from libavutil/intreadwrite.h. */
static inline void copy_block2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        AV_WN16(dst   , AV_RN16(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}

static inline void copy_block4(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        AV_WN32(dst   , AV_RN32(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}

static inline void copy_block8(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        AV_WN32(dst   , AV_RN32(src   ));
        AV_WN32(dst+4 , AV_RN32(src+4 ));
        dst+=dstStride;
        src+=srcStride;
    }
}

static inline void copy_block9(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        AV_WN32(dst   , AV_RN32(src   ));
        AV_WN32(dst+4 , AV_RN32(src+4 ));
        dst[8]= src[8];
        dst+=dstStride;
        src+=srcStride;
    }
}

static inline void copy_block16(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        AV_WN32(dst   , AV_RN32(src   ));
        AV_WN32(dst+4 , AV_RN32(src+4 ));
        AV_WN32(dst+8 , AV_RN32(src+8 ));
        AV_WN32(dst+12, AV_RN32(src+12));
        dst+=dstStride;
        src+=srcStride;
    }
}

static inline void copy_block17(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        AV_WN32(dst   , AV_RN32(src   ));
        AV_WN32(dst+4 , AV_RN32(src+4 ));
        AV_WN32(dst+8 , AV_RN32(src+8 ));
        AV_WN32(dst+12, AV_RN32(src+12));
        dst[16]= src[16];
        dst+=dstStride;
        src+=srcStride;
    }
}

#endif /* AVCODEC_DSPUTIL_H */