summaryrefslogtreecommitdiff
path: root/lib/rbcodec/codecs/libopus/celt/arm
diff options
context:
space:
mode:
Diffstat (limited to 'lib/rbcodec/codecs/libopus/celt/arm')
-rwxr-xr-xlib/rbcodec/codecs/libopus/celt/arm/arm2gnu.pl353
-rw-r--r--lib/rbcodec/codecs/libopus/celt/arm/arm_celt_map.c160
-rw-r--r--lib/rbcodec/codecs/libopus/celt/arm/armcpu.c185
-rw-r--r--lib/rbcodec/codecs/libopus/celt/arm/armcpu.h77
-rw-r--r--lib/rbcodec/codecs/libopus/celt/arm/armopts.s.in37
-rw-r--r--lib/rbcodec/codecs/libopus/celt/arm/celt_fft_ne10.c173
-rw-r--r--lib/rbcodec/codecs/libopus/celt/arm/celt_mdct_ne10.c258
-rw-r--r--lib/rbcodec/codecs/libopus/celt/arm/celt_neon_intr.c211
-rw-r--r--lib/rbcodec/codecs/libopus/celt/arm/celt_pitch_xcorr_arm.s551
-rw-r--r--lib/rbcodec/codecs/libopus/celt/arm/fft_arm.h71
-rw-r--r--lib/rbcodec/codecs/libopus/celt/arm/fixed_arm64.h35
-rw-r--r--lib/rbcodec/codecs/libopus/celt/arm/fixed_armv4.h6
-rw-r--r--lib/rbcodec/codecs/libopus/celt/arm/fixed_armv5e.h4
-rw-r--r--lib/rbcodec/codecs/libopus/celt/arm/mdct_arm.h59
-rw-r--r--lib/rbcodec/codecs/libopus/celt/arm/pitch_arm.h160
-rw-r--r--lib/rbcodec/codecs/libopus/celt/arm/pitch_neon_intr.c290
16 files changed, 2625 insertions, 5 deletions
diff --git a/lib/rbcodec/codecs/libopus/celt/arm/arm2gnu.pl b/lib/rbcodec/codecs/libopus/celt/arm/arm2gnu.pl
new file mode 100755
index 0000000000..a2895f7445
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/arm/arm2gnu.pl
@@ -0,0 +1,353 @@
#!/usr/bin/perl
# Copyright (C) 2002-2013 Xiph.org Foundation
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# arm2gnu.pl: translate ARM RVCT assembly (read from stdin or the files
# named on the command line) into GNU assembler (gas) syntax on stdout.
#
# Options:
#   -n        parse only; suppress printing of the translated output
#   --apple   target Apple's toolchain: prefix global symbols with "_" and
#             omit ELF-only directives (.type/.size/.note.GNU-stack).

my $bigend;  # little/big endian
my $nxstack;
my $apple = 0;
my $symprefix = "";

$nxstack = 0;

eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}'
    if $running_under_some_shell;

while ($ARGV[0] =~ /^-/) {
    $_ = shift;
    last if /^--$/;
    if (/^-n$/) {
        $nflag++;
        next;
    }
    if (/^--apple$/) {
        $apple = 1;
        $symprefix = "_";
        next;
    }
    # Fixed: the original used "\\n", which printed a literal backslash-n
    # instead of terminating the message with a newline.
    die "I don't recognize this switch: $_\n";
}
$printit++ unless $nflag;

$\ = "\n";      # automatically add newline on print
$n=0;

$thumb = 0;     # ARM mode by default, not Thumb.
@proc_stack = ();

printf (" .syntax unified\n");

LINE:
while (<>) {

    # For ADRLs we need to add a new line after the substituted one.
    $addPadding = 0;

    # First, we do not dare to touch *anything* inside double quotes, do we?
    # Second, if you want a dollar character in the string,
    # insert two of them -- that's how ARM C and assembler treat strings.
    s/^([A-Za-z_]\w*)[ \t]+DCB[ \t]*\"/$1: .ascii \"/ && do { s/\$\$/\$/g; next };
    s/\bDCB\b[ \t]*\"/.ascii \"/ && do { s/\$\$/\$/g; next };
    s/^(\S+)\s+RN\s+(\S+)/$1 .req r$2/ && do { s/\$\$/\$/g; next };
    # If there's nothing on a line but a comment, don't try to apply any further
    # substitutions (this is a cheap hack to avoid mucking up the license header)
    s/^([ \t]*);/$1@/ && do { s/\$\$/\$/g; next };
    # If substituted -- leave immediately !

    s/@/,:/;
    s/;/@/;
    while ( /@.*'/ ) {
        s/(@.*)'/$1/g;
    }
    s/\{FALSE\}/0/g;
    s/\{TRUE\}/1/g;
    s/\{(\w\w\w\w+)\}/$1/g;
    s/\bINCLUDE[ \t]*([^ \t\n]+)/.include \"$1\"/;
    s/\bGET[ \t]*([^ \t\n]+)/.include \"${ my $x=$1; $x =~ s|\.s|-gnu.S|; \$x }\"/;
    s/\bIMPORT\b/.extern/;
    s/\bEXPORT\b\s*/.global $symprefix/;
    s/^(\s+)\[/$1IF/;
    s/^(\s+)\|/$1ELSE/;
    s/^(\s+)\]/$1ENDIF/;
    s/IF *:DEF:/ .ifdef/;
    s/IF *:LNOT: *:DEF:/ .ifndef/;
    s/ELSE/ .else/;
    s/ENDIF/ .endif/;

    if( /\bIF\b/ ) {
        s/\bIF\b/ .if/;
        s/=/==/;
    }
    if ( $n == 2) {
        s/\$/\\/g;
    }
    if ($n == 1) {
        s/\$//g;
        s/label//g;
        $n = 2;
    }
    if ( /MACRO/ ) {
        s/MACRO *\n/.macro/;
        $n=1;
    }
    if ( /\bMEND\b/ ) {
        s/\bMEND\b/.endm/;
        $n=0;
    }

    # ".rdata" doesn't work in 'as' version 2.13.2, as it is ".rodata" there.
    #
    if ( /\bAREA\b/ ) {
        my $align;
        $align = "2";
        if ( /ALIGN=(\d+)/ ) {
            $align = $1;
        }
        if ( /CODE/ ) {
            $nxstack = 1;
        }
        s/^(.+)CODE(.+)READONLY(.*)/ .text/;
        s/^(.+)DATA(.+)READONLY(.*)/ .section .rdata/;
        s/^(.+)\|\|\.data\|\|(.+)/ .data/;
        s/^(.+)\|\|\.bss\|\|(.+)/ .bss/;
        s/$/; .p2align $align/;
        # Enable NEON instructions but don't produce a binary that requires
        # ARMv7. RVCT does not have equivalent directives, so we just do this
        # for all CODE areas.
        if ( /.text/ ) {
            # Separating .arch, .fpu, etc., by semicolons does not work (gas
            # thinks the semicolon is part of the arch name, even when there's
            # whitespace separating them). Sadly this means our line numbers
            # won't match the original source file (we could use the .line
            # directive, which is documented to be obsolete, but then gdb will
            # show the wrong line in the translated source file).
            s/$/; .arch armv7-a\n .fpu neon\n .object_arch armv4t/ unless ($apple);
        }
    }

    s/\|\|\.constdata\$(\d+)\|\|/.L_CONST$1/;       # ||.constdata$3||
    s/\|\|\.bss\$(\d+)\|\|/.L_BSS$1/;               # ||.bss$2||
    s/\|\|\.data\$(\d+)\|\|/.L_DATA$1/;             # ||.data$2||
    s/\|\|([a-zA-Z0-9_]+)\@([a-zA-Z0-9_]+)\|\|/@ $&/;
    s/^(\s+)\%(\s)/ .space $1/;

    s/\|(.+)\.(\d+)\|/\.$1_$2/;                     # |L80.123| -> .L80_123
    s/\bCODE32\b/.code 32/ && do {$thumb = 0};
    s/\bCODE16\b/.code 16/ && do {$thumb = 1};
    if (/\bPROC\b/)
    {
        my $prefix;
        my $proc;
        /^([A-Za-z_\.]\w+)\b/;
        $proc = $1;
        $prefix = "";
        if ($proc)
        {
            $prefix = $prefix.sprintf("\t.type\t%s, %%function", $proc) unless ($apple);
            # Make sure $prefix isn't empty here (for the $apple case).
            # We handle mangling the label here, make sure it doesn't match
            # the label handling below (if $prefix would be empty).
            $prefix = $prefix."; ";
            push(@proc_stack, $proc);
            s/^[A-Za-z_\.]\w+/$symprefix$&:/;
        }
        $prefix = $prefix."\t.thumb_func; " if ($thumb);
        s/\bPROC\b/@ $&/;
        $_ = $prefix.$_;
    }
    s/^(\s*)(S|Q|SH|U|UQ|UH)ASX\b/$1$2ADDSUBX/;
    s/^(\s*)(S|Q|SH|U|UQ|UH)SAX\b/$1$2SUBADDX/;
    if (/\bENDP\b/)
    {
        my $proc;
        s/\bENDP\b/@ $&/;
        $proc = pop(@proc_stack);
        $_ = "\t.size $proc, .-$proc".$_ if ($proc && !$apple);
    }
    s/\bSUBT\b/@ $&/;
    s/\bDATA\b/@ $&/;   # DATA directive is deprecated -- Asm guide, p.7-25
    s/\bKEEP\b/@ $&/;
    s/\bEXPORTAS\b/@ $&/;
    s/\|\|(.)+\bEQU\b/@ $&/;
    s/\|\|([\w\$]+)\|\|/$1/;
    s/\bENTRY\b/@ $&/;
    s/\bASSERT\b/@ $&/;
    s/\bGBLL\b/@ $&/;
    s/\bGBLA\b/@ $&/;
    s/^\W+OPT\b/@ $&/;
    s/:OR:/|/g;
    s/:SHL:/<</g;
    s/:SHR:/>>/g;
    s/:AND:/&/g;
    s/:LAND:/&&/g;
    s/CPSR/cpsr/;
    s/SPSR/spsr/;
    s/ALIGN$/.balign 4/;
    s/ALIGN\s+([0-9x]+)$/.balign $1/;
    s/psr_cxsf/psr_all/;
    s/LTORG/.ltorg/;
    s/^([A-Za-z_]\w*)[ \t]+EQU/ .set $1,/;
    s/^([A-Za-z_]\w*)[ \t]+SETL/ .set $1,/;
    s/^([A-Za-z_]\w*)[ \t]+SETA/ .set $1,/;
    s/^([A-Za-z_]\w*)[ \t]+\*/ .set $1,/;

    # {PC} + 0xdeadfeed --> . + 0xdeadfeed
    s/\{PC\} \+/ \. +/;

    # Single hex constant on the line !
    #
    # >>> NOTE <<<
    # Double-precision floats in gcc are always mixed-endian, which means
    # bytes in two words are little-endian, but words are big-endian.
    # So, 0x0000deadfeed0000 would be stored as 0x0000dead at low address
    # and 0xfeed0000 at high address.
    #
    s/\bDCFD\b[ \t]+0x([a-fA-F0-9]{8})([a-fA-F0-9]{8})/.long 0x$1, 0x$2/;
    # Only decimal constants on the line, no hex !
    s/\bDCFD\b[ \t]+([0-9\.\-]+)/.double $1/;

    # Single hex constant on the line !
#    s/\bDCFS\b[ \t]+0x([a-f0-9]{8})([a-f0-9]{8})/.long 0x$1, 0x$2/;
    # Only decimal constants on the line, no hex !
#    s/\bDCFS\b[ \t]+([0-9\.\-]+)/.double $1/;
    s/\bDCFS[ \t]+0x/.word 0x/;
    s/\bDCFS\b/.float/;

    s/^([A-Za-z_]\w*)[ \t]+DCD/$1 .word/;
    s/\bDCD\b/.word/;
    s/^([A-Za-z_]\w*)[ \t]+DCW/$1 .short/;
    s/\bDCW\b/.short/;
    s/^([A-Za-z_]\w*)[ \t]+DCB/$1 .byte/;
    s/\bDCB\b/.byte/;
    s/^([A-Za-z_]\w*)[ \t]+\%/.comm $1,/;
    s/^[A-Za-z_\.]\w+/$&:/;
    s/^(\d+)/$1:/;
    s/\%(\d+)/$1b_or_f/;
    s/\%[Bb](\d+)/$1b/;
    s/\%[Ff](\d+)/$1f/;
    s/\%[Ff][Tt](\d+)/$1f/;
    s/&([\dA-Fa-f]+)/0x$1/;
    # Translate RVCT binary literals (2_0101...) to hex, one nibble at a time,
    # using "&&&&" as a scratch marker that cannot appear in real assembly.
    if ( /\b2_[01]+\b/ ) {
        s/\b2_([01]+)\b/conv$1&&&&/g;
        while ( /[01][01][01][01]&&&&/ ) {
            s/0000&&&&/&&&&0/g;
            s/0001&&&&/&&&&1/g;
            s/0010&&&&/&&&&2/g;
            s/0011&&&&/&&&&3/g;
            s/0100&&&&/&&&&4/g;
            s/0101&&&&/&&&&5/g;
            s/0110&&&&/&&&&6/g;
            s/0111&&&&/&&&&7/g;
            s/1000&&&&/&&&&8/g;
            s/1001&&&&/&&&&9/g;
            s/1010&&&&/&&&&A/g;
            s/1011&&&&/&&&&B/g;
            s/1100&&&&/&&&&C/g;
            s/1101&&&&/&&&&D/g;
            s/1110&&&&/&&&&E/g;
            s/1111&&&&/&&&&F/g;
        }
        s/000&&&&/&&&&0/g;
        s/001&&&&/&&&&1/g;
        s/010&&&&/&&&&2/g;
        s/011&&&&/&&&&3/g;
        s/100&&&&/&&&&4/g;
        s/101&&&&/&&&&5/g;
        s/110&&&&/&&&&6/g;
        s/111&&&&/&&&&7/g;
        s/00&&&&/&&&&0/g;
        s/01&&&&/&&&&1/g;
        s/10&&&&/&&&&2/g;
        s/11&&&&/&&&&3/g;
        s/0&&&&/&&&&0/g;
        s/1&&&&/&&&&1/g;
        s/conv&&&&/0x/g;
    }

    if ( /commandline/)
    {
        if( /-bigend/)
        {
            $bigend=1;
        }
    }

    if ( /\bDCDU\b/ )
    {
        my $cmd=$_;
        my $value;
        my $prefix;
        my $w1;
        my $w2;
        my $w3;
        my $w4;

        s/\s+DCDU\b/@ $&/;

        $cmd =~ /\bDCDU\b\s+0x(\d+)/;
        $value = $1;
        $value =~ /(\w\w)(\w\w)(\w\w)(\w\w)/;
        $w1 = $1;
        $w2 = $2;
        $w3 = $3;
        $w4 = $4;

        if( $bigend ne "")
        {
            # big endian
            $prefix = "\t.byte\t0x".$w1.";".
                      "\t.byte\t0x".$w2.";".
                      "\t.byte\t0x".$w3.";".
                      "\t.byte\t0x".$w4."; ";
        }
        else
        {
            # little endian
            $prefix = "\t.byte\t0x".$w4.";".
                      "\t.byte\t0x".$w3.";".
                      "\t.byte\t0x".$w2.";".
                      "\t.byte\t0x".$w1."; ";
        }
        $_=$prefix.$_;
    }

    if ( /\badrl\b/i )
    {
        s/\badrl\s+(\w+)\s*,\s*(\w+)/ldr $1,=$2/i;
        $addPadding = 1;
    }
    s/\bEND\b/@ END/;
} continue {
    printf ("%s", $_) if $printit;
    if ($addPadding != 0)
    {
        printf (" mov r0,r0\n");
        $addPadding = 0;
    }
}
#If we had a code section, mark that this object doesn't need an executable
# stack.
if ($nxstack && !$apple) {
    printf (" .section\t.note.GNU-stack,\"\",\%\%progbits\n");
}
diff --git a/lib/rbcodec/codecs/libopus/celt/arm/arm_celt_map.c b/lib/rbcodec/codecs/libopus/celt/arm/arm_celt_map.c
new file mode 100644
index 0000000000..ca988b66f5
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/arm/arm_celt_map.c
@@ -0,0 +1,160 @@
1/* Copyright (c) 2010 Xiph.Org Foundation
2 * Copyright (c) 2013 Parrot */
3/*
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*/
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
32#include "pitch.h"
33#include "kiss_fft.h"
34#include "mdct.h"
35
#if defined(OPUS_HAVE_RTCD)

/* Run-time CPU detection (RTCD) dispatch tables for the CELT ARM port.
 * Each table is indexed by the arch value selected at runtime (masked with
 * OPUS_ARCHMASK); entry order is ARMv4, EDSP, Media, NEON and must stay in
 * sync with the OPUS_ARCH_ARM_* constants in armcpu.h -- do NOT reorder. */

# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
/* Inner products: only the NEON slot has an optimized implementation;
 * all lower tiers fall back to the portable C version. */
opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N) = {
  celt_inner_prod_c, /* ARMv4 */
  celt_inner_prod_c, /* EDSP */
  celt_inner_prod_c, /* Media */
  celt_inner_prod_neon /* NEON */
};

void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
      int N, opus_val32 *xy1, opus_val32 *xy2) = {
  dual_inner_prod_c, /* ARMv4 */
  dual_inner_prod_c, /* EDSP */
  dual_inner_prod_c, /* Media */
  dual_inner_prod_neon /* NEON */
};
# endif

# if defined(FIXED_POINT)
/* Fixed-point build: assembly pitch xcorr exists per tier; the MAY_HAVE_*
 * macros (armcpu.h) resolve each slot to the best variant compiled in. */
# if ((defined(OPUS_ARM_MAY_HAVE_NEON) && !defined(OPUS_ARM_PRESUME_NEON)) || \
 (defined(OPUS_ARM_MAY_HAVE_MEDIA) && !defined(OPUS_ARM_PRESUME_MEDIA)) || \
 (defined(OPUS_ARM_MAY_HAVE_EDSP) && !defined(OPUS_ARM_PRESUME_EDSP)))
opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
    const opus_val16 *, opus_val32 *, int, int, int) = {
  celt_pitch_xcorr_c, /* ARMv4 */
  MAY_HAVE_EDSP(celt_pitch_xcorr), /* EDSP */
  MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */
  MAY_HAVE_NEON(celt_pitch_xcorr) /* NEON */
};

# endif
# else /* !FIXED_POINT */
/* Float build: only a NEON intrinsics variant exists; note the float table's
 * function pointers return void, unlike the fixed-point table above. */
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
    const opus_val16 *, opus_val32 *, int, int, int) = {
  celt_pitch_xcorr_c, /* ARMv4 */
  celt_pitch_xcorr_c, /* EDSP */
  celt_pitch_xcorr_c, /* Media */
  celt_pitch_xcorr_float_neon /* Neon */
};
# endif
# endif /* FIXED_POINT */

#if defined(FIXED_POINT) && defined(OPUS_HAVE_RTCD) && \
 defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)

/* Fixed-point xcorr kernel: NEON intrinsics variant only. */
void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
    const opus_val16 *x,
    const opus_val16 *y,
    opus_val32 sum[4],
    int len
) = {
  xcorr_kernel_c, /* ARMv4 */
  xcorr_kernel_c, /* EDSP */
  xcorr_kernel_c, /* Media */
  xcorr_kernel_neon_fixed, /* Neon */
};

#endif

# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
# if defined(HAVE_ARM_NE10)
/* FFT/MDCT dispatch: NEON slots route through the NE10 library wrappers
 * (celt_fft_ne10.c / celt_mdct_ne10.c); everything else uses the C kiss_fft
 * path. Alloc/free wrappers are only needed when CUSTOM_MODES can create
 * new FFT states at runtime. */
# if defined(CUSTOM_MODES)
int (*const OPUS_FFT_ALLOC_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = {
  opus_fft_alloc_arch_c, /* ARMv4 */
  opus_fft_alloc_arch_c, /* EDSP */
  opus_fft_alloc_arch_c, /* Media */
  opus_fft_alloc_arm_neon /* Neon with NE10 library support */
};

void (*const OPUS_FFT_FREE_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = {
  opus_fft_free_arch_c, /* ARMv4 */
  opus_fft_free_arch_c, /* EDSP */
  opus_fft_free_arch_c, /* Media */
  opus_fft_free_arm_neon /* Neon with NE10 */
};
# endif /* CUSTOM_MODES */

void (*const OPUS_FFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
    const kiss_fft_cpx *fin,
    kiss_fft_cpx *fout) = {
  opus_fft_c, /* ARMv4 */
  opus_fft_c, /* EDSP */
  opus_fft_c, /* Media */
  opus_fft_neon /* Neon with NE10 */
};

void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
    const kiss_fft_cpx *fin,
    kiss_fft_cpx *fout) = {
  opus_ifft_c, /* ARMv4 */
  opus_ifft_c, /* EDSP */
  opus_ifft_c, /* Media */
  opus_ifft_neon /* Neon with NE10 */
};

void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
    kiss_fft_scalar *in,
    kiss_fft_scalar * OPUS_RESTRICT out,
    const opus_val16 *window,
    int overlap, int shift,
    int stride, int arch) = {
  clt_mdct_forward_c, /* ARMv4 */
  clt_mdct_forward_c, /* EDSP */
  clt_mdct_forward_c, /* Media */
  clt_mdct_forward_neon /* Neon with NE10 */
};

void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
    kiss_fft_scalar *in,
    kiss_fft_scalar * OPUS_RESTRICT out,
    const opus_val16 *window,
    int overlap, int shift,
    int stride, int arch) = {
  clt_mdct_backward_c, /* ARMv4 */
  clt_mdct_backward_c, /* EDSP */
  clt_mdct_backward_c, /* Media */
  clt_mdct_backward_neon /* Neon with NE10 */
};

# endif /* HAVE_ARM_NE10 */
# endif /* OPUS_ARM_MAY_HAVE_NEON_INTR */

#endif /* OPUS_HAVE_RTCD */
diff --git a/lib/rbcodec/codecs/libopus/celt/arm/armcpu.c b/lib/rbcodec/codecs/libopus/celt/arm/armcpu.c
new file mode 100644
index 0000000000..694a63b78e
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/arm/armcpu.c
@@ -0,0 +1,185 @@
1/* Copyright (c) 2010 Xiph.Org Foundation
2 * Copyright (c) 2013 Parrot */
3/*
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*/
27
28/* Original code from libtheora modified to suit to Opus */
29
30#ifdef HAVE_CONFIG_H
31#include "config.h"
32#endif
33
34#ifdef OPUS_HAVE_RTCD
35
36#include "armcpu.h"
37#include "cpu_support.h"
38#include "os_support.h"
39#include "opus_types.h"
40#include "arch.h"
41
42#define OPUS_CPU_ARM_V4_FLAG (1<<OPUS_ARCH_ARM_V4)
43#define OPUS_CPU_ARM_EDSP_FLAG (1<<OPUS_ARCH_ARM_EDSP)
44#define OPUS_CPU_ARM_MEDIA_FLAG (1<<OPUS_ARCH_ARM_MEDIA)
45#define OPUS_CPU_ARM_NEON_FLAG (1<<OPUS_ARCH_ARM_NEON)
46
#if defined(_MSC_VER)
/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
# define WIN32_LEAN_AND_MEAN
# define WIN32_EXTRA_LEAN
# include <windows.h>

/* Probe CPU features by executing one representative instruction per tier
 * inside an SEH __try block: if the CPU lacks the instruction set, the
 * illegal-instruction exception is caught and the flag is simply not set.
 * Probes are nested at the preprocessor level so a tier is only attempted
 * when some compiled-in code could actually use it.
 * Returns a bitmask of OPUS_CPU_ARM_*_FLAG values. */
static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){
  opus_uint32 flags;
  flags=0;
  /* MSVC has no OPUS_INLINE __asm support for ARM, but it does let you __emit
   * instructions via their assembled hex code.
   * All of these instructions should be essentially nops. */
# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_MEDIA) \
 || defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
  __try{
    /*PLD [r13]*/
    __emit(0xF5DDF000);
    flags|=OPUS_CPU_ARM_EDSP_FLAG;
  }
  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
    /*Ignore exception.*/
  }
# if defined(OPUS_ARM_MAY_HAVE_MEDIA) \
 || defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
  __try{
    /*SHADD8 r3,r3,r3*/
    __emit(0xE6333F93);
    flags|=OPUS_CPU_ARM_MEDIA_FLAG;
  }
  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
    /*Ignore exception.*/
  }
# if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
  __try{
    /*VORR q0,q0,q0*/
    __emit(0xF2200150);
    flags|=OPUS_CPU_ARM_NEON_FLAG;
  }
  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
    /*Ignore exception.*/
  }
# endif
# endif
# endif
  return flags;
}

#elif defined(__linux__)
/* Linux based */
/* Detect CPU features by parsing /proc/cpuinfo: the "Features" line yields
 * edsp/neon, and "CPU architecture" >= 6 implies the ARMv6 media
 * instructions. Returns a bitmask of OPUS_CPU_ARM_*_FLAG values; returns 0
 * (plain ARMv4 dispatch) if the file cannot be opened.
 * NOTE(review): relies on FILE/fopen/strstr/atoi -- presumably declared via
 * one of the headers included above (os_support.h?); confirm. */
opus_uint32 opus_cpu_capabilities(void)
{
  opus_uint32 flags = 0;
  FILE *cpuinfo;

  /* Reading /proc/self/auxv would be easier, but that doesn't work reliably on
   * Android */
  cpuinfo = fopen("/proc/cpuinfo", "r");

  if(cpuinfo != NULL)
  {
    /* 512 should be enough for anybody (it's even enough for all the flags that
     * x86 has accumulated... so far). */
    char buf[512];

    while(fgets(buf, 512, cpuinfo) != NULL)
    {
# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_MEDIA) \
 || defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
      /* Search for edsp and neon flag */
      if(memcmp(buf, "Features", 8) == 0)
      {
        char *p;
        p = strstr(buf, " edsp");
        /* p[5] check ensures " edsp" is a whole token, not a prefix. */
        if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
          flags |= OPUS_CPU_ARM_EDSP_FLAG;

# if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
        p = strstr(buf, " neon");
        if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
          flags |= OPUS_CPU_ARM_NEON_FLAG;
# endif
      }
# endif

# if defined(OPUS_ARM_MAY_HAVE_MEDIA) \
 || defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
      /* Search for media capabilities (>= ARMv6) */
      if(memcmp(buf, "CPU architecture:", 17) == 0)
      {
        int version;
        version = atoi(buf+17);

        if(version >= 6)
          flags |= OPUS_CPU_ARM_MEDIA_FLAG;
      }
# endif
    }

    fclose(cpuinfo);
  }
  return flags;
}
#else
/* The feature registers which can tell us what the processor supports are
 * accessible in priveleged modes only, so we can't have a general user-space
 * detection method like on x86.*/
# error "Configured to use ARM asm but no CPU detection method available for " \
 "your platform. Reconfigure with --disable-rtcd (or send patches)."
#endif
156
157int opus_select_arch(void)
158{
159 opus_uint32 flags = opus_cpu_capabilities();
160 int arch = 0;
161
162 if(!(flags & OPUS_CPU_ARM_EDSP_FLAG)) {
163 /* Asserts ensure arch values are sequential */
164 celt_assert(arch == OPUS_ARCH_ARM_V4);
165 return arch;
166 }
167 arch++;
168
169 if(!(flags & OPUS_CPU_ARM_MEDIA_FLAG)) {
170 celt_assert(arch == OPUS_ARCH_ARM_EDSP);
171 return arch;
172 }
173 arch++;
174
175 if(!(flags & OPUS_CPU_ARM_NEON_FLAG)) {
176 celt_assert(arch == OPUS_ARCH_ARM_MEDIA);
177 return arch;
178 }
179 arch++;
180
181 celt_assert(arch == OPUS_ARCH_ARM_NEON);
182 return arch;
183}
184
185#endif
diff --git a/lib/rbcodec/codecs/libopus/celt/arm/armcpu.h b/lib/rbcodec/codecs/libopus/celt/arm/armcpu.h
new file mode 100644
index 0000000000..820262ff5f
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/arm/armcpu.h
@@ -0,0 +1,77 @@
1/* Copyright (c) 2010 Xiph.Org Foundation
2 * Copyright (c) 2013 Parrot */
3/*
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*/
27
#if !defined(ARMCPU_H)
# define ARMCPU_H

/* Name-selection helpers for the ARM optimized implementations.
 *
 * MAY_HAVE_<TIER>(name): expands to name##_<tier> when that instruction set
 * MAY be available at runtime (compiled in), otherwise chains down to the
 * next lower tier, bottoming out at name##_c. Used to fill RTCD tables.
 *
 * PRESUME_<TIER>(name): same chain keyed on OPUS_ARM_PRESUME_*, i.e. the
 * target is guaranteed to support the tier, so the optimized variant can be
 * called directly with no runtime dispatch. */

# if defined(OPUS_ARM_MAY_HAVE_EDSP)
# define MAY_HAVE_EDSP(name) name ## _edsp
# else
# define MAY_HAVE_EDSP(name) name ## _c
# endif

# if defined(OPUS_ARM_MAY_HAVE_MEDIA)
# define MAY_HAVE_MEDIA(name) name ## _media
# else
# define MAY_HAVE_MEDIA(name) MAY_HAVE_EDSP(name)
# endif

# if defined(OPUS_ARM_MAY_HAVE_NEON)
# define MAY_HAVE_NEON(name) name ## _neon
# else
# define MAY_HAVE_NEON(name) MAY_HAVE_MEDIA(name)
# endif

# if defined(OPUS_ARM_PRESUME_EDSP)
# define PRESUME_EDSP(name) name ## _edsp
# else
# define PRESUME_EDSP(name) name ## _c
# endif

# if defined(OPUS_ARM_PRESUME_MEDIA)
# define PRESUME_MEDIA(name) name ## _media
# else
# define PRESUME_MEDIA(name) PRESUME_EDSP(name)
# endif

# if defined(OPUS_ARM_PRESUME_NEON)
# define PRESUME_NEON(name) name ## _neon
# else
# define PRESUME_NEON(name) PRESUME_MEDIA(name)
# endif

# if defined(OPUS_HAVE_RTCD)
int opus_select_arch(void);

/* Arch indices returned by opus_select_arch(); the order must match the
 * entry order of every RTCD dispatch table (see arm_celt_map.c). */
#define OPUS_ARCH_ARM_V4 (0)
#define OPUS_ARCH_ARM_EDSP (1)
#define OPUS_ARCH_ARM_MEDIA (2)
#define OPUS_ARCH_ARM_NEON (3)

# endif

#endif
diff --git a/lib/rbcodec/codecs/libopus/celt/arm/armopts.s.in b/lib/rbcodec/codecs/libopus/celt/arm/armopts.s.in
new file mode 100644
index 0000000000..3d8aaf2754
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/arm/armopts.s.in
@@ -0,0 +1,37 @@
1/* Copyright (C) 2013 Mozilla Corporation */
2/*
3 Redistribution and use in source and binary forms, with or without
4 modification, are permitted provided that the following conditions
5 are met:
6
7 - Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9
10 - Redistributions in binary form must reproduce the above copyright
11 notice, this list of conditions and the following disclaimer in the
12 documentation and/or other materials provided with the distribution.
13
14 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
18 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
22 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*/
26
27; Set the following to 1 if we have EDSP instructions
28; (LDRD/STRD, etc., ARMv5E and later).
29OPUS_ARM_MAY_HAVE_EDSP * @OPUS_ARM_MAY_HAVE_EDSP@
30
31; Set the following to 1 if we have ARMv6 media instructions.
32OPUS_ARM_MAY_HAVE_MEDIA * @OPUS_ARM_MAY_HAVE_MEDIA@
33
34; Set the following to 1 if we have NEON (some ARMv7)
35OPUS_ARM_MAY_HAVE_NEON * @OPUS_ARM_MAY_HAVE_NEON@
36
37END
diff --git a/lib/rbcodec/codecs/libopus/celt/arm/celt_fft_ne10.c b/lib/rbcodec/codecs/libopus/celt/arm/celt_fft_ne10.c
new file mode 100644
index 0000000000..ea5fd7808b
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/arm/celt_fft_ne10.c
@@ -0,0 +1,173 @@
1/* Copyright (c) 2015 Xiph.Org Foundation
2 Written by Viswanath Puttagunta */
3/**
4 @file celt_fft_ne10.c
5 @brief ARM Neon optimizations for fft using NE10 library
6 */
7
8/*
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions
11 are met:
12
13 - Redistributions of source code must retain the above copyright
14 notice, this list of conditions and the following disclaimer.
15
16 - Redistributions in binary form must reproduce the above copyright
17 notice, this list of conditions and the following disclaimer in the
18 documentation and/or other materials provided with the distribution.
19
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
24 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31*/
32
33#ifndef SKIP_CONFIG_H
34#ifdef HAVE_CONFIG_H
35#include "config.h"
36#endif
37#endif
38
39#include <NE10_dsp.h>
40#include "os_support.h"
41#include "kiss_fft.h"
42#include "stack_alloc.h"
43
/* Map the NE10 complex-to-complex FFT API onto build-type-neutral names:
 * float32 entry points for the float build, int32 for FIXED_POINT.
 * (Fix: the fixed-point branch originally #define'd NE10_FFT_DESTROY_C2C_TYPE
 * twice; the redundant duplicate line is removed.) */
#if !defined(FIXED_POINT)
# define NE10_FFT_ALLOC_C2C_TYPE_NEON ne10_fft_alloc_c2c_float32_neon
# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_float32_t
# define NE10_FFT_STATE_TYPE_T ne10_fft_state_float32_t
# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_float32
# define NE10_FFT_CPX_TYPE_T ne10_fft_cpx_float32_t
# define NE10_FFT_C2C_1D_TYPE_NEON ne10_fft_c2c_1d_float32_neon
#else
# define NE10_FFT_ALLOC_C2C_TYPE_NEON(nfft) ne10_fft_alloc_c2c_int32_neon(nfft)
# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_int32_t
# define NE10_FFT_STATE_TYPE_T ne10_fft_state_int32_t
# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_int32
# define NE10_FFT_CPX_TYPE_T ne10_fft_cpx_int32_t
# define NE10_FFT_C2C_1D_TYPE_NEON ne10_fft_c2c_1d_int32_neon
#endif
60
#if defined(CUSTOM_MODES)

/* nfft lengths for which NE10 provides a scaled fft */
# define NE10_FFTSCALED_SUPPORT_MAX 4
static const int ne10_fft_scaled_support[NE10_FFTSCALED_SUPPORT_MAX] = {
   480, 240, 120, 60
};

/* Allocate the per-arch FFT state for st. Marks the state unsupported (so
 * callers fall back to the C fft) when st->nfft is not one of the NE10
 * scaled-fft sizes. Returns 0 on success, -1 on allocation failure. */
int opus_fft_alloc_arm_neon(kiss_fft_state *st)
{
   int idx;
   int supported = 0;

   st->arch_fft = (arch_fft_state *)opus_alloc(sizeof(struct arch_fft_state));
   if (st->arch_fft == NULL)
      return -1;

   /* Is this transform size one NE10 can handle with scaling? */
   for (idx = 0; idx < NE10_FFTSCALED_SUPPORT_MAX; idx++) {
      if (st->nfft == ne10_fft_scaled_support[idx]) {
         supported = 1;
         break;
      }
   }

   if (supported) {
      st->arch_fft->is_supported = 1;
      st->arch_fft->priv = (void *)NE10_FFT_ALLOC_C2C_TYPE_NEON(st->nfft);
      if (st->arch_fft->priv == NULL)
         return -1;
   } else {
      /* This nfft length (scaled fft) is not supported in NE10 */
      st->arch_fft->is_supported = 0;
      st->arch_fft->priv = NULL;
   }
   return 0;
}

/* Release the NE10 configuration (if any) and the arch state itself. */
void opus_fft_free_arm_neon(kiss_fft_state *st)
{
   NE10_FFT_CFG_TYPE_T cfg;

   if (st->arch_fft == NULL)
      return;

   cfg = (NE10_FFT_CFG_TYPE_T)st->arch_fft->priv;
   if (cfg != NULL)
      NE10_FFT_DESTROY_C2C_TYPE(cfg);
   opus_free(st->arch_fft);
}
#endif
110
/* Forward FFT via NE10's NEON kernels; falls back to the portable C fft when
 * this transform size was marked unsupported at alloc time. A private copy of
 * the NE10 config is made on the stack so the shared state in
 * st->arch_fft->priv is never mutated (the scratch buffer pointer and, for
 * float, the scaling flag are per-call). */
void opus_fft_neon(const kiss_fft_state *st,
                   const kiss_fft_cpx *fin,
                   kiss_fft_cpx *fout)
{
   NE10_FFT_STATE_TYPE_T state;
   NE10_FFT_CFG_TYPE_T cfg = &state;
   VARDECL(NE10_FFT_CPX_TYPE_T, buffer);
   SAVE_STACK;
   ALLOC(buffer, st->nfft, NE10_FFT_CPX_TYPE_T);

   if (!st->arch_fft->is_supported) {
      /* This nfft length (scaled fft) not supported in NE10 */
      opus_fft_c(st, fin, fout);
   }
   else {
      /* Copy the prebuilt NE10 config, then point it at our stack scratch. */
      memcpy((void *)cfg, st->arch_fft->priv, sizeof(NE10_FFT_STATE_TYPE_T));
      state.buffer = (NE10_FFT_CPX_TYPE_T *)&buffer[0];
#if !defined(FIXED_POINT)
      /* Float build: request scaling on the forward transform. */
      state.is_forward_scaled = 1;

      /* Last arg 0 = forward transform. */
      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
                                (NE10_FFT_CPX_TYPE_T *)fin,
                                cfg, 0);
#else
      /* Fixed build: args are (out, in, cfg, inverse=0, scaled=1). */
      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
                                (NE10_FFT_CPX_TYPE_T *)fin,
                                cfg, 0, 1);
#endif
   }
   RESTORE_STACK;
}
142
/* Inverse FFT via NE10's NEON kernels; mirrors opus_fft_neon but runs the
 * backward transform UNscaled (float: is_backward_scaled = 0; fixed: the
 * trailing scaled argument is 0), matching the C kiss_fft convention. Falls
 * back to opus_ifft_c for unsupported sizes. */
void opus_ifft_neon(const kiss_fft_state *st,
                    const kiss_fft_cpx *fin,
                    kiss_fft_cpx *fout)
{
   NE10_FFT_STATE_TYPE_T state;
   NE10_FFT_CFG_TYPE_T cfg = &state;
   VARDECL(NE10_FFT_CPX_TYPE_T, buffer);
   SAVE_STACK;
   ALLOC(buffer, st->nfft, NE10_FFT_CPX_TYPE_T);

   if (!st->arch_fft->is_supported) {
      /* This nfft length (scaled fft) not supported in NE10 */
      opus_ifft_c(st, fin, fout);
   }
   else {
      /* Copy the prebuilt NE10 config, then point it at our stack scratch. */
      memcpy((void *)cfg, st->arch_fft->priv, sizeof(NE10_FFT_STATE_TYPE_T));
      state.buffer = (NE10_FFT_CPX_TYPE_T *)&buffer[0];
#if !defined(FIXED_POINT)
      state.is_backward_scaled = 0;

      /* Last arg 1 = inverse transform. */
      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
                                (NE10_FFT_CPX_TYPE_T *)fin,
                                cfg, 1);
#else
      /* Fixed build: args are (out, in, cfg, inverse=1, scaled=0). */
      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
                                (NE10_FFT_CPX_TYPE_T *)fin,
                                cfg, 1, 0);
#endif
   }
   RESTORE_STACK;
}
diff --git a/lib/rbcodec/codecs/libopus/celt/arm/celt_mdct_ne10.c b/lib/rbcodec/codecs/libopus/celt/arm/celt_mdct_ne10.c
new file mode 100644
index 0000000000..3531d02d10
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/arm/celt_mdct_ne10.c
@@ -0,0 +1,258 @@
1/* Copyright (c) 2015 Xiph.Org Foundation
2 Written by Viswanath Puttagunta */
3/**
4 @file celt_mdct_ne10.c
5 @brief ARM Neon optimizations for mdct using NE10 library
6 */
7
8/*
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions
11 are met:
12
13 - Redistributions of source code must retain the above copyright
14 notice, this list of conditions and the following disclaimer.
15
16 - Redistributions in binary form must reproduce the above copyright
17 notice, this list of conditions and the following disclaimer in the
18 documentation and/or other materials provided with the distribution.
19
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
24 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31*/
32
33#ifndef SKIP_CONFIG_H
34#ifdef HAVE_CONFIG_H
35#include "config.h"
36#endif
37#endif
38
39#include "kiss_fft.h"
40#include "_kiss_fft_guts.h"
41#include "mdct.h"
42#include "stack_alloc.h"
43
/* Forward MDCT, NEON/NE10 build.
 *
 * Implements the MDCT as window+fold (length N -> N/2), a complex pre-
 * rotation by the twiddles in l->trig, an N/4-point complex FFT, and a
 * post-rotation written to the strided output.
 *
 * l       - MDCT lookup (holds kfft[] states and the twiddle table)
 * in      - time-domain input (read-only in effect; not written here)
 * out     - frequency-domain output, N/2 coefficients at the given stride
 * window  - analysis window of length 'overlap'
 * overlap - overlap length (window length)
 * shift   - selects the transform size: each step halves N = l->n
 * stride  - distance between successive output coefficients
 * arch    - arch id forwarded to opus_fft()
 */
void clt_mdct_forward_neon(const mdct_lookup *l,
                           kiss_fft_scalar *in,
                           kiss_fft_scalar * OPUS_RESTRICT out,
                           const opus_val16 *window,
                           int overlap, int shift, int stride, int arch)
{
   int i;
   int N, N2, N4;
   VARDECL(kiss_fft_scalar, f);
   VARDECL(kiss_fft_cpx, f2);
   const kiss_fft_state *st = l->kfft[shift];
   const kiss_twiddle_scalar *trig;

   SAVE_STACK;

   /* Walk down to the requested transform size; the twiddle table stores
      one sub-table per shift level, laid out back to back. */
   N = l->n;
   trig = l->trig;
   for (i=0;i<shift;i++)
   {
      N >>= 1;
      trig += N;
   }
   N2 = N>>1;
   N4 = N>>2;

   ALLOC(f, N2, kiss_fft_scalar);
   ALLOC(f2, N4, kiss_fft_cpx);

   /* Consider the input to be composed of four blocks: [a, b, c, d] */
   /* Window, shuffle, fold */
   {
      /* Temp pointers to make it really clear to the compiler what we're doing */
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
      const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);
      const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
      /* First quarter of the fold: both window halves contribute. */
      for(i=0;i<((overlap+3)>>2);i++)
      {
         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
         *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2);
         *yp++ = MULT16_32_Q15(*wp1, *xp1) - MULT16_32_Q15(*wp2, xp2[-N2]);
         xp1+=2;
         xp2-=2;
         wp1+=2;
         wp2-=2;
      }
      wp1 = window;
      wp2 = window+overlap-1;
      /* Middle section: outside the overlap region, no windowing needed. */
      for(;i<N4-((overlap+3)>>2);i++)
      {
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
         *yp++ = *xp2;
         *yp++ = *xp1;
         xp1+=2;
         xp2-=2;
      }
      /* Last quarter: windowed again, with the signs folded in. */
      for(;i<N4;i++)
      {
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
         *yp++ = -MULT16_32_Q15(*wp1, xp1[-N2]) + MULT16_32_Q15(*wp2, *xp2);
         *yp++ = MULT16_32_Q15(*wp2, *xp1) + MULT16_32_Q15(*wp1, xp2[N2]);
         xp1+=2;
         xp2-=2;
         wp1+=2;
         wp2-=2;
      }
   }
   /* Pre-rotation: complex multiply each folded pair by the twiddle
      (t[i], t[N4+i]) before the FFT. */
   {
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
      const kiss_twiddle_scalar *t = &trig[0];
      for(i=0;i<N4;i++)
      {
         kiss_fft_cpx yc;
         kiss_twiddle_scalar t0, t1;
         kiss_fft_scalar re, im, yr, yi;
         t0 = t[i];
         t1 = t[N4+i];
         re = *yp++;
         im = *yp++;
         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
         yc.r = yr;
         yc.i = yi;
         f2[i] = yc;
      }
   }

   /* N/4-point complex FFT; result lands back in f (aliased as cpx). */
   opus_fft(st, f2, (kiss_fft_cpx *)f, arch);

   /* Post-rotate */
   {
      /* Temp pointers to make it really clear to the compiler what we're doing */
      const kiss_fft_cpx * OPUS_RESTRICT fp = (kiss_fft_cpx *)f;
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
      const kiss_twiddle_scalar *t = &trig[0];
      /* Writes from both ends of the output toward the middle, two strides
         at a time, interleaving the rotated real/imag parts. */
      for(i=0;i<N4;i++)
      {
         kiss_fft_scalar yr, yi;
         yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]);
         yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]);
         *yp1 = yr;
         *yp2 = yi;
         fp++;
         yp1 += 2*stride;
         yp2 -= 2*stride;
      }
   }
   RESTORE_STACK;
}
157
158void clt_mdct_backward_neon(const mdct_lookup *l,
159 kiss_fft_scalar *in,
160 kiss_fft_scalar * OPUS_RESTRICT out,
161 const opus_val16 * OPUS_RESTRICT window,
162 int overlap, int shift, int stride, int arch)
163{
164 int i;
165 int N, N2, N4;
166 VARDECL(kiss_fft_scalar, f);
167 const kiss_twiddle_scalar *trig;
168 const kiss_fft_state *st = l->kfft[shift];
169
170 N = l->n;
171 trig = l->trig;
172 for (i=0;i<shift;i++)
173 {
174 N >>= 1;
175 trig += N;
176 }
177 N2 = N>>1;
178 N4 = N>>2;
179
180 ALLOC(f, N2, kiss_fft_scalar);
181
182 /* Pre-rotate */
183 {
184 /* Temp pointers to make it really clear to the compiler what we're doing */
185 const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
186 const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
187 kiss_fft_scalar * OPUS_RESTRICT yp = f;
188 const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
189 for(i=0;i<N4;i++)
190 {
191 kiss_fft_scalar yr, yi;
192 yr = S_MUL(*xp2, t[i]) + S_MUL(*xp1, t[N4+i]);
193 yi = S_MUL(*xp1, t[i]) - S_MUL(*xp2, t[N4+i]);
194 yp[2*i] = yr;
195 yp[2*i+1] = yi;
196 xp1+=2*stride;
197 xp2-=2*stride;
198 }
199 }
200
201 opus_ifft(st, (kiss_fft_cpx *)f, (kiss_fft_cpx*)(out+(overlap>>1)), arch);
202
203 /* Post-rotate and de-shuffle from both ends of the buffer at once to make
204 it in-place. */
205 {
206 kiss_fft_scalar * yp0 = out+(overlap>>1);
207 kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
208 const kiss_twiddle_scalar *t = &trig[0];
209 /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
210 middle pair will be computed twice. */
211 for(i=0;i<(N4+1)>>1;i++)
212 {
213 kiss_fft_scalar re, im, yr, yi;
214 kiss_twiddle_scalar t0, t1;
215 re = yp0[0];
216 im = yp0[1];
217 t0 = t[i];
218 t1 = t[N4+i];
219 /* We'd scale up by 2 here, but instead it's done when mixing the windows */
220 yr = S_MUL(re,t0) + S_MUL(im,t1);
221 yi = S_MUL(re,t1) - S_MUL(im,t0);
222 re = yp1[0];
223 im = yp1[1];
224 yp0[0] = yr;
225 yp1[1] = yi;
226
227 t0 = t[(N4-i-1)];
228 t1 = t[(N2-i-1)];
229 /* We'd scale up by 2 here, but instead it's done when mixing the windows */
230 yr = S_MUL(re,t0) + S_MUL(im,t1);
231 yi = S_MUL(re,t1) - S_MUL(im,t0);
232 yp1[0] = yr;
233 yp0[1] = yi;
234 yp0 += 2;
235 yp1 -= 2;
236 }
237 }
238
239 /* Mirror on both sides for TDAC */
240 {
241 kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
242 kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
243 const opus_val16 * OPUS_RESTRICT wp1 = window;
244 const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
245
246 for(i = 0; i < overlap/2; i++)
247 {
248 kiss_fft_scalar x1, x2;
249 x1 = *xp1;
250 x2 = *yp1;
251 *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1);
252 *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1);
253 wp1++;
254 wp2--;
255 }
256 }
257 RESTORE_STACK;
258}
diff --git a/lib/rbcodec/codecs/libopus/celt/arm/celt_neon_intr.c b/lib/rbcodec/codecs/libopus/celt/arm/celt_neon_intr.c
new file mode 100644
index 0000000000..effda769d0
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/arm/celt_neon_intr.c
@@ -0,0 +1,211 @@
1/* Copyright (c) 2014-2015 Xiph.Org Foundation
2 Written by Viswanath Puttagunta */
3/**
4 @file celt_neon_intr.c
5 @brief ARM Neon Intrinsic optimizations for celt
6 */
7
8/*
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions
11 are met:
12
13 - Redistributions of source code must retain the above copyright
14 notice, this list of conditions and the following disclaimer.
15
16 - Redistributions in binary form must reproduce the above copyright
17 notice, this list of conditions and the following disclaimer in the
18 documentation and/or other materials provided with the distribution.
19
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
24 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31*/
32
33#ifdef HAVE_CONFIG_H
34#include "config.h"
35#endif
36
37#include <arm_neon.h>
38#include "../pitch.h"
39
40#if defined(FIXED_POINT)
/* Fixed-point NEON correlation kernel.
 *
 * Accumulates sum[k] += sum_j x[j]*y[j+k] for k = 0..3, i.e. four
 * correlation lags at once.  The main loop consumes 8 x samples per
 * iteration, keeping four shifted copies of the y vector (built with
 * vext) so each x lane multiplies the correctly offset y values.
 *
 * x   - first input vector (len samples)
 * y   - second input vector (len+3 samples are read)
 * sum - in/out accumulator for the four lags
 * len - number of terms; must be > 0 (asserted by the C caller)
 */
void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
{
   int j;
   int32x4_t a = vld1q_s32(sum);
   /* Load y[0...3] */
   /* This requires len>0 to always be valid (which we assert in the C code). */
   int16x4_t y0 = vld1_s16(y);
   y += 4;

   for (j = 0; j + 8 <= len; j += 8)
   {
      /* Load x[0...7] */
      int16x8_t xx = vld1q_s16(x);
      int16x4_t x0 = vget_low_s16(xx);
      int16x4_t x4 = vget_high_s16(xx);
      /* Load y[4...11] */
      int16x8_t yy = vld1q_s16(y);
      int16x4_t y4 = vget_low_s16(yy);
      int16x4_t y8 = vget_high_s16(yy);
      /* Lane 0 of each x half against the unshifted y windows. */
      int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0);
      int32x4_t a1 = vmlal_lane_s16(a0, y4, x4, 0);

      /* Shift y by one element and accumulate lane 1. */
      int16x4_t y1 = vext_s16(y0, y4, 1);
      int16x4_t y5 = vext_s16(y4, y8, 1);
      int32x4_t a2 = vmlal_lane_s16(a1, y1, x0, 1);
      int32x4_t a3 = vmlal_lane_s16(a2, y5, x4, 1);

      /* Shift by two, lane 2. */
      int16x4_t y2 = vext_s16(y0, y4, 2);
      int16x4_t y6 = vext_s16(y4, y8, 2);
      int32x4_t a4 = vmlal_lane_s16(a3, y2, x0, 2);
      int32x4_t a5 = vmlal_lane_s16(a4, y6, x4, 2);

      /* Shift by three, lane 3. */
      int16x4_t y3 = vext_s16(y0, y4, 3);
      int16x4_t y7 = vext_s16(y4, y8, 3);
      int32x4_t a6 = vmlal_lane_s16(a5, y3, x0, 3);
      int32x4_t a7 = vmlal_lane_s16(a6, y7, x4, 3);

      /* Carry the top y quad into the next iteration's y[0...3]. */
      y0 = y8;
      a = a7;
      x += 8;
      y += 8;
   }

   /* Scalar-ish tail: one x sample per iteration, sliding the y window by
      one element each time. */
   for (; j < len; j++)
   {
      int16x4_t x0 = vld1_dup_s16(x);  /* load next x */
      int32x4_t a0 = vmlal_s16(a, y0, x0);

      int16x4_t y4 = vld1_dup_s16(y);  /* load next y */
      y0 = vext_s16(y0, y4, 1);
      a = a0;
      x++;
      y++;
   }

   vst1q_s32(sum, a);
}
98
99#else
100/*
101 * Function: xcorr_kernel_neon_float
102 * ---------------------------------
103 * Computes 4 correlation values and stores them in sum[4]
104 */
/*
 * Function: xcorr_kernel_neon_float
 * ---------------------------------
 * Computes 4 correlation values and stores them in sum[4]:
 * sum[k] = sum_j x[j]*y[j+k] for k = 0..3.  Requires len > 0.
 *
 * The unrolled loops are sized carefully so that no y element beyond
 * y[len+2] is ever read; see the comments on each loop below.
 */
static void xcorr_kernel_neon_float(const float32_t *x, const float32_t *y,
                                    float32_t sum[4], int len) {
   float32x4_t YY[3];
   float32x4_t YEXT[3];
   float32x4_t XX[2];
   float32x2_t XX_2;
   float32x4_t SUMM;
   const float32_t *xi = x;
   const float32_t *yi = y;

   celt_assert(len>0);

   YY[0] = vld1q_f32(yi);
   SUMM = vdupq_n_f32(0);

   /* Consume 8 elements in x vector and 12 elements in y
    * vector. However, the 12'th element never really gets
    * touched in this loop. So, if len == 8, then we only
    * must access y[0] to y[10]. y[11] must not be accessed
    * hence make sure len > 8 and not len >= 8
    */
   while (len > 8) {
      yi += 4;
      YY[1] = vld1q_f32(yi);
      yi += 4;
      YY[2] = vld1q_f32(yi);

      XX[0] = vld1q_f32(xi);
      xi += 4;
      XX[1] = vld1q_f32(xi);
      xi += 4;

      /* Each x lane multiplies a y window shifted by that lane's offset,
         built with vext from the adjacent y quads. */
      SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
      YEXT[0] = vextq_f32(YY[0], YY[1], 1);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
      YEXT[1] = vextq_f32(YY[0], YY[1], 2);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
      YEXT[2] = vextq_f32(YY[0], YY[1], 3);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);

      SUMM = vmlaq_lane_f32(SUMM, YY[1], vget_low_f32(XX[1]), 0);
      YEXT[0] = vextq_f32(YY[1], YY[2], 1);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[1]), 1);
      YEXT[1] = vextq_f32(YY[1], YY[2], 2);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[1]), 0);
      YEXT[2] = vextq_f32(YY[1], YY[2], 3);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[1]), 1);

      YY[0] = YY[2];
      len -= 8;
   }

   /* Consume 4 elements in x vector and 8 elements in y
    * vector. However, the 8'th element in y never really gets
    * touched in this loop. So, if len == 4, then we only
    * must access y[0] to y[6]. y[7] must not be accessed
    * hence make sure len>4 and not len>=4
    */
   if (len > 4) {
      yi += 4;
      YY[1] = vld1q_f32(yi);

      XX[0] = vld1q_f32(xi);
      xi += 4;

      SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
      YEXT[0] = vextq_f32(YY[0], YY[1], 1);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
      YEXT[1] = vextq_f32(YY[0], YY[1], 2);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
      YEXT[2] = vextq_f32(YY[0], YY[1], 3);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);

      YY[0] = YY[1];
      len -= 4;
   }

   /* Scalar tail: broadcast one x at a time against the sliding y quad.
      Note --len, so this handles all but the final sample. */
   while (--len > 0) {
      XX_2 = vld1_dup_f32(xi++);
      SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
      YY[0]= vld1q_f32(++yi);
   }

   /* Final sample, done outside the loop so y is not advanced past the
      last quad that may legally be read. */
   XX_2 = vld1_dup_f32(xi);
   SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);

   vst1q_f32(sum, SUMM);
}
193
/* Float pitch cross-correlation, NEON build.
 *
 * Fills xcorr[i] = sum_j _x[j]*_y[j+i] for i = 0..max_pitch-1, using the
 * 4-lag NEON kernel for groups of four lags and the project's NEON inner
 * product for any leftover lags.
 *
 * _x        - reference vector (len samples); must be 4-byte aligned
 * _y        - search vector (len+max_pitch-1 samples)
 * xcorr     - output correlations, one per lag
 * len       - correlation length; kernel requires len > 0
 * max_pitch - number of lags to compute; must be > 0
 * arch      - unused here (this file is only built for NEON)
 */
void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
                        opus_val32 *xcorr, int len, int max_pitch, int arch) {
   int i;
   (void)arch;
   celt_assert(max_pitch > 0);
   /* Verify _x is 32-bit aligned (the pointer-diff form avoids casting a
      pointer directly to an integer type). */
   celt_sig_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);

   for (i = 0; i < (max_pitch-3); i += 4) {
      xcorr_kernel_neon_float((const float32_t *)_x, (const float32_t *)_y+i,
            (float32_t *)xcorr+i, len);
   }

   /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
   for (; i < max_pitch; i++) {
      xcorr[i] = celt_inner_prod_neon(_x, _y+i, len);
   }
}
211#endif
diff --git a/lib/rbcodec/codecs/libopus/celt/arm/celt_pitch_xcorr_arm.s b/lib/rbcodec/codecs/libopus/celt/arm/celt_pitch_xcorr_arm.s
new file mode 100644
index 0000000000..6e873afc37
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/arm/celt_pitch_xcorr_arm.s
@@ -0,0 +1,551 @@
1; Copyright (c) 2007-2008 CSIRO
2; Copyright (c) 2007-2009 Xiph.Org Foundation
3; Copyright (c) 2013 Parrot
4; Written by Aurélien Zanelli
5;
6; Redistribution and use in source and binary forms, with or without
7; modification, are permitted provided that the following conditions
8; are met:
9;
10; - Redistributions of source code must retain the above copyright
11; notice, this list of conditions and the following disclaimer.
12;
13; - Redistributions in binary form must reproduce the above copyright
14; notice, this list of conditions and the following disclaimer in the
15; documentation and/or other materials provided with the distribution.
16;
17; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
21; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
25; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29 AREA |.text|, CODE, READONLY
30
31 GET celt/arm/armopts.s
32
33IF OPUS_ARM_MAY_HAVE_EDSP
34 EXPORT celt_pitch_xcorr_edsp
35ENDIF
36
37IF OPUS_ARM_MAY_HAVE_NEON
38 EXPORT celt_pitch_xcorr_neon
39ENDIF
40
41IF OPUS_ARM_MAY_HAVE_NEON
42
43; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
; Non-standard calling convention: arguments arrive in r3/r4/r5 and q0,
; so celt_pitch_xcorr_neon below can call it in a tight loop without
; shuffling registers.  Reads len+3 y values in total.
xcorr_kernel_neon PROC
xcorr_kernel_neon_start
  ; input:
  ;   r3     = int         len
  ;   r4     = opus_val16 *x
  ;   r5     = opus_val16 *y
  ;   q0     = opus_val32  sum[4]
  ; output:
  ;   q0     = opus_val32  sum[4]
  ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
  ; internal usage:
  ;   r12 = int j
  ;   d3  = y_3|y_2|y_1|y_0
  ;   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
  ;   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
  ;   q8  = scratch
  ;
  ; Load y[0...3]
  ; This requires len>0 to always be valid (which we assert in the C code).
  VLD1.16      {d5}, [r5]!
  SUBS         r12, r3, #8
  BLE xcorr_kernel_neon_process4
; Process 8 samples at a time.
; This loop loads one y value more than we actually need.  Therefore we have to
; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
; reading past the end of the array.
xcorr_kernel_neon_process8
  ; This loop has 19 total instructions (10 cycles to issue, minimum), with
  ; - 2 cycles of ARM insrtuctions,
  ; - 10 cycles of load/store/byte permute instructions, and
  ; - 9 cycles of data processing instructions.
  ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
  ; latter two categories, meaning the whole loop should run in 10 cycles per
  ; iteration, barring cache misses.
  ;
  ; Load x[0...7]
  VLD1.16      {d6, d7}, [r4]!
  ; Unlike VMOV, VAND is a data processsing instruction (and doesn't get
  ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
  VAND         d3, d5, d5
  SUBS         r12, r12, #8
  ; Load y[4...11]
  VLD1.16      {d4, d5}, [r5]!
  VMLAL.S16    q0, d3, d6[0]
  VEXT.16      d16, d3, d4, #1
  VMLAL.S16    q0, d4, d7[0]
  VEXT.16      d17, d4, d5, #1
  VMLAL.S16    q0, d16, d6[1]
  VEXT.16      d16, d3, d4, #2
  VMLAL.S16    q0, d17, d7[1]
  VEXT.16      d17, d4, d5, #2
  VMLAL.S16    q0, d16, d6[2]
  VEXT.16      d16, d3, d4, #3
  VMLAL.S16    q0, d17, d7[2]
  VEXT.16      d17, d4, d5, #3
  VMLAL.S16    q0, d16, d6[3]
  VMLAL.S16    q0, d17, d7[3]
  BGT xcorr_kernel_neon_process8
; Process 4 samples here if we have > 4 left (still reading one extra y value).
xcorr_kernel_neon_process4
  ADDS         r12, r12, #4
  BLE xcorr_kernel_neon_process2
  ; Load x[0...3]
  VLD1.16      d6, [r4]!
  ; Use VAND since it's a data processing instruction again.
  VAND         d4, d5, d5
  SUB          r12, r12, #4
  ; Load y[4...7]
  VLD1.16      d5, [r5]!
  VMLAL.S16    q0, d4, d6[0]
  VEXT.16      d16, d4, d5, #1
  VMLAL.S16    q0, d16, d6[1]
  VEXT.16      d16, d4, d5, #2
  VMLAL.S16    q0, d16, d6[2]
  VEXT.16      d16, d4, d5, #3
  VMLAL.S16    q0, d16, d6[3]
; Process 2 samples here if we have > 2 left (still reading one extra y value).
xcorr_kernel_neon_process2
  ADDS         r12, r12, #2
  BLE xcorr_kernel_neon_process1
  ; Load x[0...1]
  VLD2.16      {d6[],d7[]}, [r4]!
  ; Use VAND since it's a data processing instruction again.
  VAND         d4, d5, d5
  SUB          r12, r12, #2
  ; Load y[4...5]
  VLD1.32      {d5[]}, [r5]!
  VMLAL.S16    q0, d4, d6
  VEXT.16      d16, d4, d5, #1
  ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
  ; instead of VEXT, since it's a data-processing instruction.
  VSRI.64      d5, d4, #32
  VMLAL.S16    q0, d16, d7
; Process 1 sample using the extra y value we loaded above.
xcorr_kernel_neon_process1
  ; Load next *x
  VLD1.16      {d6[]}, [r4]!
  ADDS         r12, r12, #1
  ; y[0...3] are left in d5 from prior iteration(s) (if any)
  VMLAL.S16    q0, d5, d6
  ; Return here if no samples remain (no y over-read happened on this path).
  MOVLE        pc, lr
; Now process 1 last sample, not reading ahead.
  ; Load last *y
  VLD1.16      {d4[]}, [r5]!
  ; Shift the last y into the window alongside the three previous values.
  VSRI.64      d4, d5, #16
  ; Load last *x
  VLD1.16      {d6[]}, [r4]!
  VMLAL.S16    q0, d4, d6
  MOV          pc, lr
  ENDP
154
155; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
156; opus_val32 *xcorr, int len, int max_pitch, int arch)
; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
;  opus_val32 *xcorr, int len, int max_pitch, int arch)
; Computes xcorr[i] = sum(x[j]*y[j+i]) for i = 0...max_pitch-1 and returns
; the maximum correlation value (at least 1).  Lags are processed four at a
; time via xcorr_kernel_neon, then one at a time for the remainder.
celt_pitch_xcorr_neon PROC
  ; input:
  ;   r0  = opus_val16 *_x
  ;   r1  = opus_val16 *_y
  ;   r2  = opus_val32 *xcorr
  ;   r3  = int         len
  ; output:
  ;   r0  = int         maxcorr
  ; internal usage:
  ;   r4  = opus_val16 *x (for xcorr_kernel_neon())
  ;   r5  = opus_val16 *y (for xcorr_kernel_neon())
  ;   r6  = int         max_pitch
  ;   r12 = int         j
  ;   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
  ; ignored:
  ;           int       arch
  STMFD        sp!, {r4-r6, lr}
  ; Fifth argument (max_pitch) comes from the stack, above the 4 pushed regs.
  LDR          r6, [sp, #16]
  ; maxcorr starts at 1 in every lane, matching the C fallback's floor.
  VMOV.S32     q15, #1
  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
  SUBS         r6, r6, #4
  BLT celt_pitch_xcorr_neon_process4_done
celt_pitch_xcorr_neon_process4
  ; xcorr_kernel_neon parameters:
  ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
  MOV          r4, r0
  MOV          r5, r1
  VEOR         q0, q0, q0
  ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
  ; So we don't save/restore any other registers.
  BL xcorr_kernel_neon_start
  SUBS         r6, r6, #4
  VST1.32      {q0}, [r2]!
  ; _y += 4
  ADD          r1, r1, #8
  VMAX.S32     q15, q15, q0
  ; if (max_pitch >= 4) goto celt_pitch_xcorr_neon_process4
  BGE celt_pitch_xcorr_neon_process4
; We have less than 4 sums left to compute.
celt_pitch_xcorr_neon_process4_done
  ADDS         r6, r6, #4
  ; Reduce maxcorr to a single value
  VMAX.S32     d30, d30, d31
  VPMAX.S32    d30, d30, d30
  ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
  BLE celt_pitch_xcorr_neon_done
; Now compute each remaining sum one at a time.
celt_pitch_xcorr_neon_process_remaining
  MOV          r4, r0
  MOV          r5, r1
  VMOV.I32     q0, #0
  SUBS         r12, r3, #8
  BLT celt_pitch_xcorr_neon_process_remaining4
; Sum terms 8 at a time.
celt_pitch_xcorr_neon_process_remaining_loop8
  ; Load x[0...7]
  VLD1.16      {q1}, [r4]!
  ; Load y[0...7]
  VLD1.16      {q2}, [r5]!
  SUBS         r12, r12, #8
  VMLAL.S16    q0, d4, d2
  VMLAL.S16    q0, d5, d3
  BGE celt_pitch_xcorr_neon_process_remaining_loop8
; Sum terms 4 at a time.
celt_pitch_xcorr_neon_process_remaining4
  ADDS         r12, r12, #4
  BLT celt_pitch_xcorr_neon_process_remaining4_done
  ; Load x[0...3]
  VLD1.16      {d2}, [r4]!
  ; Load y[0...3]
  VLD1.16      {d3}, [r5]!
  SUB          r12, r12, #4
  VMLAL.S16    q0, d3, d2
celt_pitch_xcorr_neon_process_remaining4_done
  ; Reduce the sum to a single value.
  VADD.S32     d0, d0, d1
  VPADDL.S32   d0, d0
  ADDS         r12, r12, #4
  BLE celt_pitch_xcorr_neon_process_remaining_loop_done
; Sum terms 1 at a time.
celt_pitch_xcorr_neon_process_remaining_loop1
  VLD1.16      {d2[]}, [r4]!
  VLD1.16      {d3[]}, [r5]!
  SUBS         r12, r12, #1
  VMLAL.S16    q0, d2, d3
  BGT celt_pitch_xcorr_neon_process_remaining_loop1
celt_pitch_xcorr_neon_process_remaining_loop_done
  VST1.32      {d0[0]}, [r2]!
  VMAX.S32     d30, d30, d0
  SUBS         r6, r6, #1
  ; _y++
  ADD          r1, r1, #2
  ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
  BGT celt_pitch_xcorr_neon_process_remaining
celt_pitch_xcorr_neon_done
  ; Return the scalar maxcorr accumulated in lane 0 of d30.
  VMOV.32      r0, d30[0]
  LDMFD        sp!, {r4-r6, pc}
  ENDP
255
256ENDIF
257
258IF OPUS_ARM_MAY_HAVE_EDSP
259
260; This will get used on ARMv7 devices without NEON, so it has been optimized
261; to take advantage of dual-issuing where possible.
; This will get used on ARMv7 devices without NEON, so it has been optimized
; to take advantage of dual-issuing where possible.
; Computes sum[k] += sum(x[j]*y[j+k], j=0...len-1) for k=0...3 using the
; EDSP SMLAxy 16x16+32 multiply-accumulate instructions.  Like the NEON
; kernel, it uses a custom calling convention (args in r3-r5/r6-r9).
xcorr_kernel_edsp PROC
xcorr_kernel_edsp_start
  ; input:
  ;   r3      = int         len
  ;   r4      = opus_val16 *_x (must be 32-bit aligned)
  ;   r5      = opus_val16 *_y (must be 32-bit aligned)
  ;   r6...r9 = opus_val32  sum[4]
  ; output:
  ;   r6...r9 = opus_val32  sum[4]
  ; preserved: r0-r5
  ; internal usage
  ;   r2      = int         j
  ;   r12,r14 = opus_val16  x[4]
  ;   r10,r11 = opus_val16  y[4]
  STMFD        sp!, {r2,r4,r5,lr}
  LDR          r10, [r5], #4      ; Load y[0...1]
  SUBS         r2, r3, #4         ; j = len-4
  LDR          r11, [r5], #4      ; Load y[2...3]
  BLE xcorr_kernel_edsp_process4_done
  LDR          r12, [r4], #4      ; Load x[0...1]
  ; Stall
xcorr_kernel_edsp_process4
  ; The multiplies must issue from pipeline 0, and can't dual-issue with each
  ; other. Every other instruction here dual-issues with a multiply, and is
  ; thus "free". There should be no stalls in the body of the loop.
  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_0,y_0)
  LDR          r14, [r4], #4      ; Load x[2...3]
  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x_0,y_1)
  SUBS         r2, r2, #4         ; j-=4
  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_0,y_2)
  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x_0,y_3)
  SMLATT       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_1,y_1)
  LDR          r10, [r5], #4      ; Load y[4...5]
  SMLATB       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],x_1,y_2)
  SMLATT      r8, r12, r11, r8    ; sum[2] = MAC16_16(sum[2],x_1,y_3)
  SMLATB       r9, r12, r10, r9   ; sum[3] = MAC16_16(sum[3],x_1,y_4)
  LDRGT        r12, [r4], #4      ; Load x[0...1]
  SMLABB       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_2,y_2)
  SMLABT       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x_2,y_3)
  SMLABB       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_2,y_4)
  SMLABT       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x_2,y_5)
  SMLATT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_3,y_3)
  LDR          r11, [r5], #4      ; Load y[6...7]
  SMLATB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],x_3,y_4)
  SMLATT       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_3,y_5)
  SMLATB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],x_3,y_6)
  BGT xcorr_kernel_edsp_process4
xcorr_kernel_edsp_process4_done
  ; Tail: up to 3 remaining samples, peeled one at a time.  y[0...3] for
  ; the current position are already live in r10/r11 from the loop above.
  ADDS         r2, r2, #4
  BLE xcorr_kernel_edsp_done
  LDRH         r12, [r4], #2      ; r12 = *x++
  SUBS         r2, r2, #1         ; j--
  ; Stall
  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_0)
  LDRHGT       r14, [r4], #2      ; r14 = *x++
  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x,y_1)
  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_2)
  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x,y_3)
  BLE xcorr_kernel_edsp_done
  SMLABT       r6, r14, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_1)
  SUBS         r2, r2, #1         ; j--
  SMLABB       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x,y_2)
  LDRH         r10, [r5], #2      ; r10 = y_4 = *y++
  SMLABT       r8, r14, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_3)
  LDRHGT       r12, [r4], #2      ; r12 = *x++
  SMLABB       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x,y_4)
  BLE xcorr_kernel_edsp_done
  SMLABB       r6, r12, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_2)
  CMP          r2, #1             ; j--
  SMLABT       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_3)
  LDRH         r2, [r5], #2       ; r2 = y_5 = *y++
  SMLABB       r8, r12, r10, r8   ; sum[2] = MAC16_16(sum[2],tmp,y_4)
  LDRHGT       r14, [r4]          ; r14 = *x
  SMLABB       r9, r12, r2, r9    ; sum[3] = MAC16_16(sum[3],tmp,y_5)
  BLE xcorr_kernel_edsp_done
  SMLABT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_3)
  LDRH         r11, [r5]          ; r11 = y_6 = *y
  SMLABB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_4)
  SMLABB       r8, r14, r2, r8    ; sum[2] = MAC16_16(sum[2],tmp,y_5)
  SMLABB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],tmp,y_6)
xcorr_kernel_edsp_done
  LDMFD        sp!, {r2,r4,r5,pc}
  ENDP
345
; ARMv5E/EDSP version of celt_pitch_xcorr for cores without NEON.
; Computes xcorr[i] = sum(x[j]*y[j+i]) and returns maxcorr (at least 1).
; _x must be 32-bit aligned; if _y is not, one lag is computed up front so
; the remaining passes can use word loads on y as well.
celt_pitch_xcorr_edsp PROC
  ; input:
  ;   r0      = opus_val16 *_x (must be 32-bit aligned)
  ;   r1      = opus_val16 *_y (only needs to be 16-bit aligned)
  ;   r2      = opus_val32 *xcorr
  ;   r3      = int         len
  ; output:
  ;   r0      = maxcorr
  ; internal usage
  ;   r4      = opus_val16 *x
  ;   r5      = opus_val16 *y
  ;   r6      = opus_val32  sum0
  ;   r7      = opus_val32  sum1
  ;   r8      = opus_val32  sum2
  ;   r9      = opus_val32  sum3
  ;   r1      = int         max_pitch
  ;   r12     = int         j
  ; ignored:
  ;           int arch
  STMFD        sp!, {r4-r11, lr}
  MOV          r5, r1
  ; Fifth argument (max_pitch) sits above the 9 pushed registers.
  LDR          r1, [sp, #36]
  MOV          r4, r0
  ; Check whether _y is 32-bit aligned.
  TST          r5, #3
  ; maxcorr = 1
  MOV          r0, #1
  BEQ celt_pitch_xcorr_edsp_process1u_done
; Compute one sum at the start to make y 32-bit aligned.
  SUBS         r12, r3, #4
  ; r14 = sum = 0
  MOV          r14, #0
  LDRH         r8, [r5], #2
  BLE celt_pitch_xcorr_edsp_process1u_loop4_done
  LDR          r6, [r4], #4
  ; Keep the first y value in the top half of r8 so SMLAxT can reach it.
  MOV          r8, r8, LSL #16
celt_pitch_xcorr_edsp_process1u_loop4
  LDR          r9, [r5], #4
  SMLABT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
  LDR          r7, [r4], #4
  SMLATB       r14, r6, r9, r14     ; sum = MAC16_16(sum, x_1, y_1)
  LDR          r8, [r5], #4
  SMLABT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
  SUBS         r12, r12, #4         ; j-=4
  SMLATB       r14, r7, r8, r14     ; sum = MAC16_16(sum, x_3, y_3)
  LDRGT        r6, [r4], #4
  BGT celt_pitch_xcorr_edsp_process1u_loop4
  MOV          r8, r8, LSR #16
celt_pitch_xcorr_edsp_process1u_loop4_done
  ADDS         r12, r12, #4
celt_pitch_xcorr_edsp_process1u_loop1
  LDRHGE       r6, [r4], #2
  ; Stall
  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, *x, *y)
  SUBSGE       r12, r12, #1
  LDRHGT       r8, [r5], #2
  BGT celt_pitch_xcorr_edsp_process1u_loop1
  ; Restore _x
  SUB          r4, r4, r3, LSL #1
  ; Restore and advance _y
  SUB          r5, r5, r3, LSL #1
  ; maxcorr = max(maxcorr, sum)
  CMP          r0, r14
  ADD          r5, r5, #2
  MOVLT        r0, r14
  SUBS         r1, r1, #1
  ; xcorr[i] = sum
  STR          r14, [r2], #4
  BLE celt_pitch_xcorr_edsp_done
celt_pitch_xcorr_edsp_process1u_done
  ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
  SUBS         r1, r1, #4
  BLT celt_pitch_xcorr_edsp_process2
celt_pitch_xcorr_edsp_process4
  ; xcorr_kernel_edsp parameters:
  ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
  MOV          r6, #0
  MOV          r7, #0
  MOV          r8, #0
  MOV          r9, #0
  BL xcorr_kernel_edsp_start  ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
  ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
  CMP          r0, r6
  ; _y+=4
  ADD          r5, r5, #8
  MOVLT        r0, r6
  CMP          r0, r7
  MOVLT        r0, r7
  CMP          r0, r8
  MOVLT        r0, r8
  CMP          r0, r9
  MOVLT        r0, r9
  STMIA        r2!, {r6-r9}
  SUBS         r1, r1, #4
  BGE celt_pitch_xcorr_edsp_process4
; Handle two remaining lags together (y is still word-aligned here).
celt_pitch_xcorr_edsp_process2
  ADDS         r1, r1, #2
  BLT celt_pitch_xcorr_edsp_process1a
  SUBS         r12, r3, #4
  ; {r10, r11} = {sum0, sum1} = {0, 0}
  MOV          r10, #0
  MOV          r11, #0
  LDR          r8, [r5], #4
  BLE celt_pitch_xcorr_edsp_process2_loop_done
  LDR          r6, [r4], #4
  LDR          r9, [r5], #4
celt_pitch_xcorr_edsp_process2_loop4
  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDR          r7, [r4], #4
  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
  SUBS         r12, r12, #4         ; j-=4
  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
  LDR          r8, [r5], #4
  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
  LDRGT        r6, [r4], #4
  SMLABB       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_2, y_2)
  SMLABT       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_2, y_3)
  SMLATT       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_3, y_3)
  LDRGT        r9, [r5], #4
  SMLATB       r11, r7, r8, r11     ; sum1 = MAC16_16(sum1, x_3, y_4)
  BGT celt_pitch_xcorr_edsp_process2_loop4
celt_pitch_xcorr_edsp_process2_loop_done
  ADDS         r12, r12, #2
  BLE celt_pitch_xcorr_edsp_process2_1
  LDR          r6, [r4], #4
  ; Stall
  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDR          r9, [r5], #4
  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
  SUB          r12, r12, #2
  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
  MOV          r8, r9
  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
celt_pitch_xcorr_edsp_process2_1
  LDRH         r6, [r4], #2
  ADDS         r12, r12, #1
  ; Stall
  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDRHGT       r7, [r4], #2
  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
  BLE celt_pitch_xcorr_edsp_process2_done
  LDRH         r9, [r5], #2
  SMLABT       r10, r7, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_1)
  SMLABB       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_0, y_2)
celt_pitch_xcorr_edsp_process2_done
  ; Restore _x
  SUB          r4, r4, r3, LSL #1
  ; Restore and advance _y
  SUB          r5, r5, r3, LSL #1
  ; maxcorr = max(maxcorr, sum0)
  CMP          r0, r10
  ADD          r5, r5, #2
  MOVLT        r0, r10
  SUB          r1, r1, #2
  ; maxcorr = max(maxcorr, sum1)
  CMP          r0, r11
  ; xcorr[i] = sum
  STR          r10, [r2], #4
  MOVLT        r0, r11
  STR          r11, [r2], #4
; Handle one final lag, if max_pitch was odd.
celt_pitch_xcorr_edsp_process1a
  ADDS         r1, r1, #1
  BLT celt_pitch_xcorr_edsp_done
  SUBS         r12, r3, #4
  ; r14 = sum = 0
  MOV          r14, #0
  BLT celt_pitch_xcorr_edsp_process1a_loop_done
  LDR          r6, [r4], #4
  LDR          r8, [r5], #4
  LDR          r7, [r4], #4
  LDR          r9, [r5], #4
celt_pitch_xcorr_edsp_process1a_loop4
  SMLABB       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
  SUBS         r12, r12, #4         ; j-=4
  SMLATT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
  LDRGE        r6, [r4], #4
  SMLABB       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
  LDRGE        r8, [r5], #4
  SMLATT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_3, y_3)
  LDRGE        r7, [r4], #4
  LDRGE        r9, [r5], #4
  BGE celt_pitch_xcorr_edsp_process1a_loop4
celt_pitch_xcorr_edsp_process1a_loop_done
  ADDS         r12, r12, #2
  LDRGE        r6, [r4], #4
  LDRGE        r8, [r5], #4
  ; Stall
  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
  SUBGE        r12, r12, #2
  SMLATTGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
  ADDS         r12, r12, #1
  LDRHGE       r6, [r4], #2
  LDRHGE       r8, [r5], #2
  ; Stall
  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, *x, *y)
  ; maxcorr = max(maxcorr, sum)
  CMP          r0, r14
  ; xcorr[i] = sum
  STR          r14, [r2], #4
  MOVLT        r0, r14
celt_pitch_xcorr_edsp_done
  LDMFD        sp!, {r4-r11, pc}
  ENDP
548
549ENDIF
550
551END
diff --git a/lib/rbcodec/codecs/libopus/celt/arm/fft_arm.h b/lib/rbcodec/codecs/libopus/celt/arm/fft_arm.h
new file mode 100644
index 0000000000..0b78175f3a
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/arm/fft_arm.h
@@ -0,0 +1,71 @@
1/* Copyright (c) 2015 Xiph.Org Foundation
2 Written by Viswanath Puttagunta */
3/**
4 @file fft_arm.h
5 @brief ARM Neon Intrinsic optimizations for fft using NE10 library
6 */
7
8/*
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions
11 are met:
12
13 - Redistributions of source code must retain the above copyright
14 notice, this list of conditions and the following disclaimer.
15
16 - Redistributions in binary form must reproduce the above copyright
17 notice, this list of conditions and the following disclaimer in the
18 documentation and/or other materials provided with the distribution.
19
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
24 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31*/
32
33
34#if !defined(FFT_ARM_H)
35#define FFT_ARM_H
36
37#include "kiss_fft.h"
38
39#if defined(HAVE_ARM_NE10)
40
41int opus_fft_alloc_arm_neon(kiss_fft_state *st);
42void opus_fft_free_arm_neon(kiss_fft_state *st);
43
44void opus_fft_neon(const kiss_fft_state *st,
45 const kiss_fft_cpx *fin,
46 kiss_fft_cpx *fout);
47
48void opus_ifft_neon(const kiss_fft_state *st,
49 const kiss_fft_cpx *fin,
50 kiss_fft_cpx *fout);
51
52#if !defined(OPUS_HAVE_RTCD)
53#define OVERRIDE_OPUS_FFT (1)
54
55#define opus_fft_alloc_arch(_st, arch) \
56 ((void)(arch), opus_fft_alloc_arm_neon(_st))
57
58#define opus_fft_free_arch(_st, arch) \
59 ((void)(arch), opus_fft_free_arm_neon(_st))
60
61#define opus_fft(_st, _fin, _fout, arch) \
62 ((void)(arch), opus_fft_neon(_st, _fin, _fout))
63
64#define opus_ifft(_st, _fin, _fout, arch) \
65 ((void)(arch), opus_ifft_neon(_st, _fin, _fout))
66
67#endif /* OPUS_HAVE_RTCD */
68
69#endif /* HAVE_ARM_NE10 */
70
71#endif
diff --git a/lib/rbcodec/codecs/libopus/celt/arm/fixed_arm64.h b/lib/rbcodec/codecs/libopus/celt/arm/fixed_arm64.h
new file mode 100644
index 0000000000..c6fbd3db2c
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/arm/fixed_arm64.h
@@ -0,0 +1,35 @@
1/* Copyright (C) 2015 Vidyo */
2/*
3 Redistribution and use in source and binary forms, with or without
4 modification, are permitted provided that the following conditions
5 are met:
6
7 - Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9
10 - Redistributions in binary form must reproduce the above copyright
11 notice, this list of conditions and the following disclaimer in the
12 documentation and/or other materials provided with the distribution.
13
14 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
18 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
22 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*/
26
27#ifndef FIXED_ARM64_H
28#define FIXED_ARM64_H
29
30#include <arm_neon.h>
31
32#undef SIG2WORD16
33#define SIG2WORD16(x) (vqmovns_s32(PSHR32((x), SIG_SHIFT)))
34
35#endif
diff --git a/lib/rbcodec/codecs/libopus/celt/arm/fixed_armv4.h b/lib/rbcodec/codecs/libopus/celt/arm/fixed_armv4.h
index efb3b1896a..d84888a772 100644
--- a/lib/rbcodec/codecs/libopus/celt/arm/fixed_armv4.h
+++ b/lib/rbcodec/codecs/libopus/celt/arm/fixed_armv4.h
@@ -37,7 +37,7 @@ static OPUS_INLINE opus_val32 MULT16_32_Q16_armv4(opus_val16 a, opus_val32 b)
37 "#MULT16_32_Q16\n\t" 37 "#MULT16_32_Q16\n\t"
38 "smull %0, %1, %2, %3\n\t" 38 "smull %0, %1, %2, %3\n\t"
39 : "=&r"(rd_lo), "=&r"(rd_hi) 39 : "=&r"(rd_lo), "=&r"(rd_hi)
40 : "%r"(b),"r"(a<<16) 40 : "%r"(b),"r"(SHL32(a,16))
41 ); 41 );
42 return rd_hi; 42 return rd_hi;
43} 43}
@@ -54,10 +54,10 @@ static OPUS_INLINE opus_val32 MULT16_32_Q15_armv4(opus_val16 a, opus_val32 b)
54 "#MULT16_32_Q15\n\t" 54 "#MULT16_32_Q15\n\t"
55 "smull %0, %1, %2, %3\n\t" 55 "smull %0, %1, %2, %3\n\t"
56 : "=&r"(rd_lo), "=&r"(rd_hi) 56 : "=&r"(rd_lo), "=&r"(rd_hi)
57 : "%r"(b), "r"(a<<16) 57 : "%r"(b), "r"(SHL32(a,16))
58 ); 58 );
59 /*We intentionally don't OR in the high bit of rd_lo for speed.*/ 59 /*We intentionally don't OR in the high bit of rd_lo for speed.*/
60 return rd_hi<<1; 60 return SHL32(rd_hi,1);
61} 61}
62#define MULT16_32_Q15(a, b) (MULT16_32_Q15_armv4(a, b)) 62#define MULT16_32_Q15(a, b) (MULT16_32_Q15_armv4(a, b))
63 63
diff --git a/lib/rbcodec/codecs/libopus/celt/arm/fixed_armv5e.h b/lib/rbcodec/codecs/libopus/celt/arm/fixed_armv5e.h
index 36a6321101..6bf73cbace 100644
--- a/lib/rbcodec/codecs/libopus/celt/arm/fixed_armv5e.h
+++ b/lib/rbcodec/codecs/libopus/celt/arm/fixed_armv5e.h
@@ -59,7 +59,7 @@ static OPUS_INLINE opus_val32 MULT16_32_Q15_armv5e(opus_val16 a, opus_val32 b)
59 : "=r"(res) 59 : "=r"(res)
60 : "r"(b), "r"(a) 60 : "r"(b), "r"(a)
61 ); 61 );
62 return res<<1; 62 return SHL32(res,1);
63} 63}
64#define MULT16_32_Q15(a, b) (MULT16_32_Q15_armv5e(a, b)) 64#define MULT16_32_Q15(a, b) (MULT16_32_Q15_armv5e(a, b))
65 65
@@ -76,7 +76,7 @@ static OPUS_INLINE opus_val32 MAC16_32_Q15_armv5e(opus_val32 c, opus_val16 a,
76 "#MAC16_32_Q15\n\t" 76 "#MAC16_32_Q15\n\t"
77 "smlawb %0, %1, %2, %3;\n" 77 "smlawb %0, %1, %2, %3;\n"
78 : "=r"(res) 78 : "=r"(res)
79 : "r"(b<<1), "r"(a), "r"(c) 79 : "r"(SHL32(b,1)), "r"(a), "r"(c)
80 ); 80 );
81 return res; 81 return res;
82} 82}
diff --git a/lib/rbcodec/codecs/libopus/celt/arm/mdct_arm.h b/lib/rbcodec/codecs/libopus/celt/arm/mdct_arm.h
new file mode 100644
index 0000000000..14200bac4b
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/arm/mdct_arm.h
@@ -0,0 +1,59 @@
1/* Copyright (c) 2015 Xiph.Org Foundation
2 Written by Viswanath Puttagunta */
3/**
4 @file arm_mdct.h
5 @brief ARM Neon Intrinsic optimizations for mdct using NE10 library
6 */
7
8/*
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions
11 are met:
12
13 - Redistributions of source code must retain the above copyright
14 notice, this list of conditions and the following disclaimer.
15
16 - Redistributions in binary form must reproduce the above copyright
17 notice, this list of conditions and the following disclaimer in the
18 documentation and/or other materials provided with the distribution.
19
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
24 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31*/
32
33#if !defined(MDCT_ARM_H)
34#define MDCT_ARM_H
35
36#include "mdct.h"
37
38#if defined(HAVE_ARM_NE10)
39/** Compute a forward MDCT and scale by 4/N, trashes the input array */
40void clt_mdct_forward_neon(const mdct_lookup *l, kiss_fft_scalar *in,
41 kiss_fft_scalar * OPUS_RESTRICT out,
42 const opus_val16 *window, int overlap,
43 int shift, int stride, int arch);
44
45void clt_mdct_backward_neon(const mdct_lookup *l, kiss_fft_scalar *in,
46 kiss_fft_scalar * OPUS_RESTRICT out,
47 const opus_val16 *window, int overlap,
48 int shift, int stride, int arch);
49
50#if !defined(OPUS_HAVE_RTCD)
51#define OVERRIDE_OPUS_MDCT (1)
52#define clt_mdct_forward(_l, _in, _out, _window, _int, _shift, _stride, _arch) \
53 clt_mdct_forward_neon(_l, _in, _out, _window, _int, _shift, _stride, _arch)
54#define clt_mdct_backward(_l, _in, _out, _window, _int, _shift, _stride, _arch) \
55 clt_mdct_backward_neon(_l, _in, _out, _window, _int, _shift, _stride, _arch)
56#endif /* OPUS_HAVE_RTCD */
57#endif /* HAVE_ARM_NE10 */
58
59#endif
diff --git a/lib/rbcodec/codecs/libopus/celt/arm/pitch_arm.h b/lib/rbcodec/codecs/libopus/celt/arm/pitch_arm.h
new file mode 100644
index 0000000000..bed8b04eac
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/arm/pitch_arm.h
@@ -0,0 +1,160 @@
1/* Copyright (c) 2010 Xiph.Org Foundation
2 * Copyright (c) 2013 Parrot */
3/*
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*/
27
28#if !defined(PITCH_ARM_H)
29# define PITCH_ARM_H
30
31# include "armcpu.h"
32
33# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
34opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N);
35void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01,
36 const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2);
37
38# if !defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_PRESUME_NEON)
39# define OVERRIDE_CELT_INNER_PROD (1)
40# define OVERRIDE_DUAL_INNER_PROD (1)
41# define celt_inner_prod(x, y, N, arch) ((void)(arch), PRESUME_NEON(celt_inner_prod)(x, y, N))
42# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), PRESUME_NEON(dual_inner_prod)(x, y01, y02, N, xy1, xy2))
43# endif
44# endif
45
46# if !defined(OVERRIDE_CELT_INNER_PROD)
47# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
48extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N);
49# define OVERRIDE_CELT_INNER_PROD (1)
50# define celt_inner_prod(x, y, N, arch) ((*CELT_INNER_PROD_IMPL[(arch)&OPUS_ARCHMASK])(x, y, N))
51# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
52# define OVERRIDE_CELT_INNER_PROD (1)
53# define celt_inner_prod(x, y, N, arch) ((void)(arch), celt_inner_prod_neon(x, y, N))
54# endif
55# endif
56
57# if !defined(OVERRIDE_DUAL_INNER_PROD)
58# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
59extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x,
60 const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2);
61# define OVERRIDE_DUAL_INNER_PROD (1)
62# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((*DUAL_INNER_PROD_IMPL[(arch)&OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2))
63# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
64# define OVERRIDE_DUAL_INNER_PROD (1)
65# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), dual_inner_prod_neon(x, y01, y02, N, xy1, xy2))
66# endif
67# endif
68
69# if defined(FIXED_POINT)
70
71# if defined(OPUS_ARM_MAY_HAVE_NEON)
72opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y,
73 opus_val32 *xcorr, int len, int max_pitch, int arch);
74# endif
75
76# if defined(OPUS_ARM_MAY_HAVE_MEDIA)
77# define celt_pitch_xcorr_media MAY_HAVE_EDSP(celt_pitch_xcorr)
78# endif
79
80# if defined(OPUS_ARM_MAY_HAVE_EDSP)
81opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y,
82 opus_val32 *xcorr, int len, int max_pitch, int arch);
83# endif
84
85# if defined(OPUS_HAVE_RTCD) && \
86 ((defined(OPUS_ARM_MAY_HAVE_NEON) && !defined(OPUS_ARM_PRESUME_NEON)) || \
87 (defined(OPUS_ARM_MAY_HAVE_MEDIA) && !defined(OPUS_ARM_PRESUME_MEDIA)) || \
88 (defined(OPUS_ARM_MAY_HAVE_EDSP) && !defined(OPUS_ARM_PRESUME_EDSP)))
89extern opus_val32
90(*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
91 const opus_val16 *, opus_val32 *, int, int, int);
92# define OVERRIDE_PITCH_XCORR (1)
93# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
94 ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
95 xcorr, len, max_pitch, arch))
96
97# elif defined(OPUS_ARM_PRESUME_EDSP) || \
98 defined(OPUS_ARM_PRESUME_MEDIA) || \
99 defined(OPUS_ARM_PRESUME_NEON)
100# define OVERRIDE_PITCH_XCORR (1)
101# define celt_pitch_xcorr (PRESUME_NEON(celt_pitch_xcorr))
102
103# endif
104
105# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
106void xcorr_kernel_neon_fixed(
107 const opus_val16 *x,
108 const opus_val16 *y,
109 opus_val32 sum[4],
110 int len);
111# endif
112
113# if defined(OPUS_HAVE_RTCD) && \
114 (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
115
116extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
117 const opus_val16 *x,
118 const opus_val16 *y,
119 opus_val32 sum[4],
120 int len);
121
122# define OVERRIDE_XCORR_KERNEL (1)
123# define xcorr_kernel(x, y, sum, len, arch) \
124 ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len))
125
126# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
127# define OVERRIDE_XCORR_KERNEL (1)
128# define xcorr_kernel(x, y, sum, len, arch) \
129 ((void)arch, xcorr_kernel_neon_fixed(x, y, sum, len))
130
131# endif
132
133#else /* Start !FIXED_POINT */
134/* Float case */
135#if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
136void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
137 opus_val32 *xcorr, int len, int max_pitch, int arch);
138#endif
139
140# if defined(OPUS_HAVE_RTCD) && \
141 (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
142extern void
143(*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
144 const opus_val16 *, opus_val32 *, int, int, int);
145
146# define OVERRIDE_PITCH_XCORR (1)
147# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
148 ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
149 xcorr, len, max_pitch, arch))
150
151# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
152
153# define OVERRIDE_PITCH_XCORR (1)
154# define celt_pitch_xcorr celt_pitch_xcorr_float_neon
155
156# endif
157
158#endif /* end !FIXED_POINT */
159
160#endif
diff --git a/lib/rbcodec/codecs/libopus/celt/arm/pitch_neon_intr.c b/lib/rbcodec/codecs/libopus/celt/arm/pitch_neon_intr.c
new file mode 100644
index 0000000000..1ac38c433a
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/arm/pitch_neon_intr.c
@@ -0,0 +1,290 @@
1/***********************************************************************
2Copyright (c) 2017 Google Inc.
3Redistribution and use in source and binary forms, with or without
4modification, are permitted provided that the following conditions
5are met:
6- Redistributions of source code must retain the above copyright notice,
7this list of conditions and the following disclaimer.
8- Redistributions in binary form must reproduce the above copyright
9notice, this list of conditions and the following disclaimer in the
10documentation and/or other materials provided with the distribution.
11- Neither the name of Internet Society, IETF or IETF Trust, nor the
12names of specific contributors, may be used to endorse or promote
13products derived from this software without specific prior written
14permission.
15THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25POSSIBILITY OF SUCH DAMAGE.
26***********************************************************************/
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
32#include <arm_neon.h>
33#include "pitch.h"
34
35#ifdef FIXED_POINT
36
/* NEON inner product of two Q15 vectors (fixed-point build).
 * Accumulates eight products per iteration into a 32-bit SIMD accumulator,
 * folds the lanes down, and finishes the last few samples in scalar code.
 * Result is intended to be bit-exact with celt_inner_prod_c(). */
opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
{
    int i = 0;
    opus_val32 xy;
    int32x4_t acc_s32x4 = vdupq_n_s32(0);
    int64x2_t acc_s64x2;
    int64x1_t acc_s64x1;

    /* Main loop: 8 MACs per iteration. */
    while (i < N - 7) {
        const int16x8_t x_vec = vld1q_s16(&x[i]);
        const int16x8_t y_vec = vld1q_s16(&y[i]);
        acc_s32x4 = vmlal_s16(acc_s32x4, vget_low_s16(x_vec),  vget_low_s16(y_vec));
        acc_s32x4 = vmlal_s16(acc_s32x4, vget_high_s16(x_vec), vget_high_s16(y_vec));
        i += 8;
    }

    /* One 4-wide step if at least four samples remain. */
    if (N - i >= 4) {
        acc_s32x4 = vmlal_s16(acc_s32x4, vld1_s16(&x[i]), vld1_s16(&y[i]));
        i += 4;
    }

    /* Horizontal reduction of the four 32-bit partial sums. */
    acc_s64x2 = vpaddlq_s32(acc_s32x4);
    acc_s64x1 = vadd_s64(vget_low_s64(acc_s64x2), vget_high_s64(acc_s64x2));
    xy = vget_lane_s32(vreinterpret_s32_s64(acc_s64x1), 0);

    /* Scalar tail: at most three leftover samples. */
    for (; i < N; i++) {
        xy = MAC16_16(xy, x[i], y[i]);
    }

#ifdef OPUS_CHECK_ASM
    celt_assert(celt_inner_prod_c(x, y, N) == xy);
#endif

    return xy;
}
74
/* NEON dual inner product (fixed-point build): computes x.y01 and x.y02 in
 * one pass so x is loaded only once. Same loop structure as
 * celt_inner_prod_neon(); results written through xy1/xy2. */
void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
        int N, opus_val32 *xy1, opus_val32 *xy2)
{
    int i = 0;
    opus_val32 sum1, sum2;
    int32x4_t acc1_s32x4 = vdupq_n_s32(0);
    int32x4_t acc2_s32x4 = vdupq_n_s32(0);
    int64x2_t acc1_s64x2, acc2_s64x2;
    int64x1_t acc1_s64x1, acc2_s64x1;

    /* Main loop: 8 samples of each stream per iteration. */
    while (i < N - 7) {
        const int16x8_t x_vec  = vld1q_s16(&x[i]);
        const int16x8_t y1_vec = vld1q_s16(&y01[i]);
        const int16x8_t y2_vec = vld1q_s16(&y02[i]);
        acc1_s32x4 = vmlal_s16(acc1_s32x4, vget_low_s16 (x_vec), vget_low_s16 (y1_vec));
        acc2_s32x4 = vmlal_s16(acc2_s32x4, vget_low_s16 (x_vec), vget_low_s16 (y2_vec));
        acc1_s32x4 = vmlal_s16(acc1_s32x4, vget_high_s16(x_vec), vget_high_s16(y1_vec));
        acc2_s32x4 = vmlal_s16(acc2_s32x4, vget_high_s16(x_vec), vget_high_s16(y2_vec));
        i += 8;
    }

    /* One 4-wide step if at least four samples remain. */
    if (N - i >= 4) {
        const int16x4_t x_half = vld1_s16(&x[i]);
        acc1_s32x4 = vmlal_s16(acc1_s32x4, x_half, vld1_s16(&y01[i]));
        acc2_s32x4 = vmlal_s16(acc2_s32x4, x_half, vld1_s16(&y02[i]));
        i += 4;
    }

    /* Horizontal reduction of both accumulators. */
    acc1_s64x2 = vpaddlq_s32(acc1_s32x4);
    acc2_s64x2 = vpaddlq_s32(acc2_s32x4);
    acc1_s64x1 = vadd_s64(vget_low_s64(acc1_s64x2), vget_high_s64(acc1_s64x2));
    acc2_s64x1 = vadd_s64(vget_low_s64(acc2_s64x2), vget_high_s64(acc2_s64x2));
    sum1 = vget_lane_s32(vreinterpret_s32_s64(acc1_s64x1), 0);
    sum2 = vget_lane_s32(vreinterpret_s32_s64(acc2_s64x1), 0);

    /* Scalar tail: at most three leftover samples. */
    for (; i < N; i++) {
        sum1 = MAC16_16(sum1, x[i], y01[i]);
        sum2 = MAC16_16(sum2, x[i], y02[i]);
    }
    *xy1 = sum1;
    *xy2 = sum2;

#ifdef OPUS_CHECK_ASM
    {
        opus_val32 xy1_c, xy2_c;
        dual_inner_prod_c(x, y01, y02, N, &xy1_c, &xy2_c);
        celt_assert(xy1_c == *xy1);
        celt_assert(xy2_c == *xy2);
    }
#endif
}
128
129#else /* !FIXED_POINT */
130
131/* ========================================================================== */
132
133#ifdef OPUS_CHECK_ASM
134
135/* This part of code simulates floating-point NEON operations. */
136
137/* celt_inner_prod_neon_float_c_simulation() simulates the floating-point */
138/* operations of celt_inner_prod_neon(), and both functions should have bit */
139/* exact output. */
140static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, int N)
141{
142 int i;
143 opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
144 for (i = 0; i < N - 3; i += 4) {
145 xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
146 xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
147 xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
148 xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
149 }
150 xy0 += xy2;
151 xy1 += xy3;
152 xy = xy0 + xy1;
153 for (; i < N; i++) {
154 xy = MAC16_16(xy, x[i], y[i]);
155 }
156 return xy;
157}
158
159/* dual_inner_prod_neon_float_c_simulation() simulates the floating-point */
160/* operations of dual_inner_prod_neon(), and both functions should have bit */
161/* exact output. */
162static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
163 int N, opus_val32 *xy1, opus_val32 *xy2)
164{
165 int i;
166 opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0;
167 for (i = 0; i < N - 3; i += 4) {
168 xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]);
169 xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]);
170 xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]);
171 xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]);
172 xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]);
173 xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]);
174 xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]);
175 xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]);
176 }
177 xy01_0 += xy01_2;
178 xy02_0 += xy02_2;
179 xy01_1 += xy01_3;
180 xy02_1 += xy02_3;
181 xy01 = xy01_0 + xy01_1;
182 xy02 = xy02_0 + xy02_1;
183 for (; i < N; i++) {
184 xy01 = MAC16_16(xy01, x[i], y01[i]);
185 xy02 = MAC16_16(xy02, x[i], y02[i]);
186 }
187 *xy1 = xy01;
188 *xy2 = xy02;
189}
190
191#endif /* OPUS_CHECK_ASM */
192
193/* ========================================================================== */
194
/* NEON inner product of two float vectors (float build). Eight floats per
 * iteration via two fused multiply-accumulate steps, then a horizontal
 * reduction; the scalar tail handles up to three leftover samples. */
opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
{
    int i = 0;
    opus_val32 xy;
    float32x4_t acc_f32x4 = vdupq_n_f32(0);
    float32x2_t acc_f32x2;

    /* Main loop: two 4-wide MACs, 8 floats per iteration. */
    while (i < N - 7) {
        acc_f32x4 = vmlaq_f32(acc_f32x4, vld1q_f32(&x[i]),     vld1q_f32(&y[i]));
        acc_f32x4 = vmlaq_f32(acc_f32x4, vld1q_f32(&x[i + 4]), vld1q_f32(&y[i + 4]));
        i += 8;
    }

    /* One 4-wide step if at least four samples remain. */
    if (N - i >= 4) {
        acc_f32x4 = vmlaq_f32(acc_f32x4, vld1q_f32(&x[i]), vld1q_f32(&y[i]));
        i += 4;
    }

    /* Reduce: add low+high halves, then pairwise add within the half. */
    acc_f32x2 = vadd_f32(vget_low_f32(acc_f32x4), vget_high_f32(acc_f32x4));
    acc_f32x2 = vpadd_f32(acc_f32x2, acc_f32x2);
    xy = vget_lane_f32(acc_f32x2, 0);

    /* Scalar tail. */
    for (; i < N; i++) {
        xy = MAC16_16(xy, x[i], y[i]);
    }

#ifdef OPUS_CHECK_ASM
    celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL);
#endif

    return xy;
}
233
/* NEON dual inner product (float build): x.y01 and x.y02 computed together
 * so x is loaded once per block of samples. Mirrors the structure of the
 * float celt_inner_prod_neon() above. */
void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
        int N, opus_val32 *xy1, opus_val32 *xy2)
{
    int i = 0;
    opus_val32 sum1, sum2;
    float32x4_t acc1_f32x4 = vdupq_n_f32(0);
    float32x4_t acc2_f32x4 = vdupq_n_f32(0);
    float32x2_t acc1_f32x2, acc2_f32x2;

    /* Main loop: 8 floats of each stream per iteration. */
    while (i < N - 7) {
        float32x4_t x_vec  = vld1q_f32(&x[i]);
        float32x4_t y1_vec = vld1q_f32(&y01[i]);
        float32x4_t y2_vec = vld1q_f32(&y02[i]);
        acc1_f32x4 = vmlaq_f32(acc1_f32x4, x_vec, y1_vec);
        acc2_f32x4 = vmlaq_f32(acc2_f32x4, x_vec, y2_vec);
        x_vec  = vld1q_f32(&x[i + 4]);
        y1_vec = vld1q_f32(&y01[i + 4]);
        y2_vec = vld1q_f32(&y02[i + 4]);
        acc1_f32x4 = vmlaq_f32(acc1_f32x4, x_vec, y1_vec);
        acc2_f32x4 = vmlaq_f32(acc2_f32x4, x_vec, y2_vec);
        i += 8;
    }

    /* One 4-wide step if at least four samples remain. */
    if (N - i >= 4) {
        const float32x4_t x_vec = vld1q_f32(&x[i]);
        acc1_f32x4 = vmlaq_f32(acc1_f32x4, x_vec, vld1q_f32(&y01[i]));
        acc2_f32x4 = vmlaq_f32(acc2_f32x4, x_vec, vld1q_f32(&y02[i]));
        i += 4;
    }

    /* Reduce both accumulators: low+high halves, then pairwise add. */
    acc1_f32x2 = vadd_f32(vget_low_f32(acc1_f32x4), vget_high_f32(acc1_f32x4));
    acc2_f32x2 = vadd_f32(vget_low_f32(acc2_f32x4), vget_high_f32(acc2_f32x4));
    acc1_f32x2 = vpadd_f32(acc1_f32x2, acc1_f32x2);
    acc2_f32x2 = vpadd_f32(acc2_f32x2, acc2_f32x2);
    sum1 = vget_lane_f32(acc1_f32x2, 0);
    sum2 = vget_lane_f32(acc2_f32x2, 0);

    /* Scalar tail. */
    for (; i < N; i++) {
        sum1 = MAC16_16(sum1, x[i], y01[i]);
        sum2 = MAC16_16(sum2, x[i], y02[i]);
    }
    *xy1 = sum1;
    *xy2 = sum2;

#ifdef OPUS_CHECK_ASM
    {
        opus_val32 xy1_c, xy2_c;
        dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c);
        celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL);
        celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL);
    }
#endif
}
289
290#endif /* FIXED_POINT */