1 files changed, 290 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libopus/celt/arm/pitch_neon_intr.c b/lib/rbcodec/codecs/libopus/celt/arm/pitch_neon_intr.c
new file mode 100644
index 0000000000..1ac38c433a
--- /dev/null
+++ b/lib/rbcodec/codecs/libopus/celt/arm/pitch_neon_intr.c
@@ -0,0 +1,290 @@
+/***********************************************************************
+Copyright (c) 2017 Google Inc.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#include <arm_neon.h>
+#include "pitch.h"
+#ifdef FIXED_POINT
+opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
+{
+    int i;
+    opus_val32 xy;
+    int16x8_t x_s16x8, y_s16x8;
+    int32x4_t xy_s32x4 = vdupq_n_s32(0);
+    int64x2_t xy_s64x2;
+    int64x1_t xy_s64x1;
+    for (i = 0; i < N - 7; i += 8) {
+        x_s16x8  = vld1q_s16(&x[i]);
+        y_s16x8  = vld1q_s16(&y[i]);
+        xy_s32x4 = vmlal_s16(xy_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y_s16x8));
+        xy_s32x4 = vmlal_s16(xy_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y_s16x8));
+    }
+    if (N - i >= 4) {
+        const int16x4_t x_s16x4 = vld1_s16(&x[i]);
+        const int16x4_t y_s16x4 = vld1_s16(&y[i]);
+        xy_s32x4 = vmlal_s16(xy_s32x4, x_s16x4, y_s16x4);
+        i += 4;
+    }
+    xy_s64x2 = vpaddlq_s32(xy_s32x4);
+    xy_s64x1 = vadd_s64(vget_low_s64(xy_s64x2), vget_high_s64(xy_s64x2));
+    xy       = vget_lane_s32(vreinterpret_s32_s64(xy_s64x1), 0);
+    for (; i < N; i++) {
+        xy = MAC16_16(xy, x[i], y[i]);
+    }
+#ifdef OPUS_CHECK_ASM
+    celt_assert(celt_inner_prod_c(x, y, N) == xy);
+#endif
+    return xy;
+}
+void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+        int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+    int i;
+    opus_val32 xy01, xy02;
+    int16x8_t x_s16x8, y01_s16x8, y02_s16x8;
+    int32x4_t xy01_s32x4 = vdupq_n_s32(0);
+    int32x4_t xy02_s32x4 = vdupq_n_s32(0);
+    int64x2_t xy01_s64x2, xy02_s64x2;
+    int64x1_t xy01_s64x1, xy02_s64x1;
+    for (i = 0; i < N - 7; i += 8) {
+        x_s16x8    = vld1q_s16(&x[i]);
+        y01_s16x8  = vld1q_s16(&y01[i]);
+        y02_s16x8  = vld1q_s16(&y02[i]);
+        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y01_s16x8));
+        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y02_s16x8));
+        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y01_s16x8));
+        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y02_s16x8));
+    }
+    if (N - i >= 4) {
+        const int16x4_t x_s16x4   = vld1_s16(&x[i]);
+        const int16x4_t y01_s16x4 = vld1_s16(&y01[i]);
+        const int16x4_t y02_s16x4 = vld1_s16(&y02[i]);
+        xy01_s32x4 = vmlal_s16(xy01_s32x4, x_s16x4, y01_s16x4);
+        xy02_s32x4 = vmlal_s16(xy02_s32x4, x_s16x4, y02_s16x4);
+        i += 4;
+    }
+    xy01_s64x2 = vpaddlq_s32(xy01_s32x4);
+    xy02_s64x2 = vpaddlq_s32(xy02_s32x4);
+    xy01_s64x1 = vadd_s64(vget_low_s64(xy01_s64x2), vget_high_s64(xy01_s64x2));
+    xy02_s64x1 = vadd_s64(vget_low_s64(xy02_s64x2), vget_high_s64(xy02_s64x2));
+    xy01       = vget_lane_s32(vreinterpret_s32_s64(xy01_s64x1), 0);
+    xy02       = vget_lane_s32(vreinterpret_s32_s64(xy02_s64x1), 0);
+    for (; i < N; i++) {
+        xy01 = MAC16_16(xy01, x[i], y01[i]);
+        xy02 = MAC16_16(xy02, x[i], y02[i]);
+    }
+    *xy1 = xy01;
+    *xy2 = xy02;
+#ifdef OPUS_CHECK_ASM
+    {
+        opus_val32 xy1_c, xy2_c;
+        dual_inner_prod_c(x, y01, y02, N, &xy1_c, &xy2_c);
+        celt_assert(xy1_c == *xy1);
+        celt_assert(xy2_c == *xy2);
+    }
+#endif
+}
+#else /* !FIXED_POINT */
+/* ========================================================================== */
+#ifdef OPUS_CHECK_ASM
+/* This part of code simulates floating-point NEON operations. */
+/* celt_inner_prod_neon_float_c_simulation() simulates the floating-point   */
+/* operations of celt_inner_prod_neon(), and both functions should have bit */
+/* exact output.                                                            */
+static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, int N)
+{
+   int i;
+   opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
+   for (i = 0; i < N - 3; i += 4) {
+      xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
+      xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
+      xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
+      xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
+   }
+   xy0 += xy2;
+   xy1 += xy3;
+   xy = xy0 + xy1;
+   for (; i < N; i++) {
+      xy = MAC16_16(xy, x[i], y[i]);
+   }
+   return xy;
+}
+/* dual_inner_prod_neon_float_c_simulation() simulates the floating-point   */
+/* operations of dual_inner_prod_neon(), and both functions should have bit */
+/* exact output.                                                            */
+static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+      int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+   int i;
+   opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0;
+   for (i = 0; i < N - 3; i += 4) {
+      xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]);
+      xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]);
+      xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]);
+      xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]);
+      xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]);
+      xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]);
+      xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]);
+      xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]);
+   }
+   xy01_0 += xy01_2;
+   xy02_0 += xy02_2;
+   xy01_1 += xy01_3;
+   xy02_1 += xy02_3;
+   xy01 = xy01_0 + xy01_1;
+   xy02 = xy02_0 + xy02_1;
+   for (; i < N; i++) {
+      xy01 = MAC16_16(xy01, x[i], y01[i]);
+      xy02 = MAC16_16(xy02, x[i], y02[i]);
+   }
+   *xy1 = xy01;
+   *xy2 = xy02;
+}
+#endif /* OPUS_CHECK_ASM */
+/* ========================================================================== */
+opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
+{
+    int i;
+    opus_val32 xy;
+    float32x4_t xy_f32x4 = vdupq_n_f32(0);
+    float32x2_t xy_f32x2;
+    for (i = 0; i < N - 7; i += 8) {
+        float32x4_t x_f32x4, y_f32x4;
+        x_f32x4  = vld1q_f32(&x[i]);
+        y_f32x4  = vld1q_f32(&y[i]);
+        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
+        x_f32x4  = vld1q_f32(&x[i + 4]);
+        y_f32x4  = vld1q_f32(&y[i + 4]);
+        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
+    }
+    if (N - i >= 4) {
+        const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
+        const float32x4_t y_f32x4 = vld1q_f32(&y[i]);
+        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
+        i += 4;
+    }
+    xy_f32x2 = vadd_f32(vget_low_f32(xy_f32x4), vget_high_f32(xy_f32x4));
+    xy_f32x2 = vpadd_f32(xy_f32x2, xy_f32x2);
+    xy       = vget_lane_f32(xy_f32x2, 0);
+    for (; i < N; i++) {
+        xy = MAC16_16(xy, x[i], y[i]);
+    }
+#ifdef OPUS_CHECK_ASM
+    celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL);
+#endif
+    return xy;
+}
+void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+        int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+    int i;
+    opus_val32 xy01, xy02;
+    float32x4_t xy01_f32x4 = vdupq_n_f32(0);
+    float32x4_t xy02_f32x4 = vdupq_n_f32(0);
+    float32x2_t xy01_f32x2, xy02_f32x2;
+    for (i = 0; i < N - 7; i += 8) {
+        float32x4_t x_f32x4, y01_f32x4, y02_f32x4;
+        x_f32x4    = vld1q_f32(&x[i]);
+        y01_f32x4  = vld1q_f32(&y01[i]);
+        y02_f32x4  = vld1q_f32(&y02[i]);
+        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
+        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
+        x_f32x4    = vld1q_f32(&x[i + 4]);
+        y01_f32x4  = vld1q_f32(&y01[i + 4]);
+        y02_f32x4  = vld1q_f32(&y02[i + 4]);
+        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
+        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
+    }
+    if (N - i >= 4) {
+        const float32x4_t x_f32x4   = vld1q_f32(&x[i]);
+        const float32x4_t y01_f32x4 = vld1q_f32(&y01[i]);
+        const float32x4_t y02_f32x4 = vld1q_f32(&y02[i]);
+        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
+        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
+        i += 4;
+    }
+    xy01_f32x2 = vadd_f32(vget_low_f32(xy01_f32x4), vget_high_f32(xy01_f32x4));
+    xy02_f32x2 = vadd_f32(vget_low_f32(xy02_f32x4), vget_high_f32(xy02_f32x4));
+    xy01_f32x2 = vpadd_f32(xy01_f32x2, xy01_f32x2);
+    xy02_f32x2 = vpadd_f32(xy02_f32x2, xy02_f32x2);
+    xy01       = vget_lane_f32(xy01_f32x2, 0);
+    xy02       = vget_lane_f32(xy02_f32x2, 0);
+    for (; i < N; i++) {
+        xy01 = MAC16_16(xy01, x[i], y01[i]);
+        xy02 = MAC16_16(xy02, x[i], y02[i]);
+    }
+    *xy1 = xy01;
+    *xy2 = xy02;
+#ifdef OPUS_CHECK_ASM
+    {
+        opus_val32 xy1_c, xy2_c;
+        dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c);
+        celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL);
+        celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL);
+    }
+#endif
+}
+#endif /* FIXED_POINT */

diff --git a/lib/rbcodec/codecs/libopus/celt/arm/pitch_neon_intr.c b/lib/rbcodec/codecs/libopus/celt/arm/pitch_neon_intr.c new file mode 100644 index 0000000000..1ac38c433a --- /dev/null +++ b/lib/rbcodec/codecs/libopus/celt/arm/pitch_neon_intr.c
@@ -0,0 +1,290 @@
	1	/***********************************************************************
	2	Copyright (c) 2017 Google Inc.
	3	Redistribution and use in source and binary forms, with or without
	4	modification, are permitted provided that the following conditions
	5	are met:
	6	- Redistributions of source code must retain the above copyright notice,
	7	this list of conditions and the following disclaimer.
	8	- Redistributions in binary form must reproduce the above copyright
	9	notice, this list of conditions and the following disclaimer in the
	10	documentation and/or other materials provided with the distribution.
	11	- Neither the name of Internet Society, IETF or IETF Trust, nor the
	12	names of specific contributors, may be used to endorse or promote
	13	products derived from this software without specific prior written
	14	permission.
	15	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	16	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	17	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	18	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
	19	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	20	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	21	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	22	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	23	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	24	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	25	POSSIBILITY OF SUCH DAMAGE.
	26	***********************************************************************/
	27
	28	#ifdef HAVE_CONFIG_H
	29	#include "config.h"
	30	#endif
	31
	32	#include <arm_neon.h>
	33	#include "pitch.h"
	34
	35	#ifdef FIXED_POINT
	36
	37	opus_val32 celt_inner_prod_neon(const opus_val16 x, const opus_val16 y, int N)
	38	{
	39	int i;
	40	opus_val32 xy;
	41	int16x8_t x_s16x8, y_s16x8;
	42	int32x4_t xy_s32x4 = vdupq_n_s32(0);
	43	int64x2_t xy_s64x2;
	44	int64x1_t xy_s64x1;
	45
	46	for (i = 0; i < N - 7; i += 8) {
	47	x_s16x8 = vld1q_s16(&x[i]);
	48	y_s16x8 = vld1q_s16(&y[i]);
	49	xy_s32x4 = vmlal_s16(xy_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y_s16x8));
	50	xy_s32x4 = vmlal_s16(xy_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y_s16x8));
	51	}
	52
	53	if (N - i >= 4) {
	54	const int16x4_t x_s16x4 = vld1_s16(&x[i]);
	55	const int16x4_t y_s16x4 = vld1_s16(&y[i]);
	56	xy_s32x4 = vmlal_s16(xy_s32x4, x_s16x4, y_s16x4);
	57	i += 4;
	58	}
	59
	60	xy_s64x2 = vpaddlq_s32(xy_s32x4);
	61	xy_s64x1 = vadd_s64(vget_low_s64(xy_s64x2), vget_high_s64(xy_s64x2));
	62	xy = vget_lane_s32(vreinterpret_s32_s64(xy_s64x1), 0);
	63
	64	for (; i < N; i++) {
	65	xy = MAC16_16(xy, x[i], y[i]);
	66	}
	67
	68	#ifdef OPUS_CHECK_ASM
	69	celt_assert(celt_inner_prod_c(x, y, N) == xy);
	70	#endif
	71
	72	return xy;
	73	}
	74
	75	void dual_inner_prod_neon(const opus_val16 x, const opus_val16 y01, const opus_val16 *y02,
	76	int N, opus_val32 xy1, opus_val32 xy2)
	77	{
	78	int i;
	79	opus_val32 xy01, xy02;
	80	int16x8_t x_s16x8, y01_s16x8, y02_s16x8;
	81	int32x4_t xy01_s32x4 = vdupq_n_s32(0);
	82	int32x4_t xy02_s32x4 = vdupq_n_s32(0);
	83	int64x2_t xy01_s64x2, xy02_s64x2;
	84	int64x1_t xy01_s64x1, xy02_s64x1;
	85
	86	for (i = 0; i < N - 7; i += 8) {
	87	x_s16x8 = vld1q_s16(&x[i]);
	88	y01_s16x8 = vld1q_s16(&y01[i]);
	89	y02_s16x8 = vld1q_s16(&y02[i]);
	90	xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y01_s16x8));
	91	xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y02_s16x8));
	92	xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y01_s16x8));
	93	xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y02_s16x8));
	94	}
	95
	96	if (N - i >= 4) {
	97	const int16x4_t x_s16x4 = vld1_s16(&x[i]);
	98	const int16x4_t y01_s16x4 = vld1_s16(&y01[i]);
	99	const int16x4_t y02_s16x4 = vld1_s16(&y02[i]);
	100	xy01_s32x4 = vmlal_s16(xy01_s32x4, x_s16x4, y01_s16x4);
	101	xy02_s32x4 = vmlal_s16(xy02_s32x4, x_s16x4, y02_s16x4);
	102	i += 4;
	103	}
	104
	105	xy01_s64x2 = vpaddlq_s32(xy01_s32x4);
	106	xy02_s64x2 = vpaddlq_s32(xy02_s32x4);
	107	xy01_s64x1 = vadd_s64(vget_low_s64(xy01_s64x2), vget_high_s64(xy01_s64x2));
	108	xy02_s64x1 = vadd_s64(vget_low_s64(xy02_s64x2), vget_high_s64(xy02_s64x2));
	109	xy01 = vget_lane_s32(vreinterpret_s32_s64(xy01_s64x1), 0);
	110	xy02 = vget_lane_s32(vreinterpret_s32_s64(xy02_s64x1), 0);
	111
	112	for (; i < N; i++) {
	113	xy01 = MAC16_16(xy01, x[i], y01[i]);
	114	xy02 = MAC16_16(xy02, x[i], y02[i]);
	115	}
	116	*xy1 = xy01;
	117	*xy2 = xy02;
	118
	119	#ifdef OPUS_CHECK_ASM
	120	{
	121	opus_val32 xy1_c, xy2_c;
	122	dual_inner_prod_c(x, y01, y02, N, &xy1_c, &xy2_c);
	123	celt_assert(xy1_c == *xy1);
	124	celt_assert(xy2_c == *xy2);
	125	}
	126	#endif
	127	}
	128
	129	#else /* !FIXED_POINT */
	130
	131	/* ========================================================================== */
	132
	133	#ifdef OPUS_CHECK_ASM
	134
	135	/* This part of code simulates floating-point NEON operations. */
	136
	137	/* celt_inner_prod_neon_float_c_simulation() simulates the floating-point */
	138	/* operations of celt_inner_prod_neon(), and both functions should have bit */
	139	/* exact output. */
	140	static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 x, const opus_val16 y, int N)
	141	{
	142	int i;
	143	opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
	144	for (i = 0; i < N - 3; i += 4) {
	145	xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
	146	xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
	147	xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
	148	xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
	149	}
	150	xy0 += xy2;
	151	xy1 += xy3;
	152	xy = xy0 + xy1;
	153	for (; i < N; i++) {
	154	xy = MAC16_16(xy, x[i], y[i]);
	155	}
	156	return xy;
	157	}
	158
	159	/* dual_inner_prod_neon_float_c_simulation() simulates the floating-point */
	160	/* operations of dual_inner_prod_neon(), and both functions should have bit */
	161	/* exact output. */
	162	static void dual_inner_prod_neon_float_c_simulation(const opus_val16 x, const opus_val16 y01, const opus_val16 *y02,
	163	int N, opus_val32 xy1, opus_val32 xy2)
	164	{
	165	int i;
	166	opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0;
	167	for (i = 0; i < N - 3; i += 4) {
	168	xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]);
	169	xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]);
	170	xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]);
	171	xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]);
	172	xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]);
	173	xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]);
	174	xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]);
	175	xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]);
	176	}
	177	xy01_0 += xy01_2;
	178	xy02_0 += xy02_2;
	179	xy01_1 += xy01_3;
	180	xy02_1 += xy02_3;
	181	xy01 = xy01_0 + xy01_1;
	182	xy02 = xy02_0 + xy02_1;
	183	for (; i < N; i++) {
	184	xy01 = MAC16_16(xy01, x[i], y01[i]);
	185	xy02 = MAC16_16(xy02, x[i], y02[i]);
	186	}
	187	*xy1 = xy01;
	188	*xy2 = xy02;
	189	}
	190
	191	#endif /* OPUS_CHECK_ASM */
	192
	193	/* ========================================================================== */
	194
	195	opus_val32 celt_inner_prod_neon(const opus_val16 x, const opus_val16 y, int N)
	196	{
	197	int i;
	198	opus_val32 xy;
	199	float32x4_t xy_f32x4 = vdupq_n_f32(0);
	200	float32x2_t xy_f32x2;
	201
	202	for (i = 0; i < N - 7; i += 8) {
	203	float32x4_t x_f32x4, y_f32x4;
	204	x_f32x4 = vld1q_f32(&x[i]);
	205	y_f32x4 = vld1q_f32(&y[i]);
	206	xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
	207	x_f32x4 = vld1q_f32(&x[i + 4]);
	208	y_f32x4 = vld1q_f32(&y[i + 4]);
	209	xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
	210	}
	211
	212	if (N - i >= 4) {
	213	const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
	214	const float32x4_t y_f32x4 = vld1q_f32(&y[i]);
	215	xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
	216	i += 4;
	217	}
	218
	219	xy_f32x2 = vadd_f32(vget_low_f32(xy_f32x4), vget_high_f32(xy_f32x4));
	220	xy_f32x2 = vpadd_f32(xy_f32x2, xy_f32x2);
	221	xy = vget_lane_f32(xy_f32x2, 0);
	222
	223	for (; i < N; i++) {
	224	xy = MAC16_16(xy, x[i], y[i]);
	225	}
	226
	227	#ifdef OPUS_CHECK_ASM
	228	celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL);
	229	#endif
	230
	231	return xy;
	232	}
	233
	234	void dual_inner_prod_neon(const opus_val16 x, const opus_val16 y01, const opus_val16 *y02,
	235	int N, opus_val32 xy1, opus_val32 xy2)
	236	{
	237	int i;
	238	opus_val32 xy01, xy02;
	239	float32x4_t xy01_f32x4 = vdupq_n_f32(0);
	240	float32x4_t xy02_f32x4 = vdupq_n_f32(0);
	241	float32x2_t xy01_f32x2, xy02_f32x2;
	242
	243	for (i = 0; i < N - 7; i += 8) {
	244	float32x4_t x_f32x4, y01_f32x4, y02_f32x4;
	245	x_f32x4 = vld1q_f32(&x[i]);
	246	y01_f32x4 = vld1q_f32(&y01[i]);
	247	y02_f32x4 = vld1q_f32(&y02[i]);
	248	xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
	249	xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
	250	x_f32x4 = vld1q_f32(&x[i + 4]);
	251	y01_f32x4 = vld1q_f32(&y01[i + 4]);
	252	y02_f32x4 = vld1q_f32(&y02[i + 4]);
	253	xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
	254	xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
	255	}
	256
	257	if (N - i >= 4) {
	258	const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
	259	const float32x4_t y01_f32x4 = vld1q_f32(&y01[i]);
	260	const float32x4_t y02_f32x4 = vld1q_f32(&y02[i]);
	261	xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
	262	xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
	263	i += 4;
	264	}
	265
	266	xy01_f32x2 = vadd_f32(vget_low_f32(xy01_f32x4), vget_high_f32(xy01_f32x4));
	267	xy02_f32x2 = vadd_f32(vget_low_f32(xy02_f32x4), vget_high_f32(xy02_f32x4));
	268	xy01_f32x2 = vpadd_f32(xy01_f32x2, xy01_f32x2);
	269	xy02_f32x2 = vpadd_f32(xy02_f32x2, xy02_f32x2);
	270	xy01 = vget_lane_f32(xy01_f32x2, 0);
	271	xy02 = vget_lane_f32(xy02_f32x2, 0);
	272
	273	for (; i < N; i++) {
	274	xy01 = MAC16_16(xy01, x[i], y01[i]);
	275	xy02 = MAC16_16(xy02, x[i], y02[i]);
	276	}
	277	*xy1 = xy01;
	278	*xy2 = xy02;
	279
	280	#ifdef OPUS_CHECK_ASM
	281	{
	282	opus_val32 xy1_c, xy2_c;
	283	dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c);
	284	celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL);
	285	celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL);
	286	}
	287	#endif
	288	}
	289
	290	#endif /* FIXED_POINT */