From 811877e5b3ae95b70e285b786bb7cc9d73d333e0 Mon Sep 17 00:00:00 2001
From: Jens Arnold
Date: Mon, 30 Aug 2010 06:31:47 +0000
Subject: libdemac: ARMv7 assembler optimisation for the filters, tested on
 Nokia N900. Speedup is 2.1x for -c5000 compared to the ARMv6 asm. Note that
 actually compiling it on device requires hand-assembling the 'vadd' and
 'vsub' instructions due to a bug in binutils 2.18.50, and making the
 standalone decoder use it requires Makefile and demac_config.h hacks.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@27944 a1c6a512-1295-4272-9138-f99709370657
---
 apps/codecs/demac/libdemac/filter.c              |   2 +
 apps/codecs/demac/libdemac/vector_math16_armv7.h | 214 +++++++++++++++++++++++
 2 files changed, 216 insertions(+)
 create mode 100644 apps/codecs/demac/libdemac/vector_math16_armv7.h

(limited to 'apps/codecs/demac/libdemac')

diff --git a/apps/codecs/demac/libdemac/filter.c b/apps/codecs/demac/libdemac/filter.c
index 8055098301..903885cf00 100644
--- a/apps/codecs/demac/libdemac/filter.c
+++ b/apps/codecs/demac/libdemac/filter.c
@@ -41,6 +41,8 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 
 #ifdef CPU_COLDFIRE
 #include "vector_math16_cf.h"
+#elif defined(CPU_ARM) && (ARM_ARCH >= 7)
+#include "vector_math16_armv7.h"
 #elif defined(CPU_ARM) && (ARM_ARCH >= 6)
 #include "vector_math16_armv6.h"
 #elif defined(CPU_ARM) && (ARM_ARCH >= 5)
diff --git a/apps/codecs/demac/libdemac/vector_math16_armv7.h b/apps/codecs/demac/libdemac/vector_math16_armv7.h
new file mode 100644
index 0000000000..84afda3e5d
--- /dev/null
+++ b/apps/codecs/demac/libdemac/vector_math16_armv7.h
@@ -0,0 +1,214 @@
+/*
+
+libdemac - A Monkey's Audio decoder
+
+$Id$
+
+Copyright (C) Dave Chapman 2007
+
+ARMv7 neon vector math copyright (C) 2010 Jens Arnold
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+
+*/
+
+#define FUSED_VECTOR_MATH
+
+#if ORDER > 32
+#define REPEAT_BLOCK(x) x x x
+#elif ORDER > 16
+#define REPEAT_BLOCK(x) x
+#else
+#define REPEAT_BLOCK(x)
+#endif
+
+/* Calculate scalarproduct, then add a 2nd vector (fused for performance) */
+static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2)
+{
+    int res;
+#if ORDER > 64
+    int cnt = ORDER>>6;
+#endif
+
+    asm volatile (
+#if ORDER > 64
+        "vmov.i16 q0, #0 \n"
+    "1: \n"
+        "subs %[cnt], %[cnt], #1 \n"
+#endif
+        "vld1.16 {d6-d9}, [%[f2]]! \n"
+        "vld1.16 {d2-d5}, [%[v1]] \n"
+        "vld1.16 {d10-d13}, [%[s2]]! \n"
+#if ORDER > 64
+        "vmlal.s16 q0, d2, d6 \n"
+#else
+        "vmull.s16 q0, d2, d6 \n"
+#endif
+        "vmlal.s16 q0, d3, d7 \n"
+        "vmlal.s16 q0, d4, d8 \n"
+        "vmlal.s16 q0, d5, d9 \n"
+        "vadd.i16 q1, q1, q5 \n"
+        "vadd.i16 q2, q2, q6 \n"
+        "vst1.16 {d2-d5}, [%[v1]]! \n"
+
+        REPEAT_BLOCK(
+        "vld1.16 {d6-d9}, [%[f2]]! \n"
+        "vld1.16 {d2-d5}, [%[v1]] \n"
+        "vld1.16 {d10-d13}, [%[s2]]! \n"
+        "vmlal.s16 q0, d2, d6 \n"
+        "vmlal.s16 q0, d3, d7 \n"
+        "vmlal.s16 q0, d4, d8 \n"
+        "vmlal.s16 q0, d5, d9 \n"
+        "vadd.i16 q1, q1, q5 \n"
+        "vadd.i16 q2, q2, q6 \n"
+        "vst1.16 {d2-d5}, [%[v1]]! \n"
+        )
+#if ORDER > 64
+        "bne 1b \n"
+#endif
+        "vpadd.i32 d0, d0, d1 \n"
+        "vpaddl.s32 d0, d0 \n"
+        "vmov.32 %[res], d0[0] \n"
+        : /* outputs */
+#if ORDER > 64
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [f2] "+r"(f2),
+        [s2] "+r"(s2),
+        [res]"=r"(res)
+        : /* inputs */
+        : /* clobbers */
+        "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+        "d8", "d9", "d10", "d11", "d12", "d13", "memory"
+    );
+    return res;
+}
+
+/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance) */
+static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2)
+{
+    int res;
+#if ORDER > 64
+    int cnt = ORDER>>6;
+#endif
+
+    asm volatile (
+#if ORDER > 64
+        "vmov.i16 q0, #0 \n"
+    "1: \n"
+        "subs %[cnt], %[cnt], #1 \n"
+#endif
+        "vld1.16 {d6-d9}, [%[f2]]! \n"
+        "vld1.16 {d2-d5}, [%[v1]] \n"
+        "vld1.16 {d10-d13}, [%[s2]]! \n"
+#if ORDER > 64
+        "vmlal.s16 q0, d2, d6 \n"
+#else
+        "vmull.s16 q0, d2, d6 \n"
+#endif
+        "vmlal.s16 q0, d3, d7 \n"
+        "vmlal.s16 q0, d4, d8 \n"
+        "vmlal.s16 q0, d5, d9 \n"
+        "vsub.i16 q1, q1, q5 \n"
+        "vsub.i16 q2, q2, q6 \n"
+        "vst1.16 {d2-d5}, [%[v1]]! \n"
+
+        REPEAT_BLOCK(
+        "vld1.16 {d6-d9}, [%[f2]]! \n"
+        "vld1.16 {d2-d5}, [%[v1]] \n"
+        "vld1.16 {d10-d13}, [%[s2]]! \n"
+        "vmlal.s16 q0, d2, d6 \n"
+        "vmlal.s16 q0, d3, d7 \n"
+        "vmlal.s16 q0, d4, d8 \n"
+        "vmlal.s16 q0, d5, d9 \n"
+        "vsub.i16 q1, q1, q5 \n"
+        "vsub.i16 q2, q2, q6 \n"
+        "vst1.16 {d2-d5}, [%[v1]]! \n"
+        )
+#if ORDER > 64
+        "bne 1b \n"
+#endif
+        "vpadd.i32 d0, d0, d1 \n"
+        "vpaddl.s32 d0, d0 \n"
+        "vmov.32 %[res], d0[0] \n"
+        : /* outputs */
+#if ORDER > 64
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [f2] "+r"(f2),
+        [s2] "+r"(s2),
+        [res]"=r"(res)
+        : /* inputs */
+        : /* clobbers */
+        "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+        "d8", "d9", "d10", "d11", "d12", "d13", "memory"
+    );
+    return res;
+}
+
+static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
+{
+    int res;
+#if ORDER > 64
+    int cnt = ORDER>>6;
+#endif
+
+    asm volatile (
+#if ORDER > 64
+        "vmov.i16 q0, #0 \n"
+    "1: \n"
+        "subs %[cnt], %[cnt], #1 \n"
+#endif
+        "vld1.16 {d2-d5}, [%[v1]]! \n"
+        "vld1.16 {d6-d9}, [%[v2]]! \n"
+#if ORDER > 64
+        "vmlal.s16 q0, d2, d6 \n"
+#else
+        "vmull.s16 q0, d2, d6 \n"
+#endif
+        "vmlal.s16 q0, d3, d7 \n"
+        "vmlal.s16 q0, d4, d8 \n"
+        "vmlal.s16 q0, d5, d9 \n"
+
+        REPEAT_BLOCK(
+        "vld1.16 {d2-d5}, [%[v1]]! \n"
+        "vld1.16 {d6-d9}, [%[v2]]! \n"
+        "vmlal.s16 q0, d2, d6 \n"
+        "vmlal.s16 q0, d3, d7 \n"
+        "vmlal.s16 q0, d4, d8 \n"
+        "vmlal.s16 q0, d5, d9 \n"
+        )
+#if ORDER > 64
+        "bne 1b \n"
+#endif
+        "vpadd.i32 d0, d0, d1 \n"
+        "vpaddl.s32 d0, d0 \n"
+        "vmov.32 %[res], d0[0] \n"
+        : /* outputs */
+#if ORDER > 64
+        [cnt]"+r"(cnt),
+#endif
+        [v1] "+r"(v1),
+        [v2] "+r"(v2),
+        [res]"=r"(res)
+        : /* inputs */
+        : /* clobbers */
+        "d0", "d1", "d2", "d3", "d4",
+        "d5", "d6", "d7", "d8", "d9"
+    );
+    return res;
+}
-- 
cgit v1.2.3