From 5696f91a1e5e0cd1022c60737465c29b429ea599 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Fri, 14 Mar 2014 23:45:27 +0100 Subject: pcm/PcmFormat: ARM NEON optimizations for float->s16 This is nearly 4 times faster than the "portable" algorithm. --- src/pcm/Neon.hxx | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/pcm/PcmFormat.cxx | 39 ++++++++++++++++++++- 2 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 src/pcm/Neon.hxx (limited to 'src') diff --git a/src/pcm/Neon.hxx b/src/pcm/Neon.hxx new file mode 100644 index 000000000..a6f52db42 --- /dev/null +++ b/src/pcm/Neon.hxx @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2003-2014 The Music Player Daemon Project + * http://www.musicpd.org + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef MPD_PCM_NEON_HXX +#define MPD_PCM_NEON_HXX + +#include "Traits.hxx" + +#include + +/** + * Call a NEON intrinsic for each element in the vector. + * + * @param func the NEON intrinsic + * @param result the vector variable that gets assigned the result + * @param vector the input vector + */ +#define neon_x4_u(func, result, vector) do { \ + result.val[0] = func(vector.val[0]); \ + result.val[1] = func(vector.val[1]); \ + result.val[2] = func(vector.val[2]); \ + result.val[3] = func(vector.val[3]); \ +} while (0) + +/** + * Call a NEON intrinsic for each element in the vector. + * + * @param func the NEON intrinsic + * @param result the vector variable that gets assigned the result + * @param vector the input vector + * @param arg an additional argument that gets passed to each call + */ +#define neon_x4_b(func, result, vector, arg) do { \ + result.val[0] = func(vector.val[0], arg); \ + result.val[1] = func(vector.val[1], arg); \ + result.val[2] = func(vector.val[2], arg); \ + result.val[3] = func(vector.val[3], arg); \ +} while (0) + +/** + * Convert floating point samples to 16 bit signed integer using ARM NEON. + */ +struct NeonFloatTo16 { + static constexpr SampleFormat src_format = SampleFormat::FLOAT; + static constexpr SampleFormat dst_format = SampleFormat::S16; + typedef SampleTraits SrcTraits; + typedef SampleTraits DstTraits; + + typedef typename SrcTraits::value_type SV; + typedef typename DstTraits::value_type DV; + + static constexpr size_t BLOCK_SIZE = 16; + + void Convert(int16_t *dst, const float *src, const size_t n) const { + const float32x4_t factor = + vdupq_n_f32(1 << (DstTraits::BITS - 1)); + + for (unsigned i = 0; i < n / BLOCK_SIZE; + ++i, src += BLOCK_SIZE, dst += BLOCK_SIZE) { + /* load 16 float samples into 4 quad + registers */ + float32x4x4_t value = vld4q_f32(src); + + /* apply factor */ + neon_x4_b(vmulq_f32, value, value, factor); + + /* convert to 32 bit integer */ + int32x4x4_t ivalue; + neon_x4_u(vcvtq_s32_f32, ivalue, value); + + /* convert to 16 bit integer with saturation */ + int16x4x4_t nvalue; + neon_x4_u(vqmovn_s32, nvalue, ivalue); + + /* store result */ + vst4_s16(dst, nvalue); + } + } +}; + +#endif diff --git a/src/pcm/PcmFormat.cxx b/src/pcm/PcmFormat.cxx index f2d71a6fb..4cabc05a0 100644 --- a/src/pcm/PcmFormat.cxx +++ b/src/pcm/PcmFormat.cxx @@ -76,9 +76,46 @@ struct Convert32To16 { }; template> -struct FloatToInteger +struct PortableFloatToInteger : PerSampleConvert> {}; +template> +struct FloatToInteger : PortableFloatToInteger {}; + +/** + * A template class that attempts to use the "optimized" algorithm for + * large portions of the buffer, and calls the "portable" algorithm" + * for the rest when the last block is not full. + */ +template +class GlueOptimizedConvert : Optimized, Portable { +public: + typedef typename Portable::SrcTraits SrcTraits; + typedef typename Portable::DstTraits DstTraits; + + void Convert(typename DstTraits::pointer_type out, + typename SrcTraits::const_pointer_type in, + size_t n) const { + Optimized::Convert(out, in, n); + + /* use the "portable" algorithm for the trailing + samples */ + size_t remaining = n % Optimized::BLOCK_SIZE; + size_t done = n - remaining; + Portable::Convert(out + done, in + done, remaining); + } +}; + +#ifdef __ARM_NEON__ +#include "Neon.hxx" + +template<> +struct FloatToInteger> + : GlueOptimizedConvert> {}; + +#endif + template static ConstBuffer AllocateConvert(PcmBuffer &buffer, C convert, -- cgit v1.2.3