From 7e0f289ed75c40a5a633416242eaf1e44aa11498 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Wed, 19 Feb 2014 18:06:34 -0800 Subject: [PATCH] Add SSE2 variant of FIR filter This patch adds an SSE2 variant of the FIR filter, to complement the existing NEON optimization. This version is written using intrinsics. Benchmark results: 2.8ns per sample for a 16-tap filter, which is 4x the scalar speed. --- cpp/src/core.gyp | 8 ++++ cpp/src/fir.cc | 95 +++++++++++++++++++++++++++++++++++++++++- cpp/src/fir.h | 14 +++++++ cpp/src/test_filter.cc | 14 ++++++- 4 files changed, 128 insertions(+), 3 deletions(-) diff --git a/cpp/src/core.gyp b/cpp/src/core.gyp index 780b3f3..ff5d47b 100644 --- a/cpp/src/core.gyp +++ b/cpp/src/core.gyp @@ -23,6 +23,14 @@ ], 'include_dirs': ['.'], }, + { + 'target_name': 'test_filter', + 'type': 'executable', + 'sources': [ + 'test_filter.cc', + 'fir.cc', + ], + } ], } diff --git a/cpp/src/fir.cc b/cpp/src/fir.cc index a03da68..a35ac2d 100644 --- a/cpp/src/fir.cc +++ b/cpp/src/fir.cc @@ -18,15 +18,21 @@ #include <stdio.h> // for debugging, remove #include -#include #include "aligned_buf.h" #include "fir.h" -// Should probably ifdef this to make it more portable +#ifdef __ANDROID_API__ void *malloc_aligned(size_t alignment, size_t nbytes) { return memalign(alignment, nbytes); } +#else +void *malloc_aligned(size_t alignment, size_t nbytes) { + void *result; + int status = posix_memalign(&result, alignment, nbytes); + return status == 0 ? 
result : 0; +} +#endif SimpleFirFilter::SimpleFirFilter(const float *kernel, size_t nk) : nk(nk) { k = (float *)malloc(nk * sizeof(k[0])); @@ -180,3 +186,88 @@ void Neon16FirFilter::process(const float *in, float *out, size_t n) { } #endif + +#ifdef __SSE2__ +#include <emmintrin.h> + +SseFirFilter::SseFirFilter(const float *kernel, size_t nk) : nk(nk) { + // TODO: handle odd size nk (must be multiple of 4) + k = (float *)malloc_aligned(16, nk * sizeof(k[0])); + for (size_t i = 0; i < nk; i += 4) { + for (size_t j = 0; j < 4; j++) { + k[i + j] = kernel[nk - i - 4 + j]; + } + } +} + +SseFirFilter::~SseFirFilter() { + free(k); +} + +void printvec(__m128 v) { + float *f = (float *)&v; + printf("[%f %f %f %f]\n", f[0], f[1], f[2], f[3]); +} + +void SseFirFilter::process(const float *in1, float *out, size_t n) { + const float *in = in1 - 1; + __m128 q9 = _mm_set_ps1(0.0); + __m128 q10 = _mm_set_ps1(0.0); + __m128 q11 = _mm_set_ps1(0.0); + __m128i mask = _mm_set_epi32(-1, -1, -1, 0); + for (int i = 0; i < nk; i += 4) { + __m128 q0 = _mm_load_ps(&in[i]); + __m128 q1 = _mm_load_ps(&k[i]); + __m128 s = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(1, 1, 1, 1)); + q9 = _mm_add_ps(_mm_mul_ps(q1, s), q9); + s = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(2, 2, 2, 2)); + q10 = _mm_add_ps(_mm_mul_ps(q1, s), q10); + s = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(3, 3, 3, 3)); + q11 = _mm_add_ps(_mm_mul_ps(q1, s), q11); + } + // Note: AVX has _mm_permute_ps, which would be a bit more direct + q9 = (__m128)_mm_and_si128((__m128i)q9, mask); + __m128 q8 = _mm_shuffle_ps(q9, q9, _MM_SHUFFLE(0, 0, 0, 3)); + q10 = _mm_shuffle_ps(q10, (__m128)mask, _MM_SHUFFLE(0, 0, 3, 2)); + q8 = _mm_add_ps(q8, q10); + q11 = (__m128)_mm_and_si128((__m128i)q11, mask); + q11 = _mm_shuffle_ps(q11, q11, _MM_SHUFFLE(0, 3, 2, 1)); + q8 = _mm_add_ps(q8, q11); + for (int i = 0; i < n; i += 4) { + q9 = _mm_set_ps1(0.0); + q10 = _mm_set_ps1(0.0); + q11 = _mm_set_ps1(0.0); + const float *inptr = &in[i + 4]; + // inner loop + for (int j = 0; j < nk; j += 4) 
{ + __m128 q0 = _mm_load_ps(&inptr[j]); + __m128 q1 = _mm_load_ps(&k[j]); + __m128 s = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(0, 0, 0, 0)); + q8 = _mm_add_ps(_mm_mul_ps(q1, s), q8); + s = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(1, 1, 1, 1)); + q9 = _mm_add_ps(_mm_mul_ps(q1, s), q9); + s = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(2, 2, 2, 2)); + q10 = _mm_add_ps(_mm_mul_ps(q1, s), q10); + s = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(3, 3, 3, 3)); + q11 = _mm_add_ps(_mm_mul_ps(q1, s), q11); + } + + // process overlaps + __m128 q0a = _mm_shuffle_ps(q9, q9, _MM_SHUFFLE(2, 1, 0, 3)); + __m128 q0 = _mm_add_ps(q8, (__m128)_mm_and_si128(mask, (__m128i)q0a)); + q8 = (__m128)_mm_andnot_si128(mask, (__m128i)q0a); + q0a = _mm_shuffle_ps((__m128)mask, q10, _MM_SHUFFLE(1, 0, 0, 0)); + q0 = _mm_add_ps(q0, q0a); + q0a = _mm_shuffle_ps(q10, (__m128)mask, _MM_SHUFFLE(0, 0, 3, 2)); + q8 = _mm_add_ps(q8, q0a); + q0a = (__m128)_mm_andnot_si128(mask, (__m128i)q11); + q0a = _mm_shuffle_ps(q0a, q0a, _MM_SHUFFLE(0, 3, 2, 1)); + q0 = _mm_add_ps(q0, q0a); + q0a = (__m128)_mm_and_si128(mask, (__m128i)q11); + q0a = _mm_shuffle_ps(q0a, q0a, _MM_SHUFFLE(0, 3, 2, 1)); + q8 = _mm_add_ps(q8, q0a); + _mm_store_ps(&out[i], q0); + } +} + +#endif diff --git a/cpp/src/fir.h b/cpp/src/fir.h index 3ff245e..8e329cf 100644 --- a/cpp/src/fir.h +++ b/cpp/src/fir.h @@ -76,3 +76,17 @@ class Neon16FirFilter : public FirFilter { }; #endif // HAVE_NEON + +#ifdef __SSE2__ + +class SseFirFilter : public FirFilter { + public: + SseFirFilter(const float *kernel, size_t nk); + ~SseFirFilter(); + void process(const float *in, float *out, size_t n); + private: + size_t nk; + float *k; +}; + +#endif // __SSE2__ diff --git a/cpp/src/test_filter.cc b/cpp/src/test_filter.cc index 1f9d8c7..be951c6 100644 --- a/cpp/src/test_filter.cc +++ b/cpp/src/test_filter.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -101,6 +102,11 @@ void benchfir(int size, int experiment) { case 4: f = new HalfRateFirFilter(kernel, size, 
nblock); break; +#ifdef __SSE2__ + case 5: + f = new SseFirFilter(kernel, size); + break; +#endif } @@ -127,7 +133,13 @@ void runfirbench() { "set xlabel 'FIR kernel size'\n" "set ylabel 'ns per sample'\n" "plot '-' title 'scalar', '-' title '4x4 block', '-' title 'fixed16', '-' title 'fixed16 mirror', '-' title 'half rate'\n"); - for (int experiment = 0; experiment < 5; experiment++) { + for (int experiment = 0; experiment < 6; experiment++) { +#ifndef HAVE_NEON + if (experiment >= 1 && experiment <= 4) continue; +#endif +#ifndef __SSE2__ + if (experiment == 5) continue; +#endif for (int i = 16; i <= 256; i += 16) { benchfir(i, experiment); }