Add SSE2 variant of FIR filter

This patch adds an SSE2 variant of the FIR filter, to complement the
existing NEON optimization. This version is written using intrinsics.

Benchmark results: 2.8ns per sample for a 16-tap filter, which is
4x the scalar speed.
master
Raph Levien 11 years ago
parent 099ef4146e
commit 7e0f289ed7
  1. 8
      cpp/src/core.gyp
  2. 95
      cpp/src/fir.cc
  3. 14
      cpp/src/fir.h
  4. 14
      cpp/src/test_filter.cc

@ -23,6 +23,14 @@
],
'include_dirs': ['.'],
},
{
'target_name': 'test_filter',
'type': 'executable',
'sources': [
'test_filter.cc',
'fir.cc',
],
}
],
}

@ -18,15 +18,21 @@
#include <stdio.h> // for debugging, remove
#include <stdlib.h>
#include <malloc.h>
#include "aligned_buf.h"
#include "fir.h"
// Should probably ifdef this to make it more portable
#ifdef __ANDROID_API__
void *malloc_aligned(size_t alignment, size_t nbytes) {
return memalign(alignment, nbytes);
}
#else
void *malloc_aligned(size_t alignment, size_t nbytes) {
void *result;
int status = posix_memalign(&result, alignment, nbytes);
return status == 0 ? result : 0;
}
#endif
SimpleFirFilter::SimpleFirFilter(const float *kernel, size_t nk) : nk(nk) {
k = (float *)malloc(nk * sizeof(k[0]));
@ -180,3 +186,88 @@ void Neon16FirFilter::process(const float *in, float *out, size_t n) {
}
#endif
#ifdef __SSE2__
#include <emmintrin.h>
SseFirFilter::SseFirFilter(const float *kernel, size_t nk) : nk(nk) {
// TODO: handle odd size nk (must be multiple of 4)
k = (float *)malloc_aligned(16, nk * sizeof(k[0]));
for (size_t i = 0; i < nk; i += 4) {
for (size_t j = 0; j < 4; j++) {
k[i + j] = kernel[nk - i - 4 + j];
}
}
}
SseFirFilter::~SseFirFilter() {
free(k);
}
void printvec(__m128 v) {
float *f = (float *)&v;
printf("[%f %f %f %f]\n", f[0], f[1], f[2], f[3]);
}
void SseFirFilter::process(const float *in1, float *out, size_t n) {
const float *in = in1 - 1;
__m128 q9 = _mm_set_ps1(0.0);
__m128 q10 = _mm_set_ps1(0.0);
__m128 q11 = _mm_set_ps1(0.0);
__m128i mask = _mm_set_epi32(-1, -1, -1, 0);
for (int i = 0; i < nk; i += 4) {
__m128 q0 = _mm_load_ps(&in[i]);
__m128 q1 = _mm_load_ps(&k[i]);
__m128 s = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(1, 1, 1, 1));
q9 = _mm_add_ps(_mm_mul_ps(q1, s), q9);
s = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(2, 2, 2, 2));
q10 = _mm_add_ps(_mm_mul_ps(q1, s), q10);
s = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(3, 3, 3, 3));
q11 = _mm_add_ps(_mm_mul_ps(q1, s), q11);
}
// Note: AVX has _mm_permute_ps, which would be a bit more direct
q9 = (__m128)_mm_and_si128((__m128i)q9, mask);
__m128 q8 = _mm_shuffle_ps(q9, q9, _MM_SHUFFLE(0, 0, 0, 3));
q10 = _mm_shuffle_ps(q10, (__m128)mask, _MM_SHUFFLE(0, 0, 3, 2));
q8 = _mm_add_ps(q8, q10);
q11 = (__m128)_mm_and_si128((__m128i)q11, mask);
q11 = _mm_shuffle_ps(q11, q11, _MM_SHUFFLE(0, 3, 2, 1));
q8 = _mm_add_ps(q8, q11);
for (int i = 0; i < n; i += 4) {
q9 = _mm_set_ps1(0.0);
q10 = _mm_set_ps1(0.0);
q11 = _mm_set_ps1(0.0);
const float *inptr = &in[i + 4];
// inner loop
for (int j = 0; j < nk; j += 4) {
__m128 q0 = _mm_load_ps(&inptr[j]);
__m128 q1 = _mm_load_ps(&k[j]);
__m128 s = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(0, 0, 0, 0));
q8 = _mm_add_ps(_mm_mul_ps(q1, s), q8);
s = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(1, 1, 1, 1));
q9 = _mm_add_ps(_mm_mul_ps(q1, s), q9);
s = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(2, 2, 2, 2));
q10 = _mm_add_ps(_mm_mul_ps(q1, s), q10);
s = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(3, 3, 3, 3));
q11 = _mm_add_ps(_mm_mul_ps(q1, s), q11);
}
// process overlaps
__m128 q0a = _mm_shuffle_ps(q9, q9, _MM_SHUFFLE(2, 1, 0, 3));
__m128 q0 = _mm_add_ps(q8, (__m128)_mm_and_si128(mask, (__m128i)q0a));
q8 = (__m128)_mm_andnot_si128(mask, (__m128i)q0a);
q0a = _mm_shuffle_ps((__m128)mask, q10, _MM_SHUFFLE(1, 0, 0, 0));
q0 = _mm_add_ps(q0, q0a);
q0a = _mm_shuffle_ps(q10, (__m128)mask, _MM_SHUFFLE(0, 0, 3, 2));
q8 = _mm_add_ps(q8, q0a);
q0a = (__m128)_mm_andnot_si128(mask, (__m128i)q11);
q0a = _mm_shuffle_ps(q0a, q0a, _MM_SHUFFLE(0, 3, 2, 1));
q0 = _mm_add_ps(q0, q0a);
q0a = (__m128)_mm_and_si128(mask, (__m128i)q11);
q0a = _mm_shuffle_ps(q0a, q0a, _MM_SHUFFLE(0, 3, 2, 1));
q8 = _mm_add_ps(q8, q0a);
_mm_store_ps(&out[i], q0);
}
}
#endif

@ -76,3 +76,17 @@ class Neon16FirFilter : public FirFilter<float, float> {
};
#endif // HAVE_NEON
#ifdef __SSE2__
class SseFirFilter : public FirFilter<float, float> {
public:
SseFirFilter(const float *kernel, size_t nk);
~SseFirFilter();
void process(const float *in, float *out, size_t n);
private:
size_t nk;
float *k;
};
#endif // __SSE2__

@ -19,6 +19,7 @@
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>
#include <sys/time.h>
#include <math.h>
@ -101,6 +102,11 @@ void benchfir(int size, int experiment) {
case 4:
f = new HalfRateFirFilter(kernel, size, nblock);
break;
#ifdef __SSE2__
case 5:
f = new SseFirFilter(kernel, size);
break;
#endif
}
@ -127,7 +133,13 @@ void runfirbench() {
"set xlabel 'FIR kernel size'\n"
"set ylabel 'ns per sample'\n"
"plot '-' title 'scalar', '-' title '4x4 block', '-' title 'fixed16', '-' title 'fixed16 mirror', '-' title 'half rate'\n");
for (int experiment = 0; experiment < 5; experiment++) {
for (int experiment = 0; experiment < 6; experiment++) {
#ifndef HAVE_NEON
if (experiment >= 1 && experiment <= 4) continue;
#endif
#ifndef __SSE2__
if (experiment == 5) continue;
#endif
for (int i = 16; i <= 256; i += 16) {
benchfir(i, experiment);
}

Loading…
Cancel
Save