From 287427513ccaeb086fe47586225f67a8c0ff1b5d Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Thu, 12 Dec 2013 00:00:12 -0800 Subject: [PATCH] FIR filter implementation This is a FIR filter with NEON speedup. The NEON version has been tested for accuracy against the scalar one, and there is simple benchmarking code in place as well. --- android/AndroidManifest.xml | 2 +- android/jni/Android.mk | 24 ++++++- cpp/src/fir.cc | 72 +++++++++++++++++++++ cpp/src/fir.h | 48 ++++++++++++++ cpp/src/neon_fir.s | 97 ++++++++++++++++++++++++++++ cpp/src/test_fir.cc | 125 ++++++++++++++++++++++++++++++++++++ 6 files changed, 365 insertions(+), 3 deletions(-) create mode 100644 cpp/src/fir.cc create mode 100644 cpp/src/fir.h create mode 100644 cpp/src/neon_fir.s create mode 100644 cpp/src/test_fir.cc diff --git a/android/AndroidManifest.xml b/android/AndroidManifest.xml index ad906e8..3099d62 100644 --- a/android/AndroidManifest.xml +++ b/android/AndroidManifest.xml @@ -2,7 +2,7 @@ // for debugging, remove +#include +#include + +#include "fir.h" + +// Should probably ifdef this to make it more portable +void *malloc_aligned(size_t alignment, size_t nbytes) { + return memalign(alignment, nbytes); +} + +SimpleFirFilter::SimpleFirFilter(const float *kernel, size_t nk) : nk(nk) { + k = (float *)malloc(nk * sizeof(k[0])); + for (size_t i = 0; i < nk; i++) { + k[i] = kernel[nk - i - 1]; + } +} + +SimpleFirFilter::~SimpleFirFilter() { + free(k); +} + +void SimpleFirFilter::process(const float *in, float *out, size_t n) { + for (size_t i = 0; i < n; i++) { + float y = 0; + for (size_t j = 0; j < nk; j++) { + y += k[j] * in[i + j]; + } + out[i] = y; + } +} + +NeonFirFilter::NeonFirFilter(const float *kernel, size_t nk) : nk(nk) { + // TODO: handle odd size nk (must be multiple of 4) + k = (float *)malloc_aligned(16, nk * sizeof(k[0])); + for (size_t i = 0; i < nk; i += 4) { + for (size_t j = 0; j < 4; j++) { + k[i + j] = kernel[nk - i - 4 + j]; + } + } +} + +NeonFirFilter::~NeonFirFilter() { + free(k); +} + +#ifdef HAVE_NEON +extern "C" +void neon_fir_direct(const float *in, const float *k, float *out, size_t n, size_t nk); +#endif + +void NeonFirFilter::process(const float *in, float *out, size_t n) { + neon_fir_direct(in - 1, k, out, n, nk); +} diff --git a/cpp/src/fir.h b/cpp/src/fir.h new file mode 100644 index 0000000..2dc3d7e --- /dev/null +++ b/cpp/src/fir.h @@ -0,0 +1,48 @@ +/* + * Copyright 2013 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// move to generic utility file? +void *malloc_aligned(size_t alignment, size_t nbytes); + +// Abstract class +class FirFilter { + public: + // preconditions: + // in + (nk - 1) is aligned to 128 bits + // out is aligned to 128 bits + virtual void process(const float *in, float *out, size_t n) = 0; +}; + +class SimpleFirFilter : public FirFilter { + public: + SimpleFirFilter(const float *kernel, size_t nk); + ~SimpleFirFilter(); + void process(const float *in, float *out, size_t n); + private: + size_t nk; + float *k; +}; + +// should make conditional compilation? +class NeonFirFilter : public FirFilter { + public: + NeonFirFilter(const float *kernel, size_t nk); + ~NeonFirFilter(); + void process(const float *in, float *out, size_t n); + private: + size_t nk; + float *k; +}; diff --git a/cpp/src/neon_fir.s b/cpp/src/neon_fir.s new file mode 100644 index 0000000..b7d3131 --- /dev/null +++ b/cpp/src/neon_fir.s @@ -0,0 +1,97 @@ +@ Copyright 2013 Google Inc. +@ +@ Licensed under the Apache License, Version 2.0 (the "License"); +@ you may not use this file except in compliance with the License. +@ You may obtain a copy of the License at +@ +@ http://www.apache.org/licenses/LICENSE-2.0 +@ +@ Unless required by applicable law or agreed to in writing, software +@ distributed under the License is distributed on an "AS IS" BASIS, +@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ See the License for the specific language governing permissions and +@ limitations under the License. + +@ NEON assembly implementation of FIR filter core + + .text + + .align 2 + .global neon_fir_direct + .type neon_fir_direct, %function +neon_fir_direct: +@ r0 = pointer to input (aligned) +@ r1 = pointer to kernel (aligned) +@ r2 = pointer to output (aligned) +@ r3 = size of input in floats (multiple of 4) +@ stack[0] = size of kernel in floats (multiple of 4) + + push {r4-r7} + ldr r4, [sp, #(4 * 4)] + + lsl r6, r4, #2 + sub r6, #16 + + @ compute initial overlap + mov r7, r4 + vmov.i32 q9, #0 + vmov.i32 q10, #0 + vmov.i32 q11, #0 +neon_fir_direct1: + vld1.i32 {q0}, [r0:128]! @ load 4 samples from input + vld1.i32 {q1}, [r1:128]! @ load 4 samples from kernel + vmla.f32 q9, q1, d0[1] + vmla.f32 q10, q1, d1[0] + vmla.f32 q11, q1, d1[1] + subs r7, #4 + bne neon_fir_direct1 + vmov.i32 q12, #0 + vext.32 q13, q9, q12, #3 + vext.32 q1, q10, q12, #2 + vadd.f32 q13, q1 + vext.32 q1, q11, q12, #1 + vadd.f32 q8, q13, q1 + sub r0, r6 + sub r1, r4, lsl #2 + +neon_fir_direct2: + + mov r7, r4 + vmov.i32 q9, #0 + vmov.i32 q10, #0 + vmov.i32 q11, #0 + @ inner loop +neon_fir_direct3: + vld1.32 {q0}, [r0:128]! @ load 4 samples from input + vld1.i32 {q1}, [r1:128]! @ load 4 samples from kernel + vmla.f32 q9, q1, d0[1] + vmla.f32 q8, q1, d0[0] + vmla.f32 q10, q1, d1[0] + vmla.f32 q11, q1, d1[1] + subs r7, #4 + bne neon_fir_direct3 + + @ process overlaps + vext.32 q0, q12, q9, #3 + vext.32 q13, q9, q12, #3 + vadd.f32 q8, q0 + vext.32 q0, q12, q10, #2 + vext.32 q1, q10, q12, #2 + vadd.f32 q8, q0 + vadd.f32 q13, q1 + vext.32 q0, q12, q11, #1 + vext.32 q1, q11, q12, #1 + vadd.f32 q0, q8 + vadd.f32 q8, q13, q1 + + sub r0, r6 + sub r1, r4, lsl #2 + vst1.32 {q0}, [r2:128]! + + subs r3, #4 + bne neon_fir_direct2 + + pop {r4-r7} + bx lr + + .size neon_fir_direct, .-neon_fir_direct diff --git a/cpp/src/test_fir.cc b/cpp/src/test_fir.cc new file mode 100644 index 0000000..d26638a --- /dev/null +++ b/cpp/src/test_fir.cc @@ -0,0 +1,125 @@ +/* + * Copyright 2013 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Little test app for measuring FIR speed + +#include +#include +#include +#include +#include + +#include "fir.h" + +// clock_gettime would be a little better, but whatever +double now() { + struct timeval tp; + gettimeofday(&tp, NULL); + return tp.tv_sec + 1e-6 * tp.tv_usec; +} + +void condition_governor() { + // sleep for a bit to avoid thermal throttling + static uint32_t v = 0; + struct timespec ts; + ts.tv_sec = 0; + ts.tv_nsec = 900000000 + (v & 1); // 900ms + nanosleep(&ts, NULL); + + // consume cpu a bit to try to coax max cpufreq + uint32_t x = v; + for (int i = 0; i < 10000000; i++) { + x += 42; + x += (x << 10); + x ^= (x >> 6); + } + // storing it in a static guarantees not optimizing out + v = x; +} + +float *mkrandom(size_t size) { + float *result = (float *)malloc_aligned(16, size * sizeof(result[0])); + for (int i = 0; i < size; i++) { + result[i] = random() * (2.0 / RAND_MAX) - 1; + } + return result; +} + +double test_accuracy(FirFilter *f1, FirFilter *f2, const float *inp, int nblock) { + float *out1 = (float *)malloc_aligned(16, nblock * sizeof(out1[0])); + float *out2 = (float *)malloc_aligned(16, nblock * sizeof(out2[0])); + f1->process(inp + 1, out1, nblock); + f2->process(inp + 1, out2, nblock); + double err = 0; + for (int i = 0; i < nblock; i++) { + //printf("%d: %f %f\n", i, out1[i], out2[i]); + err += fabs(out1[i] - out2[i]); + } + free(out1); + free(out2); + return err; +} + +void benchfir(int size, int experiment) { + condition_governor(); + + const int nblock = 64; + float *kernel = mkrandom(size); + float *inp = mkrandom(size + nblock); + float *out = (float *)malloc_aligned(16, nblock * sizeof(out[0])); + FirFilter *f; + + switch(experiment) { + case 0: + f = new SimpleFirFilter(kernel, size); + break; + case 1: + f = new NeonFirFilter(kernel, size); + break; + } + + + double start = now(); + for (int j = 0; j < 15625; j++) { + f->process(inp + 1, out, 64); + } + double elapsed = now() - start; + printf("%i %f\n", size, 1e3 * elapsed); + + FirFilter *fbase = new SimpleFirFilter(kernel, size); + double accuracy = test_accuracy(fbase, f, inp, nblock); + //printf("accuracy = %g\n", accuracy); + + delete f; + delete fbase; + free(kernel); + free(inp); + free(out); +} + +int main(int argc, char **argv) { + printf("set style data linespoints\n" + "set xlabel 'FIR kernel size'\n" + "set ylabel 'ns per sample'\n" + "plot '-' title 'scalar', '-' title '4x4 block'\n"); + for (int experiment = 0; experiment < 2; experiment++) { + for (int i = 16; i <= 128; i += 16) { + benchfir(i, experiment); + } + printf("e\n"); + } + return 0; +}