From 85850e22b103aed14dde224aee5930245a981f26 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 14 Jan 2014 22:13:21 -0800 Subject: [PATCH] Add NEON biquad implementation This commit adds the NEON code for the matrix-based biquad filter, along with some benchmarking infrastructure. --- android/jni/Android.mk | 7 +- cpp/src/neon_iir.s | 130 ++++++++++++++++++ cpp/src/test_filter.cc | 289 +++++++++++++++++++++++++++++++++++++++++ cpp/src/test_fir.cc | 136 ------------------- 4 files changed, 423 insertions(+), 139 deletions(-) create mode 100644 cpp/src/neon_iir.s create mode 100644 cpp/src/test_filter.cc delete mode 100644 cpp/src/test_fir.cc diff --git a/android/jni/Android.mk b/android/jni/Android.mk index a111be7..1e692c0 100644 --- a/android/jni/Android.mk +++ b/android/jni/Android.mk @@ -61,18 +61,19 @@ include $(BUILD_EXECUTABLE) include $(CLEAR_VARS) -LOCAL_SRC_FILES := test_fir.cc \ +LOCAL_SRC_FILES := test_filter.cc \ fir.cc ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) LOCAL_ARM_NEON := true LOCAL_CFLAGS := -DHAVE_NEON=1 - LOCAL_SRC_FILES += neon_fir.s + LOCAL_SRC_FILES += neon_fir.s \ + neon_iir.s endif LOCAL_CFLAGS += -O3 LOCAL_STATIC_LIBRARIES += cpufeatures -LOCAL_MODULE := test_fir +LOCAL_MODULE := test_filter include $(BUILD_EXECUTABLE) diff --git a/cpp/src/neon_iir.s b/cpp/src/neon_iir.s new file mode 100644 index 0000000..ce61672 --- /dev/null +++ b/cpp/src/neon_iir.s @@ -0,0 +1,130 @@ +@ Copyright 2014 Google Inc. +@ +@ Licensed under the Apache License, Version 2.0 (the "License"); +@ you may not use this file except in compliance with the License. +@ You may obtain a copy of the License at +@ +@ http://www.apache.org/licenses/LICENSE-2.0 +@ +@ Unless required by applicable law or agreed to in writing, software +@ distributed under the License is distributed on an "AS IS" BASIS, +@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ See the License for the specific language governing permissions and +@ limitations under the License. + +@ NEON assembly implementation of a second-order IIR filter core (suitable +@ for biquad filters) + + .text + + .align 2 + .global neon_iir_2chan + .type neon_iir_2chan, %function +neon_iir_2chan: +@ r0 = pointer to input buffer 1 (aligned) +@ r1 = pointer to input buffer 2 (aligned) +@ r2 = pointer to output buffer 1 (aligned) +@ r3 = pointer to output buffer 2 (aligned) +@ stack[0] = size of buffer in floats (multiple of 4, >= 8) +@ stack[1] = matrices +@ stack[2] = iir state + + push {r4-r5} + vpush {q4-q7} + ldr r4, [sp, #(4 * (2 + 16 + 0))] + ldr r5, [sp, #(4 * (2 + 16 + 1))] + + @ load matrices + vld1.32 {q8, q9}, [r5:128]! + vld1.32 {q10, q11}, [r5:128]! + vld1.32 {q12, q13}, [r5:128]! + vld1.32 {q14, q15}, [r5:128]! + + ldr r5, [sp, #(4 * (2 + 16 + 2))] + @ load IIR state + vld1.32 d1, [r5:64]! + vld1.32 d3, [r5:64]! + + sub r4, #4 +@ q0, q1 are state vector L, R for first unroll +@ q2, q3 are state vector L, R for second unroll +@ q4-q5 are scratch to compute next state vectors +@ q6 is input +@ q8-q11 is matrix 1 +@ q12-q15 is matrix 2 + + vld1.32 d12, [r0:64]! + vld1.32 d13, [r1:64]! + vmul.f32 q2, q8, d12[0] + vmul.f32 q3, q12, d13[0] + vmul.f32 q4, q9, d12[1] + vmul.f32 q5, q13, d13[1] + vmla.f32 q2, q10, d1[0] + vmla.f32 q3, q14, d3[0] + vld1.32 d12, [r0:64]! + vmla.f32 q4, q11, d1[1] + vld1.32 d13, [r1:64]! + vmla.f32 q5, q15, d3[1] + +neon_iir_2chan_1: + @ first unroll + vmul.f32 q0, q8, d12[0] + vmul.f32 q1, q12, d13[0] + vadd.f32 q2, q4 + vadd.f32 q3, q5 + vmla.f32 q0, q9, d12[1] + vld1.32 d12, [r0:64]! + vmla.f32 q1, q13, d13[1] + vld1.32 d13, [r1:64]! + vmul.f32 q4, q10, d5[0] + vst1.32 d4, [r2:64]! + vmul.f32 q5, q14, d7[0] + vmla.f32 q0, q11, d5[1] + vmla.f32 q1, q15, d7[1] + vst1.32 d6, [r3:64]! + + @ second unroll + vmul.f32 q2, q8, d12[0] + vmul.f32 q3, q12, d13[0] + vadd.f32 q0, q4 + vadd.f32 q1, q5 + vmla.f32 q2, q9, d12[1] + vld1.32 d12, [r0:64]! + vmla.f32 q3, q13, d13[1] + vld1.32 d13, [r1:64]! + vmul.f32 q4, q10, d1[0] + vst1.32 d0, [r2:64]! + vmul.f32 q5, q14, d3[0] + vmla.f32 q2, q11, d1[1] + vmla.f32 q3, q15, d3[1] + vst1.32 d2, [r3:64]! + subs r4, #4 + bne neon_iir_2chan_1 + + vmul.f32 q0, q8, d12[0] + vmul.f32 q1, q12, d13[0] + vadd.f32 q2, q4 + vadd.f32 q3, q5 + vmla.f32 q0, q9, d12[1] + vmla.f32 q1, q13, d13[1] + vmul.f32 q4, q10, d5[0] + vst1.32 d4, [r2:64]! + vmul.f32 q5, q14, d7[0] + vmla.f32 q0, q11, d5[1] + vmla.f32 q1, q15, d7[1] + vst1.32 d6, [r3:64]! + + vadd.f32 q0, q4 + vadd.f32 q1, q5 + vst1.32 d0, [r2:64]! + vst1.32 d2, [r3:64]! + + @ save IIR state + sub r5, #16 + vst1.32 d1, [r5:64]! + vst1.32 d3, [r5:64]! + + vpop {q4-q7} + pop {r4-r5} + bx lr + .size neon_iir_2chan, .-neon_iir_2chan diff --git a/cpp/src/test_filter.cc b/cpp/src/test_filter.cc new file mode 100644 index 0000000..1f9d8c7 --- /dev/null +++ b/cpp/src/test_filter.cc @@ -0,0 +1,289 @@ +/* + * Copyright 2013 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Little test app for measuring FIR speed + +#include +#include +#include +#include +#include + +#include "aligned_buf.h" +#include "fir.h" + +// clock_gettime would be a little better, but whatever +double now() { + struct timeval tp; + gettimeofday(&tp, NULL); + return tp.tv_sec + 1e-6 * tp.tv_usec; +} + +void condition_governor() { + // sleep for a bit to avoid thermal throttling + static uint32_t v = 0; + struct timespec ts; + ts.tv_sec = 0; + ts.tv_nsec = 900000000 + (v & 1); // 900ms + //nanosleep(&ts, NULL); + + // consume cpu a bit to try to coax max cpufreq + uint32_t x = v; + for (int i = 0; i < 10000000; i++) { + x += 42; + x += (x << 10); + x ^= (x >> 6); + } + // storing it in a static guarantees not optimizing out + v = x; +} + +float *mkrandom(size_t size) { + float *result = (float *)malloc_aligned(16, size * sizeof(result[0])); + for (int i = 0; i < size; i++) { + result[i] = random() * (2.0 / RAND_MAX) - 1; + } + return result; +} + +double test_accuracy(FirFilter *f1, FirFilter *f2, const float *inp, int nblock) { + float *out1 = (float *)malloc_aligned(16, nblock * sizeof(out1[0])); + float *out2 = (float *)malloc_aligned(16, nblock * sizeof(out2[0])); + f1->process(inp + 1, out1, nblock); + f2->process(inp + 1, out2, nblock); + double err = 0; + for (int i = 0; i < nblock; i++) { + printf("#%d: %f %f\n", i, out1[i], out2[i]); + err += fabs(out1[i] - out2[i]); + } + free(out1); + free(out2); + return err; +} + +void benchfir(int size, int experiment) { + condition_governor(); + + const int nblock = 64; + float *kernel = mkrandom(size); + float *inp = mkrandom(size + nblock); + float *out = (float *)malloc_aligned(16, nblock * sizeof(out[0])); + FirFilter *f; + + switch(experiment) { + case 0: + f = new SimpleFirFilter(kernel, size); + break; +#ifdef HAVE_NEON + // this will crash on non-NEON devices, but we're only interested + // in testing NEON for now + case 1: + f = new NeonFirFilter(kernel, size); + break; + case 2: + case 3: + f = new Neon16FirFilter(kernel, size, experiment == 3); + break; +#endif + case 4: + f = new HalfRateFirFilter(kernel, size, nblock); + break; + } + + + double start = now(); + for (int j = 0; j < 15625; j++) { + f->process(inp + 1, out, nblock); + } + double elapsed = now() - start; + printf("%i %f\n", size, 1e3 * elapsed); + + FirFilter *fbase = new SimpleFirFilter(kernel, size); + double accuracy = test_accuracy(fbase, f, inp, nblock); + printf("#accuracy = %g\n", accuracy); + + delete f; + delete fbase; + free(kernel); + free(inp); + free(out); +} + +void runfirbench() { + printf("set style data linespoints\n" + "set xlabel 'FIR kernel size'\n" + "set ylabel 'ns per sample'\n" + "plot '-' title 'scalar', '-' title '4x4 block', '-' title 'fixed16', '-' title 'fixed16 mirror', '-' title 'half rate'\n"); + for (int experiment = 0; experiment < 5; experiment++) { + for (int i = 16; i <= 256; i += 16) { + benchfir(i, experiment); + } + printf("e\n"); + } +} + +void scalarbiquad(const float *inp, float *out, size_t n, + float b0, float b1, float b2, float a1, float a2) { + float x1 = 0, x2 = 0, y1 = 0, y2 = 0; + for (size_t i = 0; i < n; i++) { + float x = inp[i]; + float y = b0 * x + b1 * x1 + b2 * x2 - a1 * y1 - a2 * y2; + out[i] = y; + x2 = x1; + x1 = x; + y2 = y1; + y1 = y; + } +} + +void benchscalarbiquad() { + condition_governor(); + const int nbuf = 1 << 10; + float *inp = mkrandom(nbuf); + float *out = (float *)malloc_aligned(16, nbuf * sizeof(out[0])); + + double start = now(); + const int niter = 10000; + for (int i = 0; i < niter; i++) { + scalarbiquad(inp, out, nbuf, 1.0207, -1.7719, .9376, -1.7719, 0.9583); + } + double elapsed = now() - start; + double ns_per_iir = 1e9 * elapsed / nbuf / niter; + printf("scalar: %f ns/iir\n", ns_per_iir); + + free(inp); + free(out); +} + +extern "C" +void neon_iir_2chan(const float *in1, const float *in2, float *out1, float *out2, + size_t n, const float *matrices, float *state); + +// see "lab/biquadin two.ipynb" for why +void initbiquadmatrix(float *matrix, double b0, double b1, double b2, double a1, double a2) { + double c1 = b1 - a1 * b0; + double c2 = b2 - a2 * b0; + matrix[0] = b0; + matrix[1] = c1; + matrix[2] = -a1 * c1 + c2; + matrix[3] = -a2 * c1; + matrix[4] = 0; + matrix[5] = b0; + matrix[6] = c1; + matrix[7] = c2; + matrix[8] = 1; + matrix[9] = -a1; + matrix[10] = -a2 + a1 * a1; + matrix[11] = a1 * a2; + matrix[12] = 0; + matrix[13] = 1; + matrix[14] = -a1; + matrix[15] = -a2; +} + +#ifdef HAVE_NEON +void benchbiquadneon() { + const int nbuf = 1 << 10; + float *inp1 = mkrandom(nbuf); + float *inp2 = mkrandom(nbuf); + float *out1 = (float *)malloc_aligned(16, nbuf * sizeof(out1[0])); + float *out2 = (float *)malloc_aligned(16, nbuf * sizeof(out2[0])); + AlignedBuf matrices; + AlignedBuf state; + + for (size_t i = 0; i < 4; i++) { + state.get()[i] = 0; + } + + double start = now(); + const int niter = 100000; + for (int i = 0; i < niter; i++) { + neon_iir_2chan(inp1, inp2, out1, out2, nbuf, matrices.get(), state.get()); + } + + double elapsed = now() - start; + double ns_per_iir = 1e9 * 0.5 * elapsed / nbuf / niter; + printf("neon: %f ns/iir\n", ns_per_iir); + free(inp1); + free(inp2); + free(out1); + free(out2); +} + +void testbiquadaccuracy() { + const int nbuf = 1 << 10; + float *inp1 = mkrandom(nbuf); + float *inp2 = mkrandom(nbuf); + float *out1 = (float *)malloc_aligned(16, nbuf * sizeof(out1[0])); + float *out2 = (float *)malloc_aligned(16, nbuf * sizeof(out2[0])); + float *out1a = (float *)malloc_aligned(16, nbuf * sizeof(out1[0])); + float *out2a = (float *)malloc_aligned(16, nbuf * sizeof(out2[0])); + AlignedBuf matrices; + AlignedBuf state; + double b0 = 1.0207, b1 = -1.7719, b2 = .9376, a1 = -1.7719, a2 = 0.9583; + + for (size_t i = 0; i < 4; i++) { + state.get()[i] = 0; + } + + initbiquadmatrix(matrices.get(), b0, b1, b2, a1, a2); + initbiquadmatrix(matrices.get() + 16, b0, b1, b2, a1, a2); + + neon_iir_2chan(inp1, inp2, out1, out2, nbuf, matrices.get(), state.get()); + + scalarbiquad(inp1, out1a, nbuf, b0, b1, b2, a1, a2); + + float maxerr = 0; + for (int i = 0; i < nbuf; i++) { + float err = fabs(out1[i] - out1a[i]); + if (err > maxerr) { + maxerr = err; + } + } + printf("neon maxerr = %g\n", maxerr); + free(inp1); + free(inp2); + free(out1); + free(out2); + free(out1a); + free(out2a); +} + +#endif + +void runbiquad() { + benchscalarbiquad(); +#ifdef HAVE_NEON + benchbiquadneon(); + testbiquadaccuracy(); +#endif +} + +int main(int argc, char **argv) { + if (argc == 2) { + if (!strcmp("fir", argv[1])) { + runfirbench(); + return 0; + } else if (!strcmp("biquad", argv[1])) { + runbiquad(); + return 0; + } + } + printf("usage:\n" + " test_filter fir\n" + " test_filter biquad\n"); + return 1; +} diff --git a/cpp/src/test_fir.cc b/cpp/src/test_fir.cc deleted file mode 100644 index 6c055c6..0000000 --- a/cpp/src/test_fir.cc +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright 2013 Google Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Little test app for measuring FIR speed - -#include -#include -#include -#include -#include - -#include "fir.h" - -// clock_gettime would be a little better, but whatever -double now() { - struct timeval tp; - gettimeofday(&tp, NULL); - return tp.tv_sec + 1e-6 * tp.tv_usec; -} - -void condition_governor() { - // sleep for a bit to avoid thermal throttling - static uint32_t v = 0; - struct timespec ts; - ts.tv_sec = 0; - ts.tv_nsec = 900000000 + (v & 1); // 900ms - //nanosleep(&ts, NULL); - - // consume cpu a bit to try to coax max cpufreq - uint32_t x = v; - for (int i = 0; i < 10000000; i++) { - x += 42; - x += (x << 10); - x ^= (x >> 6); - } - // storing it in a static guarantees not optimizing out - v = x; -} - -float *mkrandom(size_t size) { - float *result = (float *)malloc_aligned(16, size * sizeof(result[0])); - for (int i = 0; i < size; i++) { - result[i] = random() * (2.0 / RAND_MAX) - 1; - } - return result; -} - -double test_accuracy(FirFilter *f1, FirFilter *f2, const float *inp, int nblock) { - float *out1 = (float *)malloc_aligned(16, nblock * sizeof(out1[0])); - float *out2 = (float *)malloc_aligned(16, nblock * sizeof(out2[0])); - f1->process(inp + 1, out1, nblock); - f2->process(inp + 1, out2, nblock); - double err = 0; - for (int i = 0; i < nblock; i++) { - printf("#%d: %f %f\n", i, out1[i], out2[i]); - err += fabs(out1[i] - out2[i]); - } - free(out1); - free(out2); - return err; -} - -void benchfir(int size, int experiment) { - condition_governor(); - - const int nblock = 64; - float *kernel = mkrandom(size); - float *inp = mkrandom(size + nblock); - float *out = (float *)malloc_aligned(16, nblock * sizeof(out[0])); - FirFilter *f; - - switch(experiment) { - case 0: - f = new SimpleFirFilter(kernel, size); - break; -#ifdef HAVE_NEON - // this will crash on non-NEON devices, but we're only interested - // in testing NEON for now - case 1: - f = new NeonFirFilter(kernel, size); - break; - case 2: - case 3: - f = new Neon16FirFilter(kernel, size, experiment == 3); - break; -#endif - case 4: - f = new HalfRateFirFilter(kernel, size, nblock); - break; - } - - - double start = now(); - for (int j = 0; j < 15625; j++) { - f->process(inp + 1, out, nblock); - } - double elapsed = now() - start; - printf("%i %f\n", size, 1e3 * elapsed); - - FirFilter *fbase = new SimpleFirFilter(kernel, size); - double accuracy = test_accuracy(fbase, f, inp, nblock); - printf("#accuracy = %g\n", accuracy); - - delete f; - delete fbase; - free(kernel); - free(inp); - free(out); -} - -int main(int argc, char **argv) { - printf("set style data linespoints\n" - "set xlabel 'FIR kernel size'\n" - "set ylabel 'ns per sample'\n" - "plot '-' title 'scalar', '-' title '4x4 block', '-' title 'fixed16', '-' title 'fixed16 mirror', '-' title 'half rate'\n"); - for (int experiment = 0; experiment < 5; experiment++) { - for (int i = 16; i <= 256; i += 16) { - benchfir(i, experiment); - } - printf("e\n"); - } - return 0; -}