Add NEON biquad implementation

This commit adds the NEON code for the matrix-based biquad filter,
along with some benchmarking infrastructure.
master
Raph Levien 10 years ago
parent 1b45fbeb46
commit 85850e22b1
  1. 7
      android/jni/Android.mk
  2. 130
      cpp/src/neon_iir.s
  3. 289
      cpp/src/test_filter.cc
  4. 136
      cpp/src/test_fir.cc

@ -61,18 +61,19 @@ include $(BUILD_EXECUTABLE)
include $(CLEAR_VARS)
LOCAL_SRC_FILES := test_fir.cc \
LOCAL_SRC_FILES := test_filter.cc \
fir.cc
ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
LOCAL_ARM_NEON := true
LOCAL_CFLAGS := -DHAVE_NEON=1
LOCAL_SRC_FILES += neon_fir.s
LOCAL_SRC_FILES += neon_fir.s \
neon_iir.s
endif
LOCAL_CFLAGS += -O3
LOCAL_STATIC_LIBRARIES += cpufeatures
LOCAL_MODULE := test_fir
LOCAL_MODULE := test_filter
include $(BUILD_EXECUTABLE)

@ -0,0 +1,130 @@
@ Copyright 2014 Google Inc.
@
@ Licensed under the Apache License, Version 2.0 (the "License");
@ you may not use this file except in compliance with the License.
@ You may obtain a copy of the License at
@
@ http://www.apache.org/licenses/LICENSE-2.0
@
@ Unless required by applicable law or agreed to in writing, software
@ distributed under the License is distributed on an "AS IS" BASIS,
@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ See the License for the specific language governing permissions and
@ limitations under the License.
@ NEON assembly implementation of a second-order IIR filter core (suitable
@ for biquad filters)
.text
.align 2
.global neon_iir_2chan
.type neon_iir_2chan, %function
neon_iir_2chan:
@ r0 = pointer to input buffer 1 (aligned)
@ r1 = pointer to input buffer 2 (aligned)
@ r2 = pointer to output buffer 1 (aligned)
@ r3 = pointer to output buffer 2 (aligned)
@ stack[0] = size of buffer in floats (multiple of 4, >= 8)
@ stack[1] = matrices
@ stack[2] = iir state
push {r4-r5}
vpush {q4-q7}
ldr r4, [sp, #(4 * (2 + 16 + 0))]
ldr r5, [sp, #(4 * (2 + 16 + 1))]
@ load matrices
vld1.32 {q8, q9}, [r5:128]!
vld1.32 {q10, q11}, [r5:128]!
vld1.32 {q12, q13}, [r5:128]!
vld1.32 {q14, q15}, [r5:128]!
ldr r5, [sp, #(4 * (2 + 16 + 2))]
@ load IIR state
vld1.32 d1, [r5:64]!
vld1.32 d3, [r5:64]!
sub r4, #4
@ q0, q1 are state vector L, R for first unroll
@ q2, q3 are state vector L, R for second unroll
@ q4-q5 are scratch to compute next state vectors
@ q6 is input
@ q8-q11 is matrix 1
@ q12-q15 is matrix 2
vld1.32 d12, [r0:64]!
vld1.32 d13, [r1:64]!
vmul.f32 q2, q8, d12[0]
vmul.f32 q3, q12, d13[0]
vmul.f32 q4, q9, d12[1]
vmul.f32 q5, q13, d13[1]
vmla.f32 q2, q10, d1[0]
vmla.f32 q3, q14, d3[0]
vld1.32 d12, [r0:64]!
vmla.f32 q4, q11, d1[1]
vld1.32 d13, [r1:64]!
vmla.f32 q5, q15, d3[1]
neon_iir_2chan_1:
@ first unroll
vmul.f32 q0, q8, d12[0]
vmul.f32 q1, q12, d13[0]
vadd.f32 q2, q4
vadd.f32 q3, q5
vmla.f32 q0, q9, d12[1]
vld1.32 d12, [r0:64]!
vmla.f32 q1, q13, d13[1]
vld1.32 d13, [r1:64]!
vmul.f32 q4, q10, d5[0]
vst1.32 d4, [r2:64]!
vmul.f32 q5, q14, d7[0]
vmla.f32 q0, q11, d5[1]
vmla.f32 q1, q15, d7[1]
vst1.32 d6, [r3:64]!
@ second unroll
vmul.f32 q2, q8, d12[0]
vmul.f32 q3, q12, d13[0]
vadd.f32 q0, q4
vadd.f32 q1, q5
vmla.f32 q2, q9, d12[1]
vld1.32 d12, [r0:64]!
vmla.f32 q3, q13, d13[1]
vld1.32 d13, [r1:64]!
vmul.f32 q4, q10, d1[0]
vst1.32 d0, [r2:64]!
vmul.f32 q5, q14, d3[0]
vmla.f32 q2, q11, d1[1]
vmla.f32 q3, q15, d3[1]
vst1.32 d2, [r3:64]!
subs r4, #4
bne neon_iir_2chan_1
vmul.f32 q0, q8, d12[0]
vmul.f32 q1, q12, d13[0]
vadd.f32 q2, q4
vadd.f32 q3, q5
vmla.f32 q0, q9, d12[1]
vmla.f32 q1, q13, d13[1]
vmul.f32 q4, q10, d5[0]
vst1.32 d4, [r2:64]!
vmul.f32 q5, q14, d7[0]
vmla.f32 q0, q11, d5[1]
vmla.f32 q1, q15, d7[1]
vst1.32 d6, [r3:64]!
vadd.f32 q0, q4
vadd.f32 q1, q5
vst1.32 d0, [r2:64]!
vst1.32 d2, [r3:64]!
@ save IIR state
sub r5, #16
vst1.32 d1, [r5:64]!
vst1.32 d3, [r5:64]!
vpop {q4-q7}
pop {r4-r5}
bx lr
.size neon_iir_2chan, .-neon_iir_2chan

@ -0,0 +1,289 @@
/*
* Copyright 2013 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Little test app for measuring FIR speed
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>
#include "aligned_buf.h"
#include "fir.h"
// clock_gettime would be a little better, but whatever
double now() {
struct timeval tp;
gettimeofday(&tp, NULL);
return tp.tv_sec + 1e-6 * tp.tv_usec;
}
void condition_governor() {
// sleep for a bit to avoid thermal throttling
static uint32_t v = 0;
struct timespec ts;
ts.tv_sec = 0;
ts.tv_nsec = 900000000 + (v & 1); // 900ms
//nanosleep(&ts, NULL);
// consume cpu a bit to try to coax max cpufreq
uint32_t x = v;
for (int i = 0; i < 10000000; i++) {
x += 42;
x += (x << 10);
x ^= (x >> 6);
}
// storing it in a static guarantees not optimizing out
v = x;
}
float *mkrandom(size_t size) {
float *result = (float *)malloc_aligned(16, size * sizeof(result[0]));
for (int i = 0; i < size; i++) {
result[i] = random() * (2.0 / RAND_MAX) - 1;
}
return result;
}
double test_accuracy(FirFilter<float, float> *f1, FirFilter<float, float> *f2, const float *inp, int nblock) {
float *out1 = (float *)malloc_aligned(16, nblock * sizeof(out1[0]));
float *out2 = (float *)malloc_aligned(16, nblock * sizeof(out2[0]));
f1->process(inp + 1, out1, nblock);
f2->process(inp + 1, out2, nblock);
double err = 0;
for (int i = 0; i < nblock; i++) {
printf("#%d: %f %f\n", i, out1[i], out2[i]);
err += fabs(out1[i] - out2[i]);
}
free(out1);
free(out2);
return err;
}
void benchfir(int size, int experiment) {
condition_governor();
const int nblock = 64;
float *kernel = mkrandom(size);
float *inp = mkrandom(size + nblock);
float *out = (float *)malloc_aligned(16, nblock * sizeof(out[0]));
FirFilter<float, float> *f;
switch(experiment) {
case 0:
f = new SimpleFirFilter(kernel, size);
break;
#ifdef HAVE_NEON
// this will crash on non-NEON devices, but we're only interested
// in testing NEON for now
case 1:
f = new NeonFirFilter(kernel, size);
break;
case 2:
case 3:
f = new Neon16FirFilter(kernel, size, experiment == 3);
break;
#endif
case 4:
f = new HalfRateFirFilter(kernel, size, nblock);
break;
}
double start = now();
for (int j = 0; j < 15625; j++) {
f->process(inp + 1, out, nblock);
}
double elapsed = now() - start;
printf("%i %f\n", size, 1e3 * elapsed);
FirFilter<float, float> *fbase = new SimpleFirFilter(kernel, size);
double accuracy = test_accuracy(fbase, f, inp, nblock);
printf("#accuracy = %g\n", accuracy);
delete f;
delete fbase;
free(kernel);
free(inp);
free(out);
}
void runfirbench() {
printf("set style data linespoints\n"
"set xlabel 'FIR kernel size'\n"
"set ylabel 'ns per sample'\n"
"plot '-' title 'scalar', '-' title '4x4 block', '-' title 'fixed16', '-' title 'fixed16 mirror', '-' title 'half rate'\n");
for (int experiment = 0; experiment < 5; experiment++) {
for (int i = 16; i <= 256; i += 16) {
benchfir(i, experiment);
}
printf("e\n");
}
}
void scalarbiquad(const float *inp, float *out, size_t n,
float b0, float b1, float b2, float a1, float a2) {
float x1 = 0, x2 = 0, y1 = 0, y2 = 0;
for (size_t i = 0; i < n; i++) {
float x = inp[i];
float y = b0 * x + b1 * x1 + b2 * x2 - a1 * y1 - a2 * y2;
out[i] = y;
x2 = x1;
x1 = x;
y2 = y1;
y1 = y;
}
}
void benchscalarbiquad() {
condition_governor();
const int nbuf = 1 << 10;
float *inp = mkrandom(nbuf);
float *out = (float *)malloc_aligned(16, nbuf * sizeof(out[0]));
double start = now();
const int niter = 10000;
for (int i = 0; i < niter; i++) {
scalarbiquad(inp, out, nbuf, 1.0207, -1.7719, .9376, -1.7719, 0.9583);
}
double elapsed = now() - start;
double ns_per_iir = 1e9 * elapsed / nbuf / niter;
printf("scalar: %f ns/iir\n", ns_per_iir);
free(inp);
free(out);
}
extern "C"
void neon_iir_2chan(const float *in1, const float *in2, float *out1, float *out2,
size_t n, const float *matrices, float *state);
// see "lab/biquadin two.ipynb" for why
void initbiquadmatrix(float *matrix, double b0, double b1, double b2, double a1, double a2) {
double c1 = b1 - a1 * b0;
double c2 = b2 - a2 * b0;
matrix[0] = b0;
matrix[1] = c1;
matrix[2] = -a1 * c1 + c2;
matrix[3] = -a2 * c1;
matrix[4] = 0;
matrix[5] = b0;
matrix[6] = c1;
matrix[7] = c2;
matrix[8] = 1;
matrix[9] = -a1;
matrix[10] = -a2 + a1 * a1;
matrix[11] = a1 * a2;
matrix[12] = 0;
matrix[13] = 1;
matrix[14] = -a1;
matrix[15] = -a2;
}
#ifdef HAVE_NEON
void benchbiquadneon() {
const int nbuf = 1 << 10;
float *inp1 = mkrandom(nbuf);
float *inp2 = mkrandom(nbuf);
float *out1 = (float *)malloc_aligned(16, nbuf * sizeof(out1[0]));
float *out2 = (float *)malloc_aligned(16, nbuf * sizeof(out2[0]));
AlignedBuf<float, 32> matrices;
AlignedBuf<float, 4> state;
for (size_t i = 0; i < 4; i++) {
state.get()[i] = 0;
}
double start = now();
const int niter = 100000;
for (int i = 0; i < niter; i++) {
neon_iir_2chan(inp1, inp2, out1, out2, nbuf, matrices.get(), state.get());
}
double elapsed = now() - start;
double ns_per_iir = 1e9 * 0.5 * elapsed / nbuf / niter;
printf("neon: %f ns/iir\n", ns_per_iir);
free(inp1);
free(inp2);
free(out1);
free(out2);
}
void testbiquadaccuracy() {
const int nbuf = 1 << 10;
float *inp1 = mkrandom(nbuf);
float *inp2 = mkrandom(nbuf);
float *out1 = (float *)malloc_aligned(16, nbuf * sizeof(out1[0]));
float *out2 = (float *)malloc_aligned(16, nbuf * sizeof(out2[0]));
float *out1a = (float *)malloc_aligned(16, nbuf * sizeof(out1[0]));
float *out2a = (float *)malloc_aligned(16, nbuf * sizeof(out2[0]));
AlignedBuf<float, 32> matrices;
AlignedBuf<float, 4> state;
double b0 = 1.0207, b1 = -1.7719, b2 = .9376, a1 = -1.7719, a2 = 0.9583;
for (size_t i = 0; i < 4; i++) {
state.get()[i] = 0;
}
initbiquadmatrix(matrices.get(), b0, b1, b2, a1, a2);
initbiquadmatrix(matrices.get() + 16, b0, b1, b2, a1, a2);
neon_iir_2chan(inp1, inp2, out1, out2, nbuf, matrices.get(), state.get());
scalarbiquad(inp1, out1a, nbuf, b0, b1, b2, a1, a2);
float maxerr = 0;
for (int i = 0; i < nbuf; i++) {
float err = fabs(out1[i] - out1a[i]);
if (err > maxerr) {
maxerr = err;
}
}
printf("neon maxerr = %g\n", maxerr);
free(inp1);
free(inp2);
free(out1);
free(out2);
free(out1a);
free(out2a);
}
#endif
void runbiquad() {
benchscalarbiquad();
#ifdef HAVE_NEON
benchbiquadneon();
testbiquadaccuracy();
#endif
}
int main(int argc, char **argv) {
if (argc == 2) {
if (!strcmp("fir", argv[1])) {
runfirbench();
return 0;
} else if (!strcmp("biquad", argv[1])) {
runbiquad();
return 0;
}
}
printf("usage:\n"
" test_filter fir\n"
" test_filter biquad\n");
return 1;
}

@ -1,136 +0,0 @@
/*
* Copyright 2013 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Little test app for measuring FIR speed
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>
#include "fir.h"
// clock_gettime would be a little better, but whatever
double now() {
struct timeval tp;
gettimeofday(&tp, NULL);
return tp.tv_sec + 1e-6 * tp.tv_usec;
}
void condition_governor() {
// sleep for a bit to avoid thermal throttling
static uint32_t v = 0;
struct timespec ts;
ts.tv_sec = 0;
ts.tv_nsec = 900000000 + (v & 1); // 900ms
//nanosleep(&ts, NULL);
// consume cpu a bit to try to coax max cpufreq
uint32_t x = v;
for (int i = 0; i < 10000000; i++) {
x += 42;
x += (x << 10);
x ^= (x >> 6);
}
// storing it in a static guarantees not optimizing out
v = x;
}
float *mkrandom(size_t size) {
float *result = (float *)malloc_aligned(16, size * sizeof(result[0]));
for (int i = 0; i < size; i++) {
result[i] = random() * (2.0 / RAND_MAX) - 1;
}
return result;
}
double test_accuracy(FirFilter<float, float> *f1, FirFilter<float, float> *f2, const float *inp, int nblock) {
float *out1 = (float *)malloc_aligned(16, nblock * sizeof(out1[0]));
float *out2 = (float *)malloc_aligned(16, nblock * sizeof(out2[0]));
f1->process(inp + 1, out1, nblock);
f2->process(inp + 1, out2, nblock);
double err = 0;
for (int i = 0; i < nblock; i++) {
printf("#%d: %f %f\n", i, out1[i], out2[i]);
err += fabs(out1[i] - out2[i]);
}
free(out1);
free(out2);
return err;
}
void benchfir(int size, int experiment) {
condition_governor();
const int nblock = 64;
float *kernel = mkrandom(size);
float *inp = mkrandom(size + nblock);
float *out = (float *)malloc_aligned(16, nblock * sizeof(out[0]));
FirFilter<float, float> *f;
switch(experiment) {
case 0:
f = new SimpleFirFilter(kernel, size);
break;
#ifdef HAVE_NEON
// this will crash on non-NEON devices, but we're only interested
// in testing NEON for now
case 1:
f = new NeonFirFilter(kernel, size);
break;
case 2:
case 3:
f = new Neon16FirFilter(kernel, size, experiment == 3);
break;
#endif
case 4:
f = new HalfRateFirFilter(kernel, size, nblock);
break;
}
double start = now();
for (int j = 0; j < 15625; j++) {
f->process(inp + 1, out, nblock);
}
double elapsed = now() - start;
printf("%i %f\n", size, 1e3 * elapsed);
FirFilter<float, float> *fbase = new SimpleFirFilter(kernel, size);
double accuracy = test_accuracy(fbase, f, inp, nblock);
printf("#accuracy = %g\n", accuracy);
delete f;
delete fbase;
free(kernel);
free(inp);
free(out);
}
int main(int argc, char **argv) {
printf("set style data linespoints\n"
"set xlabel 'FIR kernel size'\n"
"set ylabel 'ns per sample'\n"
"plot '-' title 'scalar', '-' title '4x4 block', '-' title 'fixed16', '-' title 'fixed16 mirror', '-' title 'half rate'\n");
for (int experiment = 0; experiment < 5; experiment++) {
for (int i = 16; i <= 256; i += 16) {
benchfir(i, experiment);
}
printf("e\n");
}
return 0;
}
Loading…
Cancel
Save