This commit adds the NEON code for the matrix-based biquad filter, along with some benchmarking infrastructure.master
parent
1b45fbeb46
commit
85850e22b1
@ -0,0 +1,130 @@ |
||||
@ Copyright 2014 Google Inc.
|
||||
@
|
||||
@ Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ you may not use this file except in compliance with the License.
|
||||
@ You may obtain a copy of the License at
|
||||
@
|
||||
@ http://www.apache.org/licenses/LICENSE-2.0
|
||||
@
|
||||
@ Unless required by applicable law or agreed to in writing, software
|
||||
@ distributed under the License is distributed on an "AS IS" BASIS,
|
||||
@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
@ See the License for the specific language governing permissions and
|
||||
@ limitations under the License.
|
||||
|
||||
@ NEON assembly implementation of a second-order IIR filter core (suitable
|
||||
@ for biquad filters)
|
||||
|
||||
.text |
||||
|
||||
.align 2
|
||||
.global neon_iir_2chan
|
||||
.type neon_iir_2chan, %function |
||||
neon_iir_2chan: |
||||
@ r0 = pointer to input buffer 1 (aligned)
|
||||
@ r1 = pointer to input buffer 2 (aligned)
|
||||
@ r2 = pointer to output buffer 1 (aligned)
|
||||
@ r3 = pointer to output buffer 2 (aligned)
|
||||
@ stack[0] = size of buffer in floats (multiple of 4, >= 8)
|
||||
@ stack[1] = matrices
|
||||
@ stack[2] = iir state
|
||||
|
||||
push {r4-r5} |
||||
vpush {q4-q7} |
||||
ldr r4, [sp, #(4 * (2 + 16 + 0))] |
||||
ldr r5, [sp, #(4 * (2 + 16 + 1))] |
||||
|
||||
@ load matrices
|
||||
vld1.32 {q8, q9}, [r5:128]! |
||||
vld1.32 {q10, q11}, [r5:128]! |
||||
vld1.32 {q12, q13}, [r5:128]! |
||||
vld1.32 {q14, q15}, [r5:128]! |
||||
|
||||
ldr r5, [sp, #(4 * (2 + 16 + 2))] |
||||
@ load IIR state
|
||||
vld1.32 d1, [r5:64]! |
||||
vld1.32 d3, [r5:64]! |
||||
|
||||
sub r4, #4 |
||||
@ q0, q1 are state vector L, R for first unroll
|
||||
@ q2, q3 are state vector L, R for second unroll
|
||||
@ q4-q5 are scratch to compute next state vectors
|
||||
@ q6 is input
|
||||
@ q8-q11 is matrix 1
|
||||
@ q12-q15 is matrix 2
|
||||
|
||||
vld1.32 d12, [r0:64]! |
||||
vld1.32 d13, [r1:64]! |
||||
vmul.f32 q2, q8, d12[0] |
||||
vmul.f32 q3, q12, d13[0] |
||||
vmul.f32 q4, q9, d12[1] |
||||
vmul.f32 q5, q13, d13[1] |
||||
vmla.f32 q2, q10, d1[0] |
||||
vmla.f32 q3, q14, d3[0] |
||||
vld1.32 d12, [r0:64]! |
||||
vmla.f32 q4, q11, d1[1] |
||||
vld1.32 d13, [r1:64]! |
||||
vmla.f32 q5, q15, d3[1] |
||||
|
||||
neon_iir_2chan_1: |
||||
@ first unroll
|
||||
vmul.f32 q0, q8, d12[0] |
||||
vmul.f32 q1, q12, d13[0] |
||||
vadd.f32 q2, q4 |
||||
vadd.f32 q3, q5 |
||||
vmla.f32 q0, q9, d12[1] |
||||
vld1.32 d12, [r0:64]! |
||||
vmla.f32 q1, q13, d13[1] |
||||
vld1.32 d13, [r1:64]! |
||||
vmul.f32 q4, q10, d5[0] |
||||
vst1.32 d4, [r2:64]! |
||||
vmul.f32 q5, q14, d7[0] |
||||
vmla.f32 q0, q11, d5[1] |
||||
vmla.f32 q1, q15, d7[1] |
||||
vst1.32 d6, [r3:64]! |
||||
|
||||
@ second unroll
|
||||
vmul.f32 q2, q8, d12[0] |
||||
vmul.f32 q3, q12, d13[0] |
||||
vadd.f32 q0, q4 |
||||
vadd.f32 q1, q5 |
||||
vmla.f32 q2, q9, d12[1] |
||||
vld1.32 d12, [r0:64]! |
||||
vmla.f32 q3, q13, d13[1] |
||||
vld1.32 d13, [r1:64]! |
||||
vmul.f32 q4, q10, d1[0] |
||||
vst1.32 d0, [r2:64]! |
||||
vmul.f32 q5, q14, d3[0] |
||||
vmla.f32 q2, q11, d1[1] |
||||
vmla.f32 q3, q15, d3[1] |
||||
vst1.32 d2, [r3:64]! |
||||
subs r4, #4 |
||||
bne neon_iir_2chan_1 |
||||
|
||||
vmul.f32 q0, q8, d12[0] |
||||
vmul.f32 q1, q12, d13[0] |
||||
vadd.f32 q2, q4 |
||||
vadd.f32 q3, q5 |
||||
vmla.f32 q0, q9, d12[1] |
||||
vmla.f32 q1, q13, d13[1] |
||||
vmul.f32 q4, q10, d5[0] |
||||
vst1.32 d4, [r2:64]! |
||||
vmul.f32 q5, q14, d7[0] |
||||
vmla.f32 q0, q11, d5[1] |
||||
vmla.f32 q1, q15, d7[1] |
||||
vst1.32 d6, [r3:64]! |
||||
|
||||
vadd.f32 q0, q4 |
||||
vadd.f32 q1, q5 |
||||
vst1.32 d0, [r2:64]! |
||||
vst1.32 d2, [r3:64]! |
||||
|
||||
@ save IIR state
|
||||
sub r5, #16 |
||||
vst1.32 d1, [r5:64]! |
||||
vst1.32 d3, [r5:64]! |
||||
|
||||
vpop {q4-q7} |
||||
pop {r4-r5} |
||||
bx lr |
||||
.size neon_iir_2chan, .-neon_iir_2chan |
@ -0,0 +1,289 @@ |
||||
/*
|
||||
* Copyright 2013 Google Inc. |
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); |
||||
* you may not use this file except in compliance with the License. |
||||
* You may obtain a copy of the License at |
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
// Little test app for measuring FIR speed
|
||||
|
||||
#include <stdlib.h> |
||||
#include <stdio.h> |
||||
#include <time.h> |
||||
#include <sys/time.h> |
||||
#include <math.h> |
||||
|
||||
#include "aligned_buf.h" |
||||
#include "fir.h" |
||||
|
||||
// clock_gettime would be a little better, but whatever
|
||||
double now() { |
||||
struct timeval tp; |
||||
gettimeofday(&tp, NULL); |
||||
return tp.tv_sec + 1e-6 * tp.tv_usec; |
||||
} |
||||
|
||||
void condition_governor() { |
||||
// sleep for a bit to avoid thermal throttling
|
||||
static uint32_t v = 0; |
||||
struct timespec ts; |
||||
ts.tv_sec = 0; |
||||
ts.tv_nsec = 900000000 + (v & 1); // 900ms
|
||||
//nanosleep(&ts, NULL);
|
||||
|
||||
// consume cpu a bit to try to coax max cpufreq
|
||||
uint32_t x = v; |
||||
for (int i = 0; i < 10000000; i++) { |
||||
x += 42; |
||||
x += (x << 10); |
||||
x ^= (x >> 6); |
||||
} |
||||
// storing it in a static guarantees not optimizing out
|
||||
v = x; |
||||
} |
||||
|
||||
float *mkrandom(size_t size) { |
||||
float *result = (float *)malloc_aligned(16, size * sizeof(result[0])); |
||||
for (int i = 0; i < size; i++) { |
||||
result[i] = random() * (2.0 / RAND_MAX) - 1; |
||||
} |
||||
return result; |
||||
} |
||||
|
||||
double test_accuracy(FirFilter<float, float> *f1, FirFilter<float, float> *f2, const float *inp, int nblock) { |
||||
float *out1 = (float *)malloc_aligned(16, nblock * sizeof(out1[0])); |
||||
float *out2 = (float *)malloc_aligned(16, nblock * sizeof(out2[0])); |
||||
f1->process(inp + 1, out1, nblock); |
||||
f2->process(inp + 1, out2, nblock); |
||||
double err = 0; |
||||
for (int i = 0; i < nblock; i++) { |
||||
printf("#%d: %f %f\n", i, out1[i], out2[i]); |
||||
err += fabs(out1[i] - out2[i]); |
||||
} |
||||
free(out1); |
||||
free(out2); |
||||
return err; |
||||
} |
||||
|
||||
void benchfir(int size, int experiment) { |
||||
condition_governor(); |
||||
|
||||
const int nblock = 64; |
||||
float *kernel = mkrandom(size); |
||||
float *inp = mkrandom(size + nblock); |
||||
float *out = (float *)malloc_aligned(16, nblock * sizeof(out[0])); |
||||
FirFilter<float, float> *f; |
||||
|
||||
switch(experiment) { |
||||
case 0: |
||||
f = new SimpleFirFilter(kernel, size); |
||||
break; |
||||
#ifdef HAVE_NEON |
||||
// this will crash on non-NEON devices, but we're only interested
|
||||
// in testing NEON for now
|
||||
case 1: |
||||
f = new NeonFirFilter(kernel, size); |
||||
break; |
||||
case 2: |
||||
case 3: |
||||
f = new Neon16FirFilter(kernel, size, experiment == 3); |
||||
break; |
||||
#endif |
||||
case 4: |
||||
f = new HalfRateFirFilter(kernel, size, nblock); |
||||
break; |
||||
} |
||||
|
||||
|
||||
double start = now(); |
||||
for (int j = 0; j < 15625; j++) { |
||||
f->process(inp + 1, out, nblock); |
||||
} |
||||
double elapsed = now() - start; |
||||
printf("%i %f\n", size, 1e3 * elapsed); |
||||
|
||||
FirFilter<float, float> *fbase = new SimpleFirFilter(kernel, size); |
||||
double accuracy = test_accuracy(fbase, f, inp, nblock); |
||||
printf("#accuracy = %g\n", accuracy); |
||||
|
||||
delete f; |
||||
delete fbase; |
||||
free(kernel); |
||||
free(inp); |
||||
free(out); |
||||
} |
||||
|
||||
void runfirbench() { |
||||
printf("set style data linespoints\n" |
||||
"set xlabel 'FIR kernel size'\n" |
||||
"set ylabel 'ns per sample'\n" |
||||
"plot '-' title 'scalar', '-' title '4x4 block', '-' title 'fixed16', '-' title 'fixed16 mirror', '-' title 'half rate'\n"); |
||||
for (int experiment = 0; experiment < 5; experiment++) { |
||||
for (int i = 16; i <= 256; i += 16) { |
||||
benchfir(i, experiment); |
||||
} |
||||
printf("e\n"); |
||||
} |
||||
} |
||||
|
||||
void scalarbiquad(const float *inp, float *out, size_t n, |
||||
float b0, float b1, float b2, float a1, float a2) { |
||||
float x1 = 0, x2 = 0, y1 = 0, y2 = 0; |
||||
for (size_t i = 0; i < n; i++) { |
||||
float x = inp[i]; |
||||
float y = b0 * x + b1 * x1 + b2 * x2 - a1 * y1 - a2 * y2; |
||||
out[i] = y; |
||||
x2 = x1; |
||||
x1 = x; |
||||
y2 = y1; |
||||
y1 = y; |
||||
} |
||||
} |
||||
|
||||
void benchscalarbiquad() { |
||||
condition_governor(); |
||||
const int nbuf = 1 << 10; |
||||
float *inp = mkrandom(nbuf); |
||||
float *out = (float *)malloc_aligned(16, nbuf * sizeof(out[0])); |
||||
|
||||
double start = now(); |
||||
const int niter = 10000; |
||||
for (int i = 0; i < niter; i++) { |
||||
scalarbiquad(inp, out, nbuf, 1.0207, -1.7719, .9376, -1.7719, 0.9583); |
||||
} |
||||
double elapsed = now() - start; |
||||
double ns_per_iir = 1e9 * elapsed / nbuf / niter; |
||||
printf("scalar: %f ns/iir\n", ns_per_iir); |
||||
|
||||
free(inp); |
||||
free(out); |
||||
} |
||||
|
||||
extern "C" |
||||
void neon_iir_2chan(const float *in1, const float *in2, float *out1, float *out2, |
||||
size_t n, const float *matrices, float *state); |
||||
|
||||
// see "lab/biquadin two.ipynb" for why
|
||||
void initbiquadmatrix(float *matrix, double b0, double b1, double b2, double a1, double a2) { |
||||
double c1 = b1 - a1 * b0; |
||||
double c2 = b2 - a2 * b0; |
||||
matrix[0] = b0; |
||||
matrix[1] = c1; |
||||
matrix[2] = -a1 * c1 + c2; |
||||
matrix[3] = -a2 * c1; |
||||
matrix[4] = 0; |
||||
matrix[5] = b0; |
||||
matrix[6] = c1; |
||||
matrix[7] = c2; |
||||
matrix[8] = 1; |
||||
matrix[9] = -a1; |
||||
matrix[10] = -a2 + a1 * a1; |
||||
matrix[11] = a1 * a2; |
||||
matrix[12] = 0; |
||||
matrix[13] = 1; |
||||
matrix[14] = -a1; |
||||
matrix[15] = -a2; |
||||
} |
||||
|
||||
#ifdef HAVE_NEON |
||||
void benchbiquadneon() { |
||||
const int nbuf = 1 << 10; |
||||
float *inp1 = mkrandom(nbuf); |
||||
float *inp2 = mkrandom(nbuf); |
||||
float *out1 = (float *)malloc_aligned(16, nbuf * sizeof(out1[0])); |
||||
float *out2 = (float *)malloc_aligned(16, nbuf * sizeof(out2[0])); |
||||
AlignedBuf<float, 32> matrices; |
||||
AlignedBuf<float, 4> state; |
||||
|
||||
for (size_t i = 0; i < 4; i++) { |
||||
state.get()[i] = 0; |
||||
} |
||||
|
||||
double start = now(); |
||||
const int niter = 100000; |
||||
for (int i = 0; i < niter; i++) { |
||||
neon_iir_2chan(inp1, inp2, out1, out2, nbuf, matrices.get(), state.get()); |
||||
} |
||||
|
||||
double elapsed = now() - start; |
||||
double ns_per_iir = 1e9 * 0.5 * elapsed / nbuf / niter; |
||||
printf("neon: %f ns/iir\n", ns_per_iir); |
||||
free(inp1); |
||||
free(inp2); |
||||
free(out1); |
||||
free(out2); |
||||
} |
||||
|
||||
void testbiquadaccuracy() { |
||||
const int nbuf = 1 << 10; |
||||
float *inp1 = mkrandom(nbuf); |
||||
float *inp2 = mkrandom(nbuf); |
||||
float *out1 = (float *)malloc_aligned(16, nbuf * sizeof(out1[0])); |
||||
float *out2 = (float *)malloc_aligned(16, nbuf * sizeof(out2[0])); |
||||
float *out1a = (float *)malloc_aligned(16, nbuf * sizeof(out1[0])); |
||||
float *out2a = (float *)malloc_aligned(16, nbuf * sizeof(out2[0])); |
||||
AlignedBuf<float, 32> matrices; |
||||
AlignedBuf<float, 4> state; |
||||
double b0 = 1.0207, b1 = -1.7719, b2 = .9376, a1 = -1.7719, a2 = 0.9583; |
||||
|
||||
for (size_t i = 0; i < 4; i++) { |
||||
state.get()[i] = 0; |
||||
} |
||||
|
||||
initbiquadmatrix(matrices.get(), b0, b1, b2, a1, a2); |
||||
initbiquadmatrix(matrices.get() + 16, b0, b1, b2, a1, a2); |
||||
|
||||
neon_iir_2chan(inp1, inp2, out1, out2, nbuf, matrices.get(), state.get()); |
||||
|
||||
scalarbiquad(inp1, out1a, nbuf, b0, b1, b2, a1, a2); |
||||
|
||||
float maxerr = 0; |
||||
for (int i = 0; i < nbuf; i++) { |
||||
float err = fabs(out1[i] - out1a[i]); |
||||
if (err > maxerr) { |
||||
maxerr = err; |
||||
} |
||||
} |
||||
printf("neon maxerr = %g\n", maxerr); |
||||
free(inp1); |
||||
free(inp2); |
||||
free(out1); |
||||
free(out2); |
||||
free(out1a); |
||||
free(out2a); |
||||
} |
||||
|
||||
#endif |
||||
|
||||
void runbiquad() { |
||||
benchscalarbiquad(); |
||||
#ifdef HAVE_NEON |
||||
benchbiquadneon(); |
||||
testbiquadaccuracy(); |
||||
#endif |
||||
} |
||||
|
||||
int main(int argc, char **argv) { |
||||
if (argc == 2) { |
||||
if (!strcmp("fir", argv[1])) { |
||||
runfirbench(); |
||||
return 0; |
||||
} else if (!strcmp("biquad", argv[1])) { |
||||
runbiquad(); |
||||
return 0; |
||||
} |
||||
} |
||||
printf("usage:\n" |
||||
" test_filter fir\n" |
||||
" test_filter biquad\n"); |
||||
return 1; |
||||
} |
@ -1,136 +0,0 @@ |
||||
/*
|
||||
* Copyright 2013 Google Inc. |
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); |
||||
* you may not use this file except in compliance with the License. |
||||
* You may obtain a copy of the License at |
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
// Little test app for measuring FIR speed
|
||||
|
||||
#include <stdlib.h> |
||||
#include <stdio.h> |
||||
#include <time.h> |
||||
#include <sys/time.h> |
||||
#include <math.h> |
||||
|
||||
#include "fir.h" |
||||
|
||||
// clock_gettime would be a little better, but whatever
|
||||
double now() { |
||||
struct timeval tp; |
||||
gettimeofday(&tp, NULL); |
||||
return tp.tv_sec + 1e-6 * tp.tv_usec; |
||||
} |
||||
|
||||
void condition_governor() { |
||||
// sleep for a bit to avoid thermal throttling
|
||||
static uint32_t v = 0; |
||||
struct timespec ts; |
||||
ts.tv_sec = 0; |
||||
ts.tv_nsec = 900000000 + (v & 1); // 900ms
|
||||
//nanosleep(&ts, NULL);
|
||||
|
||||
// consume cpu a bit to try to coax max cpufreq
|
||||
uint32_t x = v; |
||||
for (int i = 0; i < 10000000; i++) { |
||||
x += 42; |
||||
x += (x << 10); |
||||
x ^= (x >> 6); |
||||
} |
||||
// storing it in a static guarantees not optimizing out
|
||||
v = x; |
||||
} |
||||
|
||||
float *mkrandom(size_t size) { |
||||
float *result = (float *)malloc_aligned(16, size * sizeof(result[0])); |
||||
for (int i = 0; i < size; i++) { |
||||
result[i] = random() * (2.0 / RAND_MAX) - 1; |
||||
} |
||||
return result; |
||||
} |
||||
|
||||
double test_accuracy(FirFilter<float, float> *f1, FirFilter<float, float> *f2, const float *inp, int nblock) { |
||||
float *out1 = (float *)malloc_aligned(16, nblock * sizeof(out1[0])); |
||||
float *out2 = (float *)malloc_aligned(16, nblock * sizeof(out2[0])); |
||||
f1->process(inp + 1, out1, nblock); |
||||
f2->process(inp + 1, out2, nblock); |
||||
double err = 0; |
||||
for (int i = 0; i < nblock; i++) { |
||||
printf("#%d: %f %f\n", i, out1[i], out2[i]); |
||||
err += fabs(out1[i] - out2[i]); |
||||
} |
||||
free(out1); |
||||
free(out2); |
||||
return err; |
||||
} |
||||
|
||||
void benchfir(int size, int experiment) { |
||||
condition_governor(); |
||||
|
||||
const int nblock = 64; |
||||
float *kernel = mkrandom(size); |
||||
float *inp = mkrandom(size + nblock); |
||||
float *out = (float *)malloc_aligned(16, nblock * sizeof(out[0])); |
||||
FirFilter<float, float> *f; |
||||
|
||||
switch(experiment) { |
||||
case 0: |
||||
f = new SimpleFirFilter(kernel, size); |
||||
break; |
||||
#ifdef HAVE_NEON |
||||
// this will crash on non-NEON devices, but we're only interested
|
||||
// in testing NEON for now
|
||||
case 1: |
||||
f = new NeonFirFilter(kernel, size); |
||||
break; |
||||
case 2: |
||||
case 3: |
||||
f = new Neon16FirFilter(kernel, size, experiment == 3); |
||||
break; |
||||
#endif |
||||
case 4: |
||||
f = new HalfRateFirFilter(kernel, size, nblock); |
||||
break; |
||||
} |
||||
|
||||
|
||||
double start = now(); |
||||
for (int j = 0; j < 15625; j++) { |
||||
f->process(inp + 1, out, nblock); |
||||
} |
||||
double elapsed = now() - start; |
||||
printf("%i %f\n", size, 1e3 * elapsed); |
||||
|
||||
FirFilter<float, float> *fbase = new SimpleFirFilter(kernel, size); |
||||
double accuracy = test_accuracy(fbase, f, inp, nblock); |
||||
printf("#accuracy = %g\n", accuracy); |
||||
|
||||
delete f; |
||||
delete fbase; |
||||
free(kernel); |
||||
free(inp); |
||||
free(out); |
||||
} |
||||
|
||||
int main(int argc, char **argv) { |
||||
printf("set style data linespoints\n" |
||||
"set xlabel 'FIR kernel size'\n" |
||||
"set ylabel 'ns per sample'\n" |
||||
"plot '-' title 'scalar', '-' title '4x4 block', '-' title 'fixed16', '-' title 'fixed16 mirror', '-' title 'half rate'\n"); |
||||
for (int experiment = 0; experiment < 5; experiment++) { |
||||
for (int i = 16; i <= 256; i += 16) { |
||||
benchfir(i, experiment); |
||||
} |
||||
printf("e\n"); |
||||
} |
||||
return 0; |
||||
} |
Loading…
Reference in new issue