Add NEON biquad implementation

This commit adds the NEON code for the matrix-based biquad filter, along with some benchmarking infrastructure.
11 years ago · 85850e22b1
parent 1b45fbeb46
commit 85850e22b1
4 changed files with 423 additions and 139 deletions
--- a/android/jni/Android.mk
+++ b/android/jni/Android.mk
@ -61,18 +61,19 @@ include $(BUILD_EXECUTABLE)
 include $(CLEAR_VARS)
-LOCAL_SRC_FILES := test_fir.cc \
+LOCAL_SRC_FILES := test_filter.cc \
  fir.cc
 ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
    LOCAL_ARM_NEON := true
    LOCAL_CFLAGS := -DHAVE_NEON=1
-    LOCAL_SRC_FILES += neon_fir.s
+    LOCAL_SRC_FILES += neon_fir.s \
                       neon_iir.s
 endif
 LOCAL_CFLAGS += -O3
 LOCAL_STATIC_LIBRARIES += cpufeatures
-LOCAL_MODULE := test_fir
+LOCAL_MODULE := test_filter
 include $(BUILD_EXECUTABLE)
--- a/cpp/src/neon_iir.s
+++ b/cpp/src/neon_iir.s
@ -0,0 +1,130 @@
@ Copyright 2014 Google Inc.
@ 
@ Licensed under the Apache License, Version 2.0 (the "License");
@ you may not use this file except in compliance with the License.
@ You may obtain a copy of the License at
@ 
@      http://www.apache.org/licenses/LICENSE-2.0
@ 
@ Unless required by applicable law or agreed to in writing, software
@ distributed under the License is distributed on an "AS IS" BASIS,
@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ See the License for the specific language governing permissions and
@ limitations under the License.
@ NEON assembly implementation of a second-order IIR filter core (suitable
@ for biquad filters)
 	.text
 	.align 2
 	.global neon_iir_2chan
 	.type neon_iir_2chan, %function
 neon_iir_2chan:
@ r0 = pointer to input buffer 1 (aligned)
@ r1 = pointer to input buffer 2 (aligned)
@ r2 = pointer to output buffer 1 (aligned)
@ r3 = pointer to output buffer 2 (aligned)
@ stack[0] = size of buffer in floats (multiple of 4, >= 8)
@ stack[1] = matrices
@ stack[2] = iir state
 	push {r4-r5}
 	vpush {q4-q7}
 	ldr r4, [sp, #(4 * (2 + 16 + 0))]
 	ldr r5, [sp, #(4 * (2 + 16 + 1))]
 	@ load matrices
 	vld1.32 {q8, q9}, [r5:128]!
 	vld1.32 {q10, q11}, [r5:128]!
 	vld1.32 {q12, q13}, [r5:128]!
 	vld1.32 {q14, q15}, [r5:128]!
 	ldr r5, [sp, #(4 * (2 + 16 + 2))]
 	@ load IIR state
 	vld1.32 d1, [r5:64]!
 	vld1.32 d3, [r5:64]!
 	sub r4, #4
@ q0, q1 are state vector L, R for first unroll
@ q2, q3 are state vector L, R for second unroll 
@ q4-q5 are scratch to compute next state vectors
@ q6 is input
@ q8-q11 is matrix 1
@ q12-q15 is matrix 2
 	vld1.32 d12, [r0:64]!
 	vld1.32 d13, [r1:64]!
 	vmul.f32 q2, q8, d12[0]
 	vmul.f32 q3, q12, d13[0]
 	vmul.f32 q4, q9, d12[1]
 	vmul.f32 q5, q13, d13[1]
 	vmla.f32 q2, q10, d1[0]
 	vmla.f32 q3, q14, d3[0]
 	vld1.32 d12, [r0:64]!
 	vmla.f32 q4, q11, d1[1]
 	vld1.32 d13, [r1:64]!
 	vmla.f32 q5, q15, d3[1]
 neon_iir_2chan_1:
 	@ first unroll
 	vmul.f32 q0, q8, d12[0]
 	vmul.f32 q1, q12, d13[0]
 	vadd.f32 q2, q4
 	vadd.f32 q3, q5
 	vmla.f32 q0, q9, d12[1]
 	vld1.32 d12, [r0:64]!
 	vmla.f32 q1, q13, d13[1]
 	vld1.32 d13, [r1:64]!
 	vmul.f32 q4, q10, d5[0]
 	vst1.32 d4, [r2:64]!
 	vmul.f32 q5, q14, d7[0]
 	vmla.f32 q0, q11, d5[1]
 	vmla.f32 q1, q15, d7[1]
 	vst1.32 d6, [r3:64]!
 	@ second unroll
 	vmul.f32 q2, q8, d12[0]
 	vmul.f32 q3, q12, d13[0]
 	vadd.f32 q0, q4
 	vadd.f32 q1, q5
 	vmla.f32 q2, q9, d12[1]
 	vld1.32 d12, [r0:64]!
 	vmla.f32 q3, q13, d13[1]
 	vld1.32 d13, [r1:64]!
 	vmul.f32 q4, q10, d1[0]
 	vst1.32 d0, [r2:64]!
 	vmul.f32 q5, q14, d3[0]
 	vmla.f32 q2, q11, d1[1]
 	vmla.f32 q3, q15, d3[1]
 	vst1.32 d2, [r3:64]!
 	subs r4, #4
 	bne neon_iir_2chan_1
 	vmul.f32 q0, q8, d12[0]
 	vmul.f32 q1, q12, d13[0]
 	vadd.f32 q2, q4
 	vadd.f32 q3, q5
 	vmla.f32 q0, q9, d12[1]
 	vmla.f32 q1, q13, d13[1]
 	vmul.f32 q4, q10, d5[0]
 	vst1.32 d4, [r2:64]!
 	vmul.f32 q5, q14, d7[0]
 	vmla.f32 q0, q11, d5[1]
 	vmla.f32 q1, q15, d7[1]
 	vst1.32 d6, [r3:64]!
 	vadd.f32 q0, q4
 	vadd.f32 q1, q5
 	vst1.32 d0, [r2:64]!
 	vst1.32 d2, [r3:64]!
 	@ save IIR state
 	sub r5, #16
 	vst1.32 d1, [r5:64]!
 	vst1.32 d3, [r5:64]!
 	vpop {q4-q7}
 	pop {r4-r5}
 	bx lr
 	.size neon_iir_2chan, .-neon_iir_2chan
--- a/cpp/src/test_filter.cc
+++ b/cpp/src/test_filter.cc
@ -0,0 +1,289 @@
 /*
 * Copyright 2013 Google Inc.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 // Little test app for measuring FIR speed
 #include <stdlib.h>
 #include <stdio.h>
 #include <time.h>
 #include <sys/time.h>
 #include <math.h>
 #include "aligned_buf.h"
 #include "fir.h"
 // clock_gettime would be a little better, but whatever
 double now() {
  struct timeval tp;
  gettimeofday(&tp, NULL);
  return tp.tv_sec + 1e-6 * tp.tv_usec;
 }
 void condition_governor() {
  // sleep for a bit to avoid thermal throttling
  static uint32_t v = 0;
  struct timespec ts;
  ts.tv_sec = 0;
  ts.tv_nsec = 900000000 + (v & 1); // 900ms
  //nanosleep(&ts, NULL);
  // consume cpu a bit to try to coax max cpufreq
  uint32_t x = v;
  for (int i = 0; i < 10000000; i++) {
    x += 42;
    x += (x << 10);
    x ^= (x >> 6);
  }
  // storing it in a static guarantees not optimizing out
  v = x;
 }
 float *mkrandom(size_t size) {
  float *result = (float *)malloc_aligned(16, size * sizeof(result[0]));
  for (int i = 0; i < size; i++) {
    result[i] = random() * (2.0 / RAND_MAX) - 1;
  }
  return result;
 }
 double test_accuracy(FirFilter<float, float> *f1, FirFilter<float, float> *f2, const float *inp, int nblock) {
  float *out1 = (float *)malloc_aligned(16, nblock * sizeof(out1[0]));
  float *out2 = (float *)malloc_aligned(16, nblock * sizeof(out2[0]));
  f1->process(inp + 1, out1, nblock);
  f2->process(inp + 1, out2, nblock);
  double err = 0;
  for (int i = 0; i < nblock; i++) {
    printf("#%d: %f %f\n", i, out1[i], out2[i]);
    err += fabs(out1[i] - out2[i]);
  }
  free(out1);
  free(out2);
  return err;
 }
 void benchfir(int size, int experiment) {
  condition_governor();
  const int nblock = 64;
  float *kernel = mkrandom(size);
  float *inp = mkrandom(size + nblock);
  float *out = (float *)malloc_aligned(16, nblock * sizeof(out[0]));
  FirFilter<float, float> *f;
  switch(experiment) {
    case 0:
      f = new SimpleFirFilter(kernel, size);
      break;
 #ifdef HAVE_NEON
    // this will crash on non-NEON devices, but we're only interested
    // in testing NEON for now
    case 1:
      f = new NeonFirFilter(kernel, size);
      break;
    case 2:
    case 3:
      f = new Neon16FirFilter(kernel, size, experiment == 3);
      break;
 #endif
    case 4:
      f = new HalfRateFirFilter(kernel, size, nblock);
      break;
  }
  double start = now();
  for (int j = 0; j < 15625; j++) {
    f->process(inp + 1, out, nblock);
  }
  double elapsed = now() - start;
  printf("%i %f\n", size, 1e3 * elapsed);
  FirFilter<float, float> *fbase = new SimpleFirFilter(kernel, size);
  double accuracy = test_accuracy(fbase, f, inp, nblock);
  printf("#accuracy = %g\n", accuracy);
  delete f;
  delete fbase;
  free(kernel);
  free(inp);
  free(out);
 }
 void runfirbench() {
  printf("set style data linespoints\n"
    "set xlabel 'FIR kernel size'\n"
    "set ylabel 'ns per sample'\n"
    "plot '-' title 'scalar', '-' title '4x4 block', '-' title 'fixed16', '-' title 'fixed16 mirror', '-' title 'half rate'\n");
  for (int experiment = 0; experiment < 5; experiment++) {
    for (int i = 16; i <= 256; i += 16) {
      benchfir(i, experiment);
    }
    printf("e\n");
  }
 }
 void scalarbiquad(const float *inp, float *out, size_t n,
  float b0, float b1, float b2, float a1, float a2) {
  float x1 = 0, x2 = 0, y1 = 0, y2 = 0;
  for (size_t i = 0; i < n; i++) {
    float x = inp[i];
    float y = b0 * x + b1 * x1 + b2 * x2 - a1 * y1 - a2 * y2;
    out[i] = y;
    x2 = x1;
    x1 = x;
    y2 = y1;
    y1 = y;
  }
 }
 void benchscalarbiquad() {
  condition_governor();
  const int nbuf = 1 << 10;
  float *inp = mkrandom(nbuf);
  float *out = (float *)malloc_aligned(16, nbuf * sizeof(out[0]));
  double start = now();
  const int niter = 10000;
  for (int i = 0; i < niter; i++) {
    scalarbiquad(inp, out, nbuf, 1.0207, -1.7719, .9376, -1.7719, 0.9583);
  }
  double elapsed = now() - start;
  double ns_per_iir = 1e9 * elapsed / nbuf / niter;
  printf("scalar: %f ns/iir\n", ns_per_iir);
  free(inp);
  free(out);
 }
 extern "C"
 void neon_iir_2chan(const float *in1, const float *in2, float *out1, float *out2,
  size_t n, const float *matrices, float *state);
 // see "lab/biquadin two.ipynb" for why
 void initbiquadmatrix(float *matrix, double b0, double b1, double b2, double a1, double a2) {
  double c1 = b1 - a1 * b0;
  double c2 = b2 - a2 * b0;
  matrix[0] = b0;
  matrix[1] = c1;
  matrix[2] = -a1 * c1 + c2;
  matrix[3] = -a2 * c1;
  matrix[4] = 0;
  matrix[5] = b0;
  matrix[6] = c1;
  matrix[7] = c2;
  matrix[8] = 1;
  matrix[9] = -a1;
  matrix[10] = -a2 + a1 * a1;
  matrix[11] = a1 * a2;
  matrix[12] = 0;
  matrix[13] = 1;
  matrix[14] = -a1;
  matrix[15] = -a2;
 }
 #ifdef HAVE_NEON
 void benchbiquadneon() {
  const int nbuf = 1 << 10;
  float *inp1 = mkrandom(nbuf);
  float *inp2 = mkrandom(nbuf);
  float *out1 = (float *)malloc_aligned(16, nbuf * sizeof(out1[0]));
  float *out2 = (float *)malloc_aligned(16, nbuf * sizeof(out2[0]));
  AlignedBuf<float, 32> matrices;
  AlignedBuf<float, 4> state;
  for (size_t i = 0; i < 4; i++) {
    state.get()[i] = 0;
  }
  double start = now();
  const int niter = 100000;
  for (int i = 0; i < niter; i++) {
    neon_iir_2chan(inp1, inp2, out1, out2, nbuf, matrices.get(), state.get());
  }
  double elapsed = now() - start;
  double ns_per_iir = 1e9 * 0.5 * elapsed / nbuf / niter;
  printf("neon: %f ns/iir\n", ns_per_iir);
  free(inp1);
  free(inp2);
  free(out1);
  free(out2);
 }
 void testbiquadaccuracy() {
  const int nbuf = 1 << 10;
  float *inp1 = mkrandom(nbuf);
  float *inp2 = mkrandom(nbuf);
  float *out1 = (float *)malloc_aligned(16, nbuf * sizeof(out1[0]));
  float *out2 = (float *)malloc_aligned(16, nbuf * sizeof(out2[0]));
  float *out1a = (float *)malloc_aligned(16, nbuf * sizeof(out1[0]));
  float *out2a = (float *)malloc_aligned(16, nbuf * sizeof(out2[0]));
  AlignedBuf<float, 32> matrices;
  AlignedBuf<float, 4> state;
  double b0 = 1.0207, b1 = -1.7719, b2 = .9376, a1 = -1.7719, a2 = 0.9583;
  for (size_t i = 0; i < 4; i++) {
    state.get()[i] = 0;
  }
  initbiquadmatrix(matrices.get(), b0, b1, b2, a1, a2);
  initbiquadmatrix(matrices.get() + 16, b0, b1, b2, a1, a2);
  neon_iir_2chan(inp1, inp2, out1, out2, nbuf, matrices.get(), state.get());
  scalarbiquad(inp1, out1a, nbuf, b0, b1, b2, a1, a2);
  float maxerr = 0;
  for (int i = 0; i < nbuf; i++) {
    float err = fabs(out1[i] - out1a[i]);
    if (err > maxerr) {
      maxerr = err;
    }
  }
  printf("neon maxerr = %g\n", maxerr);
  free(inp1);
  free(inp2);
  free(out1);
  free(out2);
  free(out1a);
  free(out2a);
 }
 #endif
 void runbiquad() {
  benchscalarbiquad();
 #ifdef HAVE_NEON
  benchbiquadneon();
  testbiquadaccuracy();
 #endif
 }
 int main(int argc, char **argv) {
  if (argc == 2) {
    if (!strcmp("fir", argv[1])) {
      runfirbench();
      return 0;
    } else if (!strcmp("biquad", argv[1])) {
      runbiquad();
      return 0;
    }
  }
  printf("usage:\n"
    "  test_filter fir\n"
    "  test_filter biquad\n");
  return 1;
 }
--- a/cpp/src/test_fir.cc
+++ b/cpp/src/test_fir.cc
@ -1,136 +0,0 @@
 /*
 * Copyright 2013 Google Inc.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 // Little test app for measuring FIR speed
 #include <stdlib.h>
 #include <stdio.h>
 #include <time.h>
 #include <sys/time.h>
 #include <math.h>
 #include "fir.h"
 // clock_gettime would be a little better, but whatever
 double now() {
  struct timeval tp;
  gettimeofday(&tp, NULL);
  return tp.tv_sec + 1e-6 * tp.tv_usec;
 }
 void condition_governor() {
  // sleep for a bit to avoid thermal throttling
  static uint32_t v = 0;
  struct timespec ts;
  ts.tv_sec = 0;
  ts.tv_nsec = 900000000 + (v & 1); // 900ms
  //nanosleep(&ts, NULL);
  // consume cpu a bit to try to coax max cpufreq
  uint32_t x = v;
  for (int i = 0; i < 10000000; i++) {
    x += 42;
    x += (x << 10);
    x ^= (x >> 6);
  }
  // storing it in a static guarantees not optimizing out
  v = x;
 }
 float *mkrandom(size_t size) {
  float *result = (float *)malloc_aligned(16, size * sizeof(result[0]));
  for (int i = 0; i < size; i++) {
    result[i] = random() * (2.0 / RAND_MAX) - 1;
  }
  return result;
 }
 double test_accuracy(FirFilter<float, float> *f1, FirFilter<float, float> *f2, const float *inp, int nblock) {
  float *out1 = (float *)malloc_aligned(16, nblock * sizeof(out1[0]));
  float *out2 = (float *)malloc_aligned(16, nblock * sizeof(out2[0]));
  f1->process(inp + 1, out1, nblock);
  f2->process(inp + 1, out2, nblock);
  double err = 0;
  for (int i = 0; i < nblock; i++) {
    printf("#%d: %f %f\n", i, out1[i], out2[i]);
    err += fabs(out1[i] - out2[i]);
  }
  free(out1);
  free(out2);
  return err;
 }
 void benchfir(int size, int experiment) {
  condition_governor();
  const int nblock = 64;
  float *kernel = mkrandom(size);
  float *inp = mkrandom(size + nblock);
  float *out = (float *)malloc_aligned(16, nblock * sizeof(out[0]));
  FirFilter<float, float> *f;
  switch(experiment) {
    case 0:
      f = new SimpleFirFilter(kernel, size);
      break;
 #ifdef HAVE_NEON
    // this will crash on non-NEON devices, but we're only interested
    // in testing NEON for now
    case 1:
      f = new NeonFirFilter(kernel, size);
      break;
    case 2:
    case 3:
      f = new Neon16FirFilter(kernel, size, experiment == 3);
      break;
 #endif
    case 4:
      f = new HalfRateFirFilter(kernel, size, nblock);
      break;
  }
  double start = now();
  for (int j = 0; j < 15625; j++) {
    f->process(inp + 1, out, nblock);
  }
  double elapsed = now() - start;
  printf("%i %f\n", size, 1e3 * elapsed);
  FirFilter<float, float> *fbase = new SimpleFirFilter(kernel, size);
  double accuracy = test_accuracy(fbase, f, inp, nblock);
  printf("#accuracy = %g\n", accuracy);
  delete f;
  delete fbase;
  free(kernel);
  free(inp);
  free(out);
 }
 int main(int argc, char **argv) {
  printf("set style data linespoints\n"
    "set xlabel 'FIR kernel size'\n"
    "set ylabel 'ns per sample'\n"
    "plot '-' title 'scalar', '-' title '4x4 block', '-' title 'fixed16', '-' title 'fixed16 mirror', '-' title 'half rate'\n");
  for (int experiment = 0; experiment < 5; experiment++) {
    for (int i = 16; i <= 256; i += 16) {
      benchfir(i, experiment);
    }
    printf("e\n");
  }
  return 0;
 }