From 85850e22b103aed14dde224aee5930245a981f26 Mon Sep 17 00:00:00 2001
From: Raph Levien <raph@google.com>
Date: Tue, 14 Jan 2014 22:13:21 -0800
Subject: [PATCH] Add NEON biquad implementation

This commit adds the NEON code for the matrix-based biquad filter,
along with some benchmarking infrastructure.
---
 android/jni/Android.mk |   7 +-
 cpp/src/neon_iir.s     | 130 ++++++++++++++++++
 cpp/src/test_filter.cc | 289 +++++++++++++++++++++++++++++++++++++++++
 cpp/src/test_fir.cc    | 136 -------------------
 4 files changed, 423 insertions(+), 139 deletions(-)
 create mode 100644 cpp/src/neon_iir.s
 create mode 100644 cpp/src/test_filter.cc
 delete mode 100644 cpp/src/test_fir.cc

diff --git a/android/jni/Android.mk b/android/jni/Android.mk
index a111be7..1e692c0 100644
--- a/android/jni/Android.mk
+++ b/android/jni/Android.mk
@@ -61,18 +61,19 @@ include $(BUILD_EXECUTABLE)
 
 include $(CLEAR_VARS)
 
-LOCAL_SRC_FILES := test_fir.cc \
+LOCAL_SRC_FILES := test_filter.cc \
   fir.cc
 
 ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
     LOCAL_ARM_NEON := true
     LOCAL_CFLAGS := -DHAVE_NEON=1
-    LOCAL_SRC_FILES += neon_fir.s
+    LOCAL_SRC_FILES += neon_fir.s \
+                       neon_iir.s
 endif
 
 LOCAL_CFLAGS += -O3
 LOCAL_STATIC_LIBRARIES += cpufeatures
-LOCAL_MODULE := test_fir
+LOCAL_MODULE := test_filter
 include $(BUILD_EXECUTABLE)
 
 
diff --git a/cpp/src/neon_iir.s b/cpp/src/neon_iir.s
new file mode 100644
index 0000000..ce61672
--- /dev/null
+++ b/cpp/src/neon_iir.s
@@ -0,0 +1,130 @@
+@ Copyright 2014 Google Inc.
+@ 
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@ 
+@      http://www.apache.org/licenses/LICENSE-2.0
+@ 
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+
+@ NEON assembly implementation of a second-order IIR filter core (suitable
+@ for biquad filters)
+
+	.text
+
+	.align 2
+	.global neon_iir_2chan
+	.type neon_iir_2chan, %function
+neon_iir_2chan:
+@ r0 = pointer to input buffer 1 (aligned)
+@ r1 = pointer to input buffer 2 (aligned)
+@ r2 = pointer to output buffer 1 (aligned)
+@ r3 = pointer to output buffer 2 (aligned)
+@ stack[0] = size of buffer in floats (multiple of 4, >= 8)
+@ stack[1] = matrices
+@ stack[2] = iir state
+
+	push {r4-r5}
+	vpush {q4-q7}
+	ldr r4, [sp, #(4 * (2 + 16 + 0))]
+	ldr r5, [sp, #(4 * (2 + 16 + 1))]
+
+	@ load matrices
+	vld1.32 {q8, q9}, [r5:128]!
+	vld1.32 {q10, q11}, [r5:128]!
+	vld1.32 {q12, q13}, [r5:128]!
+	vld1.32 {q14, q15}, [r5:128]!
+
+	ldr r5, [sp, #(4 * (2 + 16 + 2))]
+	@ load IIR state
+	vld1.32 d1, [r5:64]!
+	vld1.32 d3, [r5:64]!
+
+	sub r4, #4
+@ q0, q1 are state vector L, R for first unroll
+@ q2, q3 are state vector L, R for second unroll 
+@ q4-q5 are scratch to compute next state vectors
+@ q6 is input
+@ q8-q11 is matrix 1
+@ q12-q15 is matrix 2
+
+	vld1.32 d12, [r0:64]!
+	vld1.32 d13, [r1:64]!
+	vmul.f32 q2, q8, d12[0]
+	vmul.f32 q3, q12, d13[0]
+	vmul.f32 q4, q9, d12[1]
+	vmul.f32 q5, q13, d13[1]
+	vmla.f32 q2, q10, d1[0]
+	vmla.f32 q3, q14, d3[0]
+	vld1.32 d12, [r0:64]!
+	vmla.f32 q4, q11, d1[1]
+	vld1.32 d13, [r1:64]!
+	vmla.f32 q5, q15, d3[1]
+
+neon_iir_2chan_1:
+	@ first unroll
+	vmul.f32 q0, q8, d12[0]
+	vmul.f32 q1, q12, d13[0]
+	vadd.f32 q2, q4
+	vadd.f32 q3, q5
+	vmla.f32 q0, q9, d12[1]
+	vld1.32 d12, [r0:64]!
+	vmla.f32 q1, q13, d13[1]
+	vld1.32 d13, [r1:64]!
+	vmul.f32 q4, q10, d5[0]
+	vst1.32 d4, [r2:64]!
+	vmul.f32 q5, q14, d7[0]
+	vmla.f32 q0, q11, d5[1]
+	vmla.f32 q1, q15, d7[1]
+	vst1.32 d6, [r3:64]!
+
+	@ second unroll
+	vmul.f32 q2, q8, d12[0]
+	vmul.f32 q3, q12, d13[0]
+	vadd.f32 q0, q4
+	vadd.f32 q1, q5
+	vmla.f32 q2, q9, d12[1]
+	vld1.32 d12, [r0:64]!
+	vmla.f32 q3, q13, d13[1]
+	vld1.32 d13, [r1:64]!
+	vmul.f32 q4, q10, d1[0]
+	vst1.32 d0, [r2:64]!
+	vmul.f32 q5, q14, d3[0]
+	vmla.f32 q2, q11, d1[1]
+	vmla.f32 q3, q15, d3[1]
+	vst1.32 d2, [r3:64]!
+	subs r4, #4
+	bne neon_iir_2chan_1
+
+	vmul.f32 q0, q8, d12[0]
+	vmul.f32 q1, q12, d13[0]
+	vadd.f32 q2, q4
+	vadd.f32 q3, q5
+	vmla.f32 q0, q9, d12[1]
+	vmla.f32 q1, q13, d13[1]
+	vmul.f32 q4, q10, d5[0]
+	vst1.32 d4, [r2:64]!
+	vmul.f32 q5, q14, d7[0]
+	vmla.f32 q0, q11, d5[1]
+	vmla.f32 q1, q15, d7[1]
+	vst1.32 d6, [r3:64]!
+
+	vadd.f32 q0, q4
+	vadd.f32 q1, q5
+	vst1.32 d0, [r2:64]!
+	vst1.32 d2, [r3:64]!
+
+	@ save IIR state
+	sub r5, #16
+	vst1.32 d1, [r5:64]!
+	vst1.32 d3, [r5:64]!
+
+	vpop {q4-q7}
+	pop {r4-r5}
+	bx lr
+	.size neon_iir_2chan, .-neon_iir_2chan
diff --git a/cpp/src/test_filter.cc b/cpp/src/test_filter.cc
new file mode 100644
index 0000000..1f9d8c7
--- /dev/null
+++ b/cpp/src/test_filter.cc
@@ -0,0 +1,289 @@
+/*
+ * Copyright 2013 Google Inc.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Little test app for measuring FIR speed
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <sys/time.h>
+#include <math.h>
+
+#include "aligned_buf.h"
+#include "fir.h"
+
+// clock_gettime would be a little better, but whatever
+double now() {
+  struct timeval tp;
+  gettimeofday(&tp, NULL);
+  return tp.tv_sec + 1e-6 * tp.tv_usec;
+}
+
+void condition_governor() {
+  // sleep for a bit to avoid thermal throttling
+  static uint32_t v = 0;
+  struct timespec ts;
+  ts.tv_sec = 0;
+  ts.tv_nsec = 900000000 + (v & 1); // 900ms
+  //nanosleep(&ts, NULL);
+
+  // consume cpu a bit to try to coax max cpufreq
+  uint32_t x = v;
+  for (int i = 0; i < 10000000; i++) {
+    x += 42;
+    x += (x << 10);
+    x ^= (x >> 6);
+  }
+  // storing it in a static guarantees not optimizing out
+  v = x;
+}
+
+float *mkrandom(size_t size) {
+  float *result = (float *)malloc_aligned(16, size * sizeof(result[0]));
+  for (int i = 0; i < size; i++) {
+    result[i] = random() * (2.0 / RAND_MAX) - 1;
+  }
+  return result;
+}
+
+double test_accuracy(FirFilter<float, float> *f1, FirFilter<float, float> *f2, const float *inp, int nblock) {
+  float *out1 = (float *)malloc_aligned(16, nblock * sizeof(out1[0]));
+  float *out2 = (float *)malloc_aligned(16, nblock * sizeof(out2[0]));
+  f1->process(inp + 1, out1, nblock);
+  f2->process(inp + 1, out2, nblock);
+  double err = 0;
+  for (int i = 0; i < nblock; i++) {
+    printf("#%d: %f %f\n", i, out1[i], out2[i]);
+    err += fabs(out1[i] - out2[i]);
+  }
+  free(out1);
+  free(out2);
+  return err;
+}
+
+void benchfir(int size, int experiment) {
+  condition_governor();
+
+  const int nblock = 64;
+  float *kernel = mkrandom(size);
+  float *inp = mkrandom(size + nblock);
+  float *out = (float *)malloc_aligned(16, nblock * sizeof(out[0]));
+  FirFilter<float, float> *f;
+
+  switch(experiment) {
+    case 0:
+      f = new SimpleFirFilter(kernel, size);
+      break;
+#ifdef HAVE_NEON
+    // this will crash on non-NEON devices, but we're only interested
+    // in testing NEON for now
+    case 1:
+      f = new NeonFirFilter(kernel, size);
+      break;
+    case 2:
+    case 3:
+      f = new Neon16FirFilter(kernel, size, experiment == 3);
+      break;
+#endif
+    case 4:
+      f = new HalfRateFirFilter(kernel, size, nblock);
+      break;
+  }
+
+
+  double start = now();
+  for (int j = 0; j < 15625; j++) {
+    f->process(inp + 1, out, nblock);
+  }
+  double elapsed = now() - start;
+  printf("%i %f\n", size, 1e3 * elapsed);
+
+  FirFilter<float, float> *fbase = new SimpleFirFilter(kernel, size);
+  double accuracy = test_accuracy(fbase, f, inp, nblock);
+  printf("#accuracy = %g\n", accuracy);
+
+  delete f;
+  delete fbase;
+  free(kernel);
+  free(inp);
+  free(out);
+}
+
+void runfirbench() {
+  printf("set style data linespoints\n"
+    "set xlabel 'FIR kernel size'\n"
+    "set ylabel 'ns per sample'\n"
+    "plot '-' title 'scalar', '-' title '4x4 block', '-' title 'fixed16', '-' title 'fixed16 mirror', '-' title 'half rate'\n");
+  for (int experiment = 0; experiment < 5; experiment++) {
+    for (int i = 16; i <= 256; i += 16) {
+      benchfir(i, experiment);
+    }
+    printf("e\n");
+  }
+}
+
+void scalarbiquad(const float *inp, float *out, size_t n,
+  float b0, float b1, float b2, float a1, float a2) {
+  float x1 = 0, x2 = 0, y1 = 0, y2 = 0;
+  for (size_t i = 0; i < n; i++) {
+    float x = inp[i];
+    float y = b0 * x + b1 * x1 + b2 * x2 - a1 * y1 - a2 * y2;
+    out[i] = y;
+    x2 = x1;
+    x1 = x;
+    y2 = y1;
+    y1 = y;
+  }
+}
+
+void benchscalarbiquad() {
+  condition_governor();
+  const int nbuf = 1 << 10;
+  float *inp = mkrandom(nbuf);
+  float *out = (float *)malloc_aligned(16, nbuf * sizeof(out[0]));
+
+  double start = now();
+  const int niter = 10000;
+  for (int i = 0; i < niter; i++) {
+    scalarbiquad(inp, out, nbuf, 1.0207, -1.7719, .9376, -1.7719, 0.9583);
+  }
+  double elapsed = now() - start;
+  double ns_per_iir = 1e9 * elapsed / nbuf / niter;
+  printf("scalar: %f ns/iir\n", ns_per_iir);
+
+  free(inp);
+  free(out);
+}
+
+extern "C"
+void neon_iir_2chan(const float *in1, const float *in2, float *out1, float *out2,
+  size_t n, const float *matrices, float *state);
+
+// see "lab/biquadin two.ipynb" for why
+void initbiquadmatrix(float *matrix, double b0, double b1, double b2, double a1, double a2) {
+  double c1 = b1 - a1 * b0;
+  double c2 = b2 - a2 * b0;
+  matrix[0] = b0;
+  matrix[1] = c1;
+  matrix[2] = -a1 * c1 + c2;
+  matrix[3] = -a2 * c1;
+  matrix[4] = 0;
+  matrix[5] = b0;
+  matrix[6] = c1;
+  matrix[7] = c2;
+  matrix[8] = 1;
+  matrix[9] = -a1;
+  matrix[10] = -a2 + a1 * a1;
+  matrix[11] = a1 * a2;
+  matrix[12] = 0;
+  matrix[13] = 1;
+  matrix[14] = -a1;
+  matrix[15] = -a2;
+}
+
+#ifdef HAVE_NEON
+void benchbiquadneon() {
+  const int nbuf = 1 << 10;
+  float *inp1 = mkrandom(nbuf);
+  float *inp2 = mkrandom(nbuf);
+  float *out1 = (float *)malloc_aligned(16, nbuf * sizeof(out1[0]));
+  float *out2 = (float *)malloc_aligned(16, nbuf * sizeof(out2[0]));
+  AlignedBuf<float, 32> matrices;
+  AlignedBuf<float, 4> state;
+
+  for (size_t i = 0; i < 4; i++) {
+    state.get()[i] = 0;
+  }
+
+  double start = now();
+  const int niter = 100000;
+  for (int i = 0; i < niter; i++) {
+    neon_iir_2chan(inp1, inp2, out1, out2, nbuf, matrices.get(), state.get());
+  }
+
+  double elapsed = now() - start;
+  double ns_per_iir = 1e9 * 0.5 * elapsed / nbuf / niter;
+  printf("neon: %f ns/iir\n", ns_per_iir);
+  free(inp1);
+  free(inp2);
+  free(out1);
+  free(out2);
+}
+
+void testbiquadaccuracy() {
+  const int nbuf = 1 << 10;
+  float *inp1 = mkrandom(nbuf);
+  float *inp2 = mkrandom(nbuf);
+  float *out1 = (float *)malloc_aligned(16, nbuf * sizeof(out1[0]));
+  float *out2 = (float *)malloc_aligned(16, nbuf * sizeof(out2[0]));
+  float *out1a = (float *)malloc_aligned(16, nbuf * sizeof(out1[0]));
+  float *out2a = (float *)malloc_aligned(16, nbuf * sizeof(out2[0]));
+  AlignedBuf<float, 32> matrices;
+  AlignedBuf<float, 4> state;
+  double b0 = 1.0207, b1 = -1.7719, b2 = .9376, a1 = -1.7719, a2 = 0.9583;
+
+  for (size_t i = 0; i < 4; i++) {
+    state.get()[i] = 0;
+  }
+
+  initbiquadmatrix(matrices.get(), b0, b1, b2, a1, a2);
+  initbiquadmatrix(matrices.get() + 16, b0, b1, b2, a1, a2);
+
+  neon_iir_2chan(inp1, inp2, out1, out2, nbuf, matrices.get(), state.get());
+
+  scalarbiquad(inp1, out1a, nbuf, b0, b1, b2, a1, a2);
+
+  float maxerr = 0;
+  for (int i = 0; i < nbuf; i++) {
+    float err = fabs(out1[i] - out1a[i]);
+    if (err > maxerr) {
+      maxerr = err;
+    }
+  }
+  printf("neon maxerr = %g\n", maxerr);
+  free(inp1);
+  free(inp2);
+  free(out1);
+  free(out2);
+  free(out1a);
+  free(out2a);
+}
+
+#endif
+
+void runbiquad() {
+  benchscalarbiquad();
+#ifdef HAVE_NEON
+  benchbiquadneon();
+  testbiquadaccuracy();
+#endif
+}
+
+int main(int argc, char **argv) {
+  if (argc == 2) {
+    if (!strcmp("fir", argv[1])) {
+      runfirbench();
+      return 0;
+    } else if (!strcmp("biquad", argv[1])) {
+      runbiquad();
+      return 0;
+    }
+  }
+  printf("usage:\n"
+    "  test_filter fir\n"
+    "  test_filter biquad\n");
+  return 1;
+}
diff --git a/cpp/src/test_fir.cc b/cpp/src/test_fir.cc
deleted file mode 100644
index 6c055c6..0000000
--- a/cpp/src/test_fir.cc
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright 2013 Google Inc.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
- *      http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Little test app for measuring FIR speed
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <time.h>
-#include <sys/time.h>
-#include <math.h>
-
-#include "fir.h"
-
-// clock_gettime would be a little better, but whatever
-double now() {
-  struct timeval tp;
-  gettimeofday(&tp, NULL);
-  return tp.tv_sec + 1e-6 * tp.tv_usec;
-}
-
-void condition_governor() {
-  // sleep for a bit to avoid thermal throttling
-  static uint32_t v = 0;
-  struct timespec ts;
-  ts.tv_sec = 0;
-  ts.tv_nsec = 900000000 + (v & 1); // 900ms
-  //nanosleep(&ts, NULL);
-
-  // consume cpu a bit to try to coax max cpufreq
-  uint32_t x = v;
-  for (int i = 0; i < 10000000; i++) {
-    x += 42;
-    x += (x << 10);
-    x ^= (x >> 6);
-  }
-  // storing it in a static guarantees not optimizing out
-  v = x;
-}
-
-float *mkrandom(size_t size) {
-  float *result = (float *)malloc_aligned(16, size * sizeof(result[0]));
-  for (int i = 0; i < size; i++) {
-    result[i] = random() * (2.0 / RAND_MAX) - 1;
-  }
-  return result;
-}
-
-double test_accuracy(FirFilter<float, float> *f1, FirFilter<float, float> *f2, const float *inp, int nblock) {
-  float *out1 = (float *)malloc_aligned(16, nblock * sizeof(out1[0]));
-  float *out2 = (float *)malloc_aligned(16, nblock * sizeof(out2[0]));
-  f1->process(inp + 1, out1, nblock);
-  f2->process(inp + 1, out2, nblock);
-  double err = 0;
-  for (int i = 0; i < nblock; i++) {
-    printf("#%d: %f %f\n", i, out1[i], out2[i]);
-    err += fabs(out1[i] - out2[i]);
-  }
-  free(out1);
-  free(out2);
-  return err;
-}
-
-void benchfir(int size, int experiment) {
-  condition_governor();
-
-  const int nblock = 64;
-  float *kernel = mkrandom(size);
-  float *inp = mkrandom(size + nblock);
-  float *out = (float *)malloc_aligned(16, nblock * sizeof(out[0]));
-  FirFilter<float, float> *f;
-
-  switch(experiment) {
-    case 0:
-      f = new SimpleFirFilter(kernel, size);
-      break;
-#ifdef HAVE_NEON
-    // this will crash on non-NEON devices, but we're only interested
-    // in testing NEON for now
-    case 1:
-      f = new NeonFirFilter(kernel, size);
-      break;
-    case 2:
-    case 3:
-      f = new Neon16FirFilter(kernel, size, experiment == 3);
-      break;
-#endif
-    case 4:
-      f = new HalfRateFirFilter(kernel, size, nblock);
-      break;
-  }
-
-
-  double start = now();
-  for (int j = 0; j < 15625; j++) {
-    f->process(inp + 1, out, nblock);
-  }
-  double elapsed = now() - start;
-  printf("%i %f\n", size, 1e3 * elapsed);
-
-  FirFilter<float, float> *fbase = new SimpleFirFilter(kernel, size);
-  double accuracy = test_accuracy(fbase, f, inp, nblock);
-  printf("#accuracy = %g\n", accuracy);
-
-  delete f;
-  delete fbase;
-  free(kernel);
-  free(inp);
-  free(out);
-}
-
-int main(int argc, char **argv) {
-  printf("set style data linespoints\n"
-    "set xlabel 'FIR kernel size'\n"
-    "set ylabel 'ns per sample'\n"
-    "plot '-' title 'scalar', '-' title '4x4 block', '-' title 'fixed16', '-' title 'fixed16 mirror', '-' title 'half rate'\n");
-  for (int experiment = 0; experiment < 5; experiment++) {
-    for (int i = 16; i <= 256; i += 16) {
-      benchfir(i, experiment);
-    }
-    printf("e\n");
-  }
-  return 0;
-}