From 287427513ccaeb086fe47586225f67a8c0ff1b5d Mon Sep 17 00:00:00 2001
From: Raph Levien <raph@google.com>
Date: Thu, 12 Dec 2013 00:00:12 -0800
Subject: [PATCH] FIR filter implementation

This is a FIR filter with NEON speedup. The NEON version has been tested
for accuracy against the scalar one, and there is simple benchmarking
code in place as well.
---
 android/AndroidManifest.xml |   2 +-
 android/jni/Android.mk      |  24 ++++++-
 cpp/src/fir.cc              |  72 +++++++++++++++++++++
 cpp/src/fir.h               |  48 ++++++++++++++
 cpp/src/neon_fir.s          |  97 ++++++++++++++++++++++++++++
 cpp/src/test_fir.cc         | 125 ++++++++++++++++++++++++++++++++++++
 6 files changed, 365 insertions(+), 3 deletions(-)
 create mode 100644 cpp/src/fir.cc
 create mode 100644 cpp/src/fir.h
 create mode 100644 cpp/src/neon_fir.s
 create mode 100644 cpp/src/test_fir.cc
diff --git a/android/AndroidManifest.xml b/android/AndroidManifest.xml
index ad906e8..3099d62 100644
--- a/android/AndroidManifest.xml
+++ b/android/AndroidManifest.xml
@@ -2,7 +2,7 @@
 <manifest
     xmlns:android="http://schemas.android.com/apk/res/android"
     package="com.levien.synthesizer"
-    android:versionCode="4"
+    android:versionCode="5"
     android:versionName="0.93">
   <uses-sdk android:minSdkVersion="9" 
       android:targetSdkVersion="17"
diff --git a/android/jni/Android.mk b/android/jni/Android.mk
index b8057e0..a111be7 100644
--- a/android/jni/Android.mk
+++ b/android/jni/Android.mk
@@ -7,6 +7,7 @@ LOCAL_SRC_FILES := android_glue.cc \
                    dx7note.cc \
                    env.cc \
                    exp2.cc \
+                   fir.cc \
                    fm_core.cc \
                    fm_op_kernel.cc \
                    freqlut.cc \
@@ -23,7 +24,8 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
     LOCAL_ARM_NEON := true
     LOCAL_CFLAGS := -DHAVE_NEON=1
     LOCAL_SRC_FILES += neon_fm_kernel.s \
-                       neon_ladder.s
+                       neon_ladder.s \
+                       neon_fir.s
 endif
 
 # for native audio
@@ -45,7 +47,8 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
     LOCAL_ARM_NEON := true
     LOCAL_CFLAGS := -DHAVE_NEON=1
     LOCAL_SRC_FILES += neon_fm_kernel.s \
-                       neon_ladder.s
+                       neon_ladder.s \
+                       neon_fir.s
 endif
 
 LOCAL_CFLAGS += -O3
@@ -56,4 +59,21 @@ LOCAL_MODULE := test_neon
 
 include $(BUILD_EXECUTABLE)
 
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := test_fir.cc \
+  fir.cc
+
+ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
+    LOCAL_ARM_NEON := true
+    LOCAL_CFLAGS := -DHAVE_NEON=1
+    LOCAL_SRC_FILES += neon_fir.s
+endif
+
+LOCAL_CFLAGS += -O3
+LOCAL_STATIC_LIBRARIES += cpufeatures
+LOCAL_MODULE := test_fir
+include $(BUILD_EXECUTABLE)
+
+
 $(call import-module,android/cpufeatures)
diff --git a/cpp/src/fir.cc b/cpp/src/fir.cc
new file mode 100644
index 0000000..861e5f5
--- /dev/null
+++ b/cpp/src/fir.cc
@@ -0,0 +1,72 @@
+/*
+ * Copyright 2013 Google Inc.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Implementation of FIR filtering (convolution)
+
+#include <stdio.h> // for debugging, remove
+#include <stdlib.h>
+#include <malloc.h>
+
+#include "fir.h"
+
+// Should probably ifdef this to make it more portable
+void *malloc_aligned(size_t alignment, size_t nbytes) {
+  return memalign(alignment, nbytes);
+}
+
+SimpleFirFilter::SimpleFirFilter(const float *kernel, size_t nk) : nk(nk) {
+  k = (float *)malloc(nk * sizeof(k[0]));
+  for (size_t i = 0; i < nk; i++) {
+    k[i] = kernel[nk - i - 1];
+  }
+}
+
+SimpleFirFilter::~SimpleFirFilter() {
+  free(k);
+}
+
+void SimpleFirFilter::process(const float *in, float *out, size_t n) {
+  for (size_t i = 0; i < n; i++) {
+    float y = 0;
+    for (size_t j = 0; j < nk; j++) {
+      y += k[j] * in[i + j];
+    }
+    out[i] = y;
+  }
+}
+
+NeonFirFilter::NeonFirFilter(const float *kernel, size_t nk) : nk(nk) {
+  // TODO: handle odd size nk (must be multiple of 4)
+  k = (float *)malloc_aligned(16, nk * sizeof(k[0]));
+  for (size_t i = 0; i < nk; i += 4) {
+    for (size_t j = 0; j < 4; j++) {
+      k[i + j] = kernel[nk - i - 4 + j];
+    }
+  }
+}
+
+NeonFirFilter::~NeonFirFilter() {
+  free(k);
+}
+
+#ifdef HAVE_NEON
+extern "C"
+void neon_fir_direct(const float *in, const float *k, float *out, size_t n, size_t nk);
+#endif
+
+void NeonFirFilter::process(const float *in, float *out, size_t n) {
+  neon_fir_direct(in - 1, k, out, n, nk);
+}
diff --git a/cpp/src/fir.h b/cpp/src/fir.h
new file mode 100644
index 0000000..2dc3d7e
--- /dev/null
+++ b/cpp/src/fir.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2013 Google Inc.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// move to generic utility file?
+void *malloc_aligned(size_t alignment, size_t nbytes);
+
+// Abstract class
+class FirFilter {
+ public:
+  // preconditions:
+  // in + (nk - 1) is aligned to 128 bits
+  // out is aligned to 128 bits
+  virtual void process(const float *in, float *out, size_t n) = 0;
+};
+
+class SimpleFirFilter : public FirFilter {
+ public:
+  SimpleFirFilter(const float *kernel, size_t nk);
+  ~SimpleFirFilter();
+  void process(const float *in, float *out, size_t n);
+ private:
+  size_t nk;
+  float *k;
+};
+
+// should make conditional compilation?
+class NeonFirFilter : public FirFilter {
+ public:
+  NeonFirFilter(const float *kernel, size_t nk);
+  ~NeonFirFilter();
+  void process(const float *in, float *out, size_t n);
+ private:
+  size_t nk;
+  float *k;
+};
diff --git a/cpp/src/neon_fir.s b/cpp/src/neon_fir.s
new file mode 100644
index 0000000..b7d3131
--- /dev/null
+++ b/cpp/src/neon_fir.s
@@ -0,0 +1,97 @@
+@ Copyright 2013 Google Inc.
+@ 
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@ 
+@      http://www.apache.org/licenses/LICENSE-2.0
+@ 
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+
+@ NEON assembly implementation of FIR filter core
+
+	.text
+
+	.align 2
+	.global neon_fir_direct
+	.type neon_fir_direct, %function
+neon_fir_direct:
+@ r0 = pointer to input (aligned)
+@ r1 = pointer to kernel (aligned)
+@ r2 = pointer to output (aligned)
+@ r3  = size of input in floats (multiple of 4)
+@ stack[0] = size of kernel in floats (multiple of 4)
+
+	push {r4-r7}
+	ldr r4, [sp, #(4 * 4)]
+
+	lsl r6, r4, #2
+	sub r6, #16
+
+	@ compute initial overlap
+	mov r7, r4
+	vmov.i32 q9, #0
+	vmov.i32 q10, #0
+	vmov.i32 q11, #0
+neon_fir_direct1:
+	vld1.i32 {q0}, [r0:128]!  @ load 4 samples from input
+	vld1.i32 {q1}, [r1:128]!  @ load 4 samples from kernel
+	vmla.f32 q9, q1, d0[1]
+	vmla.f32 q10, q1, d1[0]
+	vmla.f32 q11, q1, d1[1]
+	subs r7, #4
+	bne neon_fir_direct1
+	vmov.i32 q12, #0
+	vext.32 q13, q9, q12, #3
+	vext.32 q1, q10, q12, #2
+	vadd.f32 q13, q1
+	vext.32 q1, q11, q12, #1
+	vadd.f32 q8, q13, q1
+	sub r0, r6
+	sub r1, r4, lsl #2
+
+neon_fir_direct2:
+
+	mov r7, r4
+	vmov.i32 q9, #0
+	vmov.i32 q10, #0
+	vmov.i32 q11, #0
+	@ inner loop
+neon_fir_direct3:
+	vld1.32 {q0}, [r0:128]!  @ load 4 samples from input
+	vld1.i32 {q1}, [r1:128]!  @ load 4 samples from kernel
+	vmla.f32 q9, q1, d0[1]
+	vmla.f32 q8, q1, d0[0]
+	vmla.f32 q10, q1, d1[0]
+	vmla.f32 q11, q1, d1[1]
+	subs r7, #4
+	bne neon_fir_direct3
+
+	@ process overlaps
+	vext.32 q0, q12, q9, #3
+	vext.32 q13, q9, q12, #3
+	vadd.f32 q8, q0
+	vext.32 q0, q12, q10, #2
+	vext.32 q1, q10, q12, #2
+	vadd.f32 q8, q0
+	vadd.f32 q13, q1
+	vext.32 q0, q12, q11, #1
+	vext.32 q1, q11, q12, #1
+	vadd.f32 q0, q8
+	vadd.f32 q8, q13, q1
+
+	sub r0, r6
+	sub r1, r4, lsl #2
+	vst1.32 {q0}, [r2:128]!
+
+	subs r3, #4
+	bne neon_fir_direct2
+
+	pop {r4-r7}
+	bx lr
+
+	.size neon_fir_direct, .-neon_fir_direct
diff --git a/cpp/src/test_fir.cc b/cpp/src/test_fir.cc
new file mode 100644
index 0000000..d26638a
--- /dev/null
+++ b/cpp/src/test_fir.cc
@@ -0,0 +1,125 @@
+/*
+ * Copyright 2013 Google Inc.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Little test app for measuring FIR speed
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <sys/time.h>
+#include <math.h>
+
+#include "fir.h"
+
+// clock_gettime would be a little better, but whatever
+double now() {
+  struct timeval tp;
+  gettimeofday(&tp, NULL);
+  return tp.tv_sec + 1e-6 * tp.tv_usec;
+}
+
+void condition_governor() {
+  // sleep for a bit to avoid thermal throttling
+  static uint32_t v = 0;
+  struct timespec ts;
+  ts.tv_sec = 0;
+  ts.tv_nsec = 900000000 + (v & 1); // 900ms
+  nanosleep(&ts, NULL);
+
+  // consume cpu a bit to try to coax max cpufreq
+  uint32_t x = v;
+  for (int i = 0; i < 10000000; i++) {
+    x += 42;
+    x += (x << 10);
+    x ^= (x >> 6);
+  }
+  // storing it in a static guarantees not optimizing out
+  v = x;
+}
+
+float *mkrandom(size_t size) {
+  float *result = (float *)malloc_aligned(16, size * sizeof(result[0]));
+  for (int i = 0; i < size; i++) {
+    result[i] = random() * (2.0 / RAND_MAX) - 1;
+  }
+  return result;
+}
+
+double test_accuracy(FirFilter *f1, FirFilter *f2, const float *inp, int nblock) {
+  float *out1 = (float *)malloc_aligned(16, nblock * sizeof(out1[0]));
+  float *out2 = (float *)malloc_aligned(16, nblock * sizeof(out2[0]));
+  f1->process(inp + 1, out1, nblock);
+  f2->process(inp + 1, out2, nblock);
+  double err = 0;
+  for (int i = 0; i < nblock; i++) {
+    //printf("%d: %f %f\n", i, out1[i], out2[i]);
+    err += fabs(out1[i] - out2[i]);
+  }
+  free(out1);
+  free(out2);
+  return err;
+}
+
+void benchfir(int size, int experiment) {
+  condition_governor();
+
+  const int nblock = 64;
+  float *kernel = mkrandom(size);
+  float *inp = mkrandom(size + nblock);
+  float *out = (float *)malloc_aligned(16, nblock * sizeof(out[0]));
+  FirFilter *f;
+
+  switch(experiment) {
+    case 0:
+      f = new SimpleFirFilter(kernel, size);
+      break;
+    case 1:
+      f = new NeonFirFilter(kernel, size);
+      break;
+  }
+
+
+  double start = now();
+  for (int j = 0; j < 15625; j++) {
+    f->process(inp + 1, out, 64);
+  }
+  double elapsed = now() - start;
+  printf("%i %f\n", size, 1e3 * elapsed);
+
+  FirFilter *fbase = new SimpleFirFilter(kernel, size);
+  double accuracy = test_accuracy(fbase, f, inp, nblock);
+  //printf("accuracy = %g\n", accuracy);
+
+  delete f;
+  delete fbase;
+  free(kernel);
+  free(inp);
+  free(out);
+}
+
+int main(int argc, char **argv) {
+  printf("set style data linespoints\n"
+    "set xlabel 'FIR kernel size'\n"
+    "set ylabel 'ns per sample'\n"
+    "plot '-' title 'scalar', '-' title '4x4 block'\n");
+  for (int experiment = 0; experiment < 2; experiment++) {
+    for (int i = 16; i <= 128; i += 16) {
+      benchfir(i, experiment);
+    }
+    printf("e\n");
+  }
+  return 0;
+}