FIR filter implementation

This is a FIR filter with NEON speedup. The NEON version has been tested for accuracy against the scalar one, and there is simple benchmarking code in place as well.
11 years ago · 287427513c
parent 2d433d1056
commit 287427513c
6 changed files with 365 additions and 3 deletions
--- a/android/AndroidManifest.xml
+++ b/android/AndroidManifest.xml
@ -2,7 +2,7 @@
 <manifest
    xmlns:android="http://schemas.android.com/apk/res/android"
    package="com.levien.synthesizer"
-    android:versionCode="4"
+    android:versionCode="5"
    android:versionName="0.93">
  <uses-sdk android:minSdkVersion="9" 
      android:targetSdkVersion="17"
--- a/android/jni/Android.mk
+++ b/android/jni/Android.mk
@ -7,6 +7,7 @@ LOCAL_SRC_FILES := android_glue.cc \
                   dx7note.cc \
                   env.cc \
                   exp2.cc \
                   fir.cc \
                   fm_core.cc \
                   fm_op_kernel.cc \
                   freqlut.cc \
@ -23,7 +24,8 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
    LOCAL_ARM_NEON := true
    LOCAL_CFLAGS := -DHAVE_NEON=1
    LOCAL_SRC_FILES += neon_fm_kernel.s \
-                       neon_ladder.s
+                       neon_ladder.s \
                       neon_fir.s
 endif
 # for native audio
@ -45,7 +47,8 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
    LOCAL_ARM_NEON := true
    LOCAL_CFLAGS := -DHAVE_NEON=1
    LOCAL_SRC_FILES += neon_fm_kernel.s \
-                       neon_ladder.s
+                       neon_ladder.s \
                       neon_fir.s
 endif
 LOCAL_CFLAGS += -O3
@ -56,4 +59,21 @@ LOCAL_MODULE := test_neon
 include $(BUILD_EXECUTABLE)
 include $(CLEAR_VARS)
 LOCAL_SRC_FILES := test_fir.cc \
  fir.cc
 ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
    LOCAL_ARM_NEON := true
    LOCAL_CFLAGS := -DHAVE_NEON=1
    LOCAL_SRC_FILES += neon_fir.s
 endif
 LOCAL_CFLAGS += -O3
 LOCAL_STATIC_LIBRARIES += cpufeatures
 LOCAL_MODULE := test_fir
 include $(BUILD_EXECUTABLE)
 $(call import-module,android/cpufeatures)
--- a/cpp/src/fir.cc
+++ b/cpp/src/fir.cc
@ -0,0 +1,72 @@
 /*
 * Copyright 2013 Google Inc.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 // Implementation of FIR filtering (convolution)
 #include <stdio.h> // for debugging, remove
 #include <stdlib.h>
 #include <malloc.h>
 #include "fir.h"
 // Should probably ifdef this to make it more portable
 void *malloc_aligned(size_t alignment, size_t nbytes) {
  return memalign(alignment, nbytes);
 }
 SimpleFirFilter::SimpleFirFilter(const float *kernel, size_t nk) : nk(nk) {
  k = (float *)malloc(nk * sizeof(k[0]));
  for (size_t i = 0; i < nk; i++) {
    k[i] = kernel[nk - i - 1];
  }
 }
 SimpleFirFilter::~SimpleFirFilter() {
  free(k);
 }
 void SimpleFirFilter::process(const float *in, float *out, size_t n) {
  for (size_t i = 0; i < n; i++) {
    float y = 0;
    for (size_t j = 0; j < nk; j++) {
      y += k[j] * in[i + j];
    }
    out[i] = y;
  }
 }
 NeonFirFilter::NeonFirFilter(const float *kernel, size_t nk) : nk(nk) {
  // TODO: handle odd size nk (must be multiple of 4)
  k = (float *)malloc_aligned(16, nk * sizeof(k[0]));
  for (size_t i = 0; i < nk; i += 4) {
    for (size_t j = 0; j < 4; j++) {
      k[i + j] = kernel[nk - i - 4 + j];
    }
  }
 }
 NeonFirFilter::~NeonFirFilter() {
  free(k);
 }
 #ifdef HAVE_NEON
 extern "C"
 void neon_fir_direct(const float *in, const float *k, float *out, size_t n, size_t nk);
 #endif
 void NeonFirFilter::process(const float *in, float *out, size_t n) {
  neon_fir_direct(in - 1, k, out, n, nk);
 }
--- a/cpp/src/fir.h
+++ b/cpp/src/fir.h
@ -0,0 +1,48 @@
 /*
 * Copyright 2013 Google Inc.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 // move to generic utility file?
 void *malloc_aligned(size_t alignment, size_t nbytes);
 // Abstract class
 class FirFilter {
 public:
  // preconditions:
  // in + (nk - 1) is aligned to 128 bits
  // out is aligned to 128 bits
  virtual void process(const float *in, float *out, size_t n) = 0;
 };
 class SimpleFirFilter : public FirFilter {
 public:
  SimpleFirFilter(const float *kernel, size_t nk);
  ~SimpleFirFilter();
  void process(const float *in, float *out, size_t n);
 private:
  size_t nk;
  float *k;
 };
 // should make conditional compilation?
 class NeonFirFilter : public FirFilter {
 public:
  NeonFirFilter(const float *kernel, size_t nk);
  ~NeonFirFilter();
  void process(const float *in, float *out, size_t n);
 private:
  size_t nk;
  float *k;
 };
--- a/cpp/src/neon_fir.s
+++ b/cpp/src/neon_fir.s
@ -0,0 +1,97 @@
@ Copyright 2013 Google Inc.
@ 
@ Licensed under the Apache License, Version 2.0 (the "License");
@ you may not use this file except in compliance with the License.
@ You may obtain a copy of the License at
@ 
@      http://www.apache.org/licenses/LICENSE-2.0
@ 
@ Unless required by applicable law or agreed to in writing, software
@ distributed under the License is distributed on an "AS IS" BASIS,
@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ See the License for the specific language governing permissions and
@ limitations under the License.
@ NEON assembly implementation of FIR filter core
 	.text
 	.align 2
 	.global neon_fir_direct
 	.type neon_fir_direct, %function
 neon_fir_direct:
@ r0 = pointer to input (aligned)
@ r1 = pointer to kernel (aligned)
@ r2 = pointer to output (aligned)
@ r3  = size of input in floats (multiple of 4)
@ stack[0] = size of kernel in floats (multiple of 4)
 	push {r4-r7}
 	ldr r4, [sp, #(4 * 4)]
 	lsl r6, r4, #2
 	sub r6, #16
 	@ compute initial overlap
 	mov r7, r4
 	vmov.i32 q9, #0
 	vmov.i32 q10, #0
 	vmov.i32 q11, #0
 neon_fir_direct1:
 	vld1.i32 {q0}, [r0:128]!  @ load 4 samples from input
 	vld1.i32 {q1}, [r1:128]!  @ load 4 samples from kernel
 	vmla.f32 q9, q1, d0[1]
 	vmla.f32 q10, q1, d1[0]
 	vmla.f32 q11, q1, d1[1]
 	subs r7, #4
 	bne neon_fir_direct1
 	vmov.i32 q12, #0
 	vext.32 q13, q9, q12, #3
 	vext.32 q1, q10, q12, #2
 	vadd.f32 q13, q1
 	vext.32 q1, q11, q12, #1
 	vadd.f32 q8, q13, q1
 	sub r0, r6
 	sub r1, r4, lsl #2
 neon_fir_direct2:
 	mov r7, r4
 	vmov.i32 q9, #0
 	vmov.i32 q10, #0
 	vmov.i32 q11, #0
 	@ inner loop
 neon_fir_direct3:
 	vld1.32 {q0}, [r0:128]!  @ load 4 samples from input
 	vld1.i32 {q1}, [r1:128]!  @ load 4 samples from kernel
 	vmla.f32 q9, q1, d0[1]
 	vmla.f32 q8, q1, d0[0]
 	vmla.f32 q10, q1, d1[0]
 	vmla.f32 q11, q1, d1[1]
 	subs r7, #4
 	bne neon_fir_direct3
 	@ process overlaps
 	vext.32 q0, q12, q9, #3
 	vext.32 q13, q9, q12, #3
 	vadd.f32 q8, q0
 	vext.32 q0, q12, q10, #2
 	vext.32 q1, q10, q12, #2
 	vadd.f32 q8, q0
 	vadd.f32 q13, q1
 	vext.32 q0, q12, q11, #1
 	vext.32 q1, q11, q12, #1
 	vadd.f32 q0, q8
 	vadd.f32 q8, q13, q1
 	sub r0, r6
 	sub r1, r4, lsl #2
 	vst1.32 {q0}, [r2:128]!
 	subs r3, #4
 	bne neon_fir_direct2
 	pop {r4-r7}
 	bx lr
 	.size neon_fir_direct, .-neon_fir_direct
--- a/cpp/src/test_fir.cc
+++ b/cpp/src/test_fir.cc
@ -0,0 +1,125 @@
 /*
 * Copyright 2013 Google Inc.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 // Little test app for measuring FIR speed
 #include <stdlib.h>
 #include <stdio.h>
 #include <time.h>
 #include <sys/time.h>
 #include <math.h>
 #include "fir.h"
 // clock_gettime would be a little better, but whatever
 double now() {
  struct timeval tp;
  gettimeofday(&tp, NULL);
  return tp.tv_sec + 1e-6 * tp.tv_usec;
 }
 void condition_governor() {
  // sleep for a bit to avoid thermal throttling
  static uint32_t v = 0;
  struct timespec ts;
  ts.tv_sec = 0;
  ts.tv_nsec = 900000000 + (v & 1); // 900ms
  nanosleep(&ts, NULL);
  // consume cpu a bit to try to coax max cpufreq
  uint32_t x = v;
  for (int i = 0; i < 10000000; i++) {
    x += 42;
    x += (x << 10);
    x ^= (x >> 6);
  }
  // storing it in a static guarantees not optimizing out
  v = x;
 }
 float *mkrandom(size_t size) {
  float *result = (float *)malloc_aligned(16, size * sizeof(result[0]));
  for (int i = 0; i < size; i++) {
    result[i] = random() * (2.0 / RAND_MAX) - 1;
  }
  return result;
 }
 double test_accuracy(FirFilter *f1, FirFilter *f2, const float *inp, int nblock) {
  float *out1 = (float *)malloc_aligned(16, nblock * sizeof(out1[0]));
  float *out2 = (float *)malloc_aligned(16, nblock * sizeof(out2[0]));
  f1->process(inp + 1, out1, nblock);
  f2->process(inp + 1, out2, nblock);
  double err = 0;
  for (int i = 0; i < nblock; i++) {
    //printf("%d: %f %f\n", i, out1[i], out2[i]);
    err += fabs(out1[i] - out2[i]);
  }
  free(out1);
  free(out2);
  return err;
 }
 void benchfir(int size, int experiment) {
  condition_governor();
  const int nblock = 64;
  float *kernel = mkrandom(size);
  float *inp = mkrandom(size + nblock);
  float *out = (float *)malloc_aligned(16, nblock * sizeof(out[0]));
  FirFilter *f;
  switch(experiment) {
    case 0:
      f = new SimpleFirFilter(kernel, size);
      break;
    case 1:
      f = new NeonFirFilter(kernel, size);
      break;
  }
  double start = now();
  for (int j = 0; j < 15625; j++) {
    f->process(inp + 1, out, 64);
  }
  double elapsed = now() - start;
  printf("%i %f\n", size, 1e3 * elapsed);
  FirFilter *fbase = new SimpleFirFilter(kernel, size);
  double accuracy = test_accuracy(fbase, f, inp, nblock);
  //printf("accuracy = %g\n", accuracy);
  delete f;
  delete fbase;
  free(kernel);
  free(inp);
  free(out);
 }
 int main(int argc, char **argv) {
  printf("set style data linespoints\n"
    "set xlabel 'FIR kernel size'\n"
    "set ylabel 'ns per sample'\n"
    "plot '-' title 'scalar', '-' title '4x4 block'\n");
  for (int experiment = 0; experiment < 2; experiment++) {
    for (int i = 16; i <= 128; i += 16) {
      benchfir(i, experiment);
    }
    printf("e\n");
  }
  return 0;
 }