Neon-optimized ladder filter

Highly optimized implementations of the ladder filter in Neon assembly. The audio loops (both linear and non-linear) are hooked up and running, while the matrix generation is still running in scalar code, though the Neon version has been tested and benchmarked. Performance numbers on Nexus 10: linear audio loop = 22.5 cycles. Nonlinear audio loop = 62 cycles. Matrix generation = 580 cycles. Note that the current code will crash on ARMv7 devices without Neon (for example, Motorola Xoom).
12 years ago · cc93548845
parent e3b5ceebfc
commit cc93548845
6 changed files with 453 additions and 26 deletions
--- a/android/jni/Android.mk
+++ b/android/jni/Android.mk
@ -22,7 +22,8 @@ LOCAL_SRC_FILES := android_glue.cc \
 ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
    LOCAL_ARM_NEON := true
    LOCAL_CFLAGS := -DHAVE_NEON=1
-    LOCAL_SRC_FILES += neon_fm_kernel.s
+    LOCAL_SRC_FILES += neon_fm_kernel.s \
                       neon_ladder.s
 endif
 # for native audio
@ -36,4 +37,23 @@ LOCAL_CFLAGS += -O3
 include $(BUILD_SHARED_LIBRARY)
-$(call import-module,android/cpufeatures)
+include $(CLEAR_VARS)
 LOCAL_SRC_FILES := test_neon.cc \
  resofilter.cc
 ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
    LOCAL_ARM_NEON := true
    LOCAL_CFLAGS := -DHAVE_NEON=1
    LOCAL_SRC_FILES += neon_fm_kernel.s \
                       neon_ladder.s
 endif
 LOCAL_CFLAGS += -O3
 LOCAL_STATIC_LIBRARIES += cpufeatures
 LOCAL_MODULE := test_neon
 include $(BUILD_EXECUTABLE)
 $(call import-module,android/cpufeatures)
--- a/cpp/src/neon_ladder.s
+++ b/cpp/src/neon_ladder.s
@ -0,0 +1,321 @@
    .text
    .align  2
    .global neon_ladder_nl
    .type   neon_ladder_nl, %function
@ neon_ladder_nl: nonlinear ladder filter audio loop (ARM NEON, GNU as syntax).
@ Processes one int32 sample per loop iteration.  Per sample:
@   sig(v) = v / sqrt(1 + (v * overdrive)^2)
@   state += A * sig(state) + B * sig(in - k * state[3])
@   out    = (int32)(state[3] * gain)
@ The reciprocal square root is a vrsqrte estimate refined by a single
@ vrsqrts (Newton-Raphson) step, so results are approximate.
 neon_ladder_nl:
@ Arguments:
@ r0 = pointer to input (int32 samples, read one at a time)
@ r1 = pointer to filter record, read in this order:
@        16 floats A (four quads -> q12..q15), 4 floats B (-> q11),
@        1 float k, 1 float overdrive, 1 float output gain
@      the A and B loads carry a :128 hint, so r1 must be 16-byte aligned
@ r2 = pointer to output (int32 samples)
@ r3 = number of samples; the body runs before the count is tested and the
@      loop only exits when r3 reaches exactly 0, so r3 must be >= 1
@ stack[0] = pointer to state (4 floats, loaded/stored with a :128 hint)
@ Register roles:
@q12 - q15 matrix (A)
@q11 B
@q0 input (only lane 0 is consumed)
@q1 state
@q3 const 1.0
@q6 overdrive (d12[0]), output gain (d12[1])
@q10 resonance (k), broadcast to all lanes
@q2, q4, q7, q8 scratch
 	push {r4-r5}
 	vpush {q4-q7}
 	@ 2 core registers + 4 quad registers were pushed = 18 words, so the
 	@ caller's first stack argument now sits at sp + 4 * 18
 	ldr r4, [sp, #(4 * 18)]
 	vld1.32 {q1}, [r4:128]
 	vld1.32 {q12}, [r1:128]!
 	vld1.32 {q13}, [r1:128]!
 	vld1.32 {q14}, [r1:128]!
 	vld1.32 {q15}, [r1:128]!
 	vld1.32 {q11}, [r1:128]!
 	@ k is replicated into every lane of q10
 	vld1.32 {d20[], d21[]}, [r1]!
 	@ overdrive -> d12[0], output gain -> d12[1] (d13 is left untouched)
 	vld1.32 {d12[0]}, [r1]!
 	vld1.32 {d12[1]}, [r1]!
 	adr r5, neon_ladder_nl_const
 	vld1.32 {q3}, [r5:128]!
 neon_ladder_nl_1:
 	@ cycle counting suggests that careful unrolling would
 	@ save about 10% - we will avoid the complexity for now
 	@ load one sample into lane 0; the conversion runs on the whole quad
 	@ but only lane 0 (d0[0]) is used further down
 	vld1.32 {d0[0]}, [r0]!
 	vcvt.f32.s32 q0, q0
 	@ compute resonance: q0 -= k * y3 (d3[1] is lane 3 of the state)
 	vmls.f32 q0, q10, d3[1]
 	@ q4 gets q1/sqrt(1 + (q1*overdrive)^2)
 	@ q0 gets q0/sqrt(1 + (q0*overdrive)^2)
 	@ the two chains (q2/q4 for the state, q7/q8 for the input) are
 	@ interleaved instruction by instruction
 	vmul.f32 q2, q1, d12[0]		@ q1 * overdrive
 	vmul.f32 q7, q0, d12[0]
 	vmul.f32 q2, q2, q2
 	vmul.f32 q7, q7, q7
 	vadd.f32 q2, q3				@ z = 1 + (q1 * overdrive)^2
 	vadd.f32 q7, q3
 	vrsqrte.f32 q4, q2			@ est ~ 1/sqrt(z)
 	vrsqrte.f32 q8, q7
 	vmul.f32 q2, q4, q2			@ z * est
 	vmul.f32 q7, q8, q7
 	vrsqrts.f32 q2, q2, q4 		@ (3 - z * est^2)/2
 	vrsqrts.f32 q7, q7, q8
 	vmul.f32 q4, q1				@ q1 * est
 	vmul.f32 q0, q8
 	vmul.f32 q4, q2 			@ q1 * est * (3 - z * est^2)/2
 	vmul.f32 q0, q7
 	@ state += A * sigmoid(state) + B * sigmoid(input), accumulated in two
 	@ partial sums (q1 and q2) that are added at the end
 	vmla.f32 q1, q12, d8[0]  @ sigmoid(y0)
 	vmul.f32 q2, q11, d0[0]  @ sigmoid(x - k * y3)
 	vmla.f32 q1, q13, d8[1]  @ sigmoid(y1)
 	vmla.f32 q2, q14, d9[0]  @ sigmoid(y2)
 	vmla.f32 q1, q15, d9[1]  @ sigmoid(y3)
 	vadd.f32 q1, q2
 	@ output = new state scaled by the gain, converted to int32; only
 	@ lane 3 (d9[1]) is written out
 	vmul.f32 q4, q1, d12[1]
 	vcvt.s32.f32 q4, q4
 	vst1.32 {d9[1]}, [r2]!
 	subs r3, #1
 	bne neon_ladder_nl_1
 	@ write the final state back for the next call
 	vst1.32 {q1}, [r4:128]
 	vpop {q4-q7}
 	pop {r4-r5}
 	bx lr
 	.size   neon_ladder_nl, .-neon_ladder_nl
 	@ constant quad of 1.0, loaded into q3 with a :128 hint, hence the
 	@ 16-byte alignment
 	.balign 16
 neon_ladder_nl_const:
 	.float 1.0, 1.0, 1.0, 1.0
    .align  2
    .global neon_ladder_lin
    .type   neon_ladder_lin, %function
@ neon_ladder_lin: linear ladder filter audio loop (ARM NEON, GNU as syntax).
@ Per sample: state = A * state + B * in; out = (int32)state[3].
@ Two implementations follow; the assembler only builds the one selected
@ by the .if below (currently the unrolled one in the .else branch).
 	.if 0
@ Simpler, not as optimized approach (26.3 cycles on N10)
@ Disabled reference version: one sample per loop iteration.
 neon_ladder_lin:
@ r0 = pointer to input
@ r1 = pointer to filter (A then B)
@ r2 = pointer to output
@ r3 = number of samples
@ stack[0] = pointer to state
@q12 - q15 matrix (A)
@q11 B
@q0 input
@q1 state
 	push {r4-r5}
 	@ two words were pushed, so the first stack argument is at sp + 4 * 2
 	ldr r4, [sp, #(4 * 2)]
 	vld1.32 {q1}, [r4:128]
 	vld1.32 {q12}, [r1:128]!
 	vld1.32 {q13}, [r1:128]!
 	vld1.32 {q14}, [r1:128]!
 	vld1.32 {q15}, [r1:128]!
 	vld1.32 {q11}, [r1:128]!
 neon_ladder_lin_1:
 	vld1.32 {d0[0]}, [r0]!
 	@ copy the old state to q3 so q1 can be rebuilt from scratch
 	vmov q3, q1
 	vcvt.f32.s32 q0, q0
 	vmul.f32 q1, q12, d6[0]  @ y0
 	vmul.f32 q2, q13, d6[1]  @ y1
 	vmla.f32 q1, q11, d0[0]  @ x
 	vmla.f32 q2, q14, d7[0]  @ y2
 	vmla.f32 q1, q15, d7[1]  @ y3
 	vadd.f32 q1, q2
 	@ output is lane 3 of the new state, converted to int32
 	vcvt.s32.f32 q3, q1
 	vst1.32 {d7[1]}, [r2]!
 	subs r3, #1
 	bne neon_ladder_lin_1
 	vst1.32 {q1}, [r4:128]
 	pop {r4-r5}
 	bx lr
 .else
@ Unrolled, more highly optimized loop (22.5 cycles on N10)
@ Handles four samples per loop iteration.  The state alternates between
@ q1 and q2 from one sample to the next, and the int32 conversion/store of
@ each output is interleaved with the arithmetic for the following sample.
 neon_ladder_lin:
@ r0 = pointer to input (int32; four samples are loaded at a time with a
@      :128 hint, so the buffer must be 16-byte aligned)
@ r1 = pointer to filter (A then B, five quads loaded with :128 hints)
@ r2 = pointer to output (int32)
@ r3 = number of samples; decremented by 4 per iteration and the loop
@      exits only when it reaches exactly 0
@      NOTE(review): this means the count must be a positive multiple
@      of 4 - confirm that every caller guarantees it
@ stack[0] = pointer to state (4 floats, :128 aligned)
@q12 - q15 matrix (A)
@q11 B
@q0 input (four samples, one per lane)
@q1 state (on entry to and exit from each iteration)
@q2, q3, q9 scratch
 	push {r4-r5}
 	@ two words were pushed, so the first stack argument is at sp + 4 * 2
 	ldr r4, [sp, #(4 * 2)]
 	vld1.32 {q1}, [r4:128]
 	vld1.32 {q12}, [r1:128]!
 	vld1.32 {q13}, [r1:128]!
 	vld1.32 {q14}, [r1:128]!
 	vld1.32 {q15}, [r1:128]!
 	vld1.32 {q11}, [r1:128]!
 neon_ladder_lin_1:
 	vld1.32 {q0}, [r0:128]!
 	vcvt.f32.s32 q0, q0
 	@ sample 0: old state in q1, new state built in q2
 	vmul.f32 q9, q12, d2[0]  @ y0
 	vmul.f32 q2, q13, d2[1]  @ y1
 	vmla.f32 q9, q11, d0[0]  @ x
 	vmla.f32 q2, q14, d3[0]  @ y2
 	vmla.f32 q9, q15, d3[1]  @ y3
 	vadd.f32 q2, q9
 	@ sample 1: old state in q2, new state built in q1; the output of
 	@ sample 0 is converted and stored along the way
 	vmul.f32 q1, q11, d0[1]  @ x
 	vmul.f32 q9, q12, d4[0]  @ y0
 	vmla.f32 q1, q13, d4[1]  @ y1
 	vmla.f32 q9, q14, d5[0]  @ y2
 	vcvt.s32.f32 q3, q2
 	vmla.f32 q1, q15, d5[1]  @ y3
 	vst1.32 {d7[1]}, [r2]!
 	vadd.f32 q1, q9
 	@ sample 2: old state in q1, new state built in q2
 	vmul.f32 q9, q11, d1[0]  @ x
 	vmul.f32 q2, q12, d2[0]  @ y0
 	vmla.f32 q9, q13, d2[1]  @ y1
 	vmla.f32 q2, q14, d3[0]  @ y2
 	vcvt.s32.f32 q3, q1
 	vmla.f32 q9, q15, d3[1]  @ y3
 	vst1.32 {d7[1]}, [r2]!
 	vadd.f32 q2, q9
 	@ sample 3: old state in q2, new state built in q1
 	vmul.f32 q1, q11, d1[1]  @ x
 	vmul.f32 q9, q12, d4[0]  @ y0
 	vmla.f32 q1, q13, d4[1]  @ y1
 	vmla.f32 q9, q14, d5[0]  @ y2
 	vcvt.s32.f32 q3, q2
 	vmla.f32 q1, q15, d5[1]  @ y3
 	vst1.32 {d7[1]}, [r2]!
 	vadd.f32 q1, q9
 	@ output of sample 3; the state is back in q1 for the next iteration
 	vcvt.s32.f32 q3, q1
 	vst1.32 {d7[1]}, [r2]!
 	subs r3, #4
 	bne neon_ladder_lin_1
 	@ write the final state back for the next call
 	vst1.32 {q1}, [r4:128]
 	pop {r4-r5}
 	bx lr
 .endif
 	.size   neon_ladder_lin, .-neon_ladder_lin
    .global neon_ladder_mkmatrix
    .type   neon_ladder_mkmatrix, %function
@ neon_ladder_mkmatrix: build the filter coefficients (A then B) from the
@ two parameters a and k.  Outline, as the code below performs it:
@   1. form the Jacobian in a compact two-register representation
@   2. accumulate I + J + J^2/2 + J^3/6 + J^4/24 (coefficients come from
@      neon_ladder_mkmatrix_const)
@   3. square the result four times (i.e. raise it to the 16th power)
@   4. expand the compact form into four full columns and store them
@   5. store B = (1 - sum of the four stored columns) / (1 + k)
 neon_ladder_mkmatrix:
@ r0 = pointer to params (a, k)
@ r1 = out pointer to matrix (A then B, just like consumer);
@      five quads = 20 floats are written, with no alignment hint
@ r2, r3 are used as scratch (constant pointer, loop counter)
 	vpush {q4-q7}
 	@ build q0 = {-a, a, 0, 0}: d0 = {a, a}, d1 = 0, then lane 0 is
 	@ replaced by the low lane of d2 = -d0 (s4 aliases d2[0])
 	vld1.32 {d0[]}, [r0]!  @ a
 	vmov.i32 d1, #0
 	vneg.f32 d2, d0
 	vmov.f32 s0, s4
 	@ build q2 = {0, -k * a, 0, 0}: s12 aliases d6[0] (k), s9 aliases d4[1]
 	vld1.32 {d6[0]}, [r0]  @ k
 	vmov.i32 q2, #0
 	vmul.f32 s9, s12, s0
 	adr r2, neon_ladder_mkmatrix_const
 	@ q1 = {1.0, 0, 0, 0}; r2 is left pointing at the 0.5 entry
 	vmov.i32 q1, #0
 	vld1.32 {d2[0]}, [r2]!
 	@ series accumulator starts as identity + Jacobian
 	vadd.f32 q14, q1, q0
 	vmov q15, q2
 	vmov q6, q0
 	vmov q7, q2
 	@ (q0, q2) is jacobian matrix, (q14, q15) is series accum
 	@ (q6, q7) is jacobian ^ i
 	@ three passes add the terms weighted 0.5, .16666667 and .041666667
 	mov r3, #3
 neon_ladder_mkmatrix1:
 	@ (q12, q13) = (q6, q7) multiplied by the Jacobian (q0, q2); the
 	@ vext results q8..q10 are rotated views of the running power
 	vext.32 q8, q7, q6, #3
 	vmul.f32 q12, q6, d0[0]
 	vmul.f32 q13, q8, d4[1]
 	vext.32 q9, q7, q6, #2
 	vmla.f32 q12, q8, d0[1]
 	vmla.f32 q13, q9, d5[0]
 	@ NOTE(review): this vext takes q6 for both sources, whereas the
 	@ matching step in the squaring loop below uses (q2, q0) - confirm
 	@ that the difference is intentional
 	vext.32 q10, q6, q6, #1
 	vmla.f32 q12, q9, d1[0]
 	vmla.f32 q13, q10, d5[1]
 	vmla.f32 q12, q10, d1[1]
 	vmla.f32 q13, q7, d0[0]
 	@ fetch the next series coefficient and accumulate the new power
 	vld1.32 {d6[0]}, [r2]!
 	vmla.f32 q14, q12, d6[0]
 	vmov q6, q12
 	vmla.f32 q15, q13, d6[0]
 	vmov q7, q13
 	subs r3, #1
 	bne neon_ladder_mkmatrix1
 	@ continue with the accumulated series in (q0, q2)
 	vmov q0, q14
 	vmov q2, q15
 	@ four squarings in a row
 	mov r3, #4
 neon_ladder_mkmatrix2:
 	vext.32 q8, q2, q0, #3
 	@ q0 = {a11, a21, a31, a41}, q2 = {0, a14, a24, a34}
 	@ square the matrix
 	vmul.f32 q12, q0, d0[0]
 	vmul.f32 q13, q8, d4[1]
 	vext.32 q9, q2, q0, #2
 	vmla.f32 q12, q8, d0[1]
 	vmla.f32 q13, q9, d5[0]
 	vext.32 q10, q2, q0, #1
 	vmla.f32 q12, q9, d1[0]
 	vmla.f32 q13, q10, d5[1]
 	vmla.f32 q12, q10, d1[1]
 	vmla.f32 q13, q2, d0[0]
 	vmov q0, q12
 	vmov q2, q13
 	subs r3, #1
 	bne neon_ladder_mkmatrix2
 	@ unwrap toeplitz matrix into the full form
 	@ (four quads are stored: q0 itself, then q8, q9, q10 built from
 	@ (q2, q0) with vext offsets 3, 2 and 1)
 	vst1.32 {q0}, [r1]!
 	vext.32 q8, q2, q0, #3
 	vst1.32 {q8}, [r1]!
 	vext.32 q9, q2, q0, #2
 	vst1.32 {q9}, [r1]!
 	vext.32 q10, q2, q0, #1
 	vst1.32 {q10}, [r1]!
 	@ B: q1 = {1, 1, 1, 1} (first table entry replicated)
 	adr r2, neon_ladder_mkmatrix_const
 	vld1.32 {d2[], d3[]}, [r2]
 	@ r0 was advanced past a by the first load, so it points at k here
 	vld1.32 {d4[0]}, [r0]  @ k
 	vadd.f32 d4, d2
 	@ reciprocal of (1 + k): estimate plus one vrecps refinement step;
 	@ d2 is reused for the correction factor, q1's upper half (d3) is not
 	@ read again after the vsub below
 	vrecpe.f32 d6, d4
 	vrecps.f32 d2, d6, d4
 	@ q15 = 1 - (q0 + q8) - (q9 + q10), i.e. one minus the lane-wise sum
 	@ of the four stored quads
 	vadd.f32 q0, q8
 	vadd.f32 q9, q10
 	vsub.f32 q15, q1, q0
 	vmul.f32 d6, d2  @ 1 / (1 + k)
 	vsub.f32 q15, q9
 	vmul.f32 q15, d6[0]
 	vst1.32 {q15}, [r1]!
 	vpop {q4-q7}
 	bx lr
 	.size   neon_ladder_mkmatrix, .-neon_ladder_mkmatrix
 	@ series coefficients: 1.0 doubles as the constant one, the other
 	@ three entries weight the 2nd, 3rd and 4th powers in the first loop
 	.balign 16
 neon_ladder_mkmatrix_const:
 	.float 1.0, 0.5, .16666667, .041666667
--- a/cpp/src/resofilter.cc
+++ b/cpp/src/resofilter.cc
@ -31,6 +31,15 @@
 #include "aligned_buf.h"
 #include "resofilter.h"
 #ifdef HAVE_NEON
 extern "C"
 void neon_ladder_nl(const int32_t *in, const float *a, int32_t *out, int count,
  float *state);
 extern "C"
 void neon_ladder_lin(const int32_t *in, const float *a, int32_t *out, int count,
  float *state);
 #endif
 double this_sample_rate;
 void ResoFilter::init(double sample_rate) {
@ -39,7 +48,7 @@ void ResoFilter::init(double sample_rate) {
 ResoFilter::ResoFilter() {
  for (int i = 0; i < 4; i++) {
-    x[i] = 0;
+    x.get()[i] = 0;
 #if defined(NONLINEARITY)
    w[i] = 0;
 #endif
@ -188,14 +197,28 @@ void ResoFilter::process(const int32_t **inbufs, const int32_t *control_in,
  float overdrive = control_in[2] * (1.0 / (1 << 24));
  const int32_t *ibuf = inbufs[0];
  int32_t *obuf = outbufs[0];
  bool useneon = false;
 #ifdef HAVE_NEON
  useneon = true;  // TODO: detect
 #endif
  if (overdrive == 0) {
-    for (int i = 0; i < n; i++) {
+    if (useneon) {
-      float signal = ibuf[i];
+#ifdef HAVE_NEON
-      float tmp[4];
+      AlignedBuf<float, 20> a_neon;
-      matvec4(tmp, a.get() + 4, x);
+      matcopy(a_neon.get(), a.get() + 4, 16);
-      for (int k = 0; k < 4; k++) {
+      matcopy(a_neon.get() + 16, a.get(), 4);
-        x[k] = tmp[k] + signal * a.get()[k];
+      neon_ladder_lin(ibuf, a_neon.get(), obuf, n, x.get());
-        obuf[i] = x[3];
+#endif
    } else {
      for (int i = 0; i < n; i++) {
        float signal = ibuf[i];
        float tmp[4];
        matvec4(tmp, a.get() + 4, x.get());
        for (int k = 0; k < 4; k++) {
          x.get()[k] = tmp[k] + signal * a.get()[k];
          obuf[i] = x.get()[3];
        }
      }
    }
  } else {
@ -205,18 +228,31 @@ void ResoFilter::process(const int32_t **inbufs, const int32_t *control_in,
      a.get()[4 + 5 * i] -= 1.0;
      a.get()[16 + i] += k * a.get()[i];
    }
-    for (int i = 0; i < n; i++) {
+    if (useneon) {
-      float signal = ibuf[i];
+#ifdef HAVE_NEON
-      float tmp[4];
+      // Neon implementation has A first, then B
-      float tx[4];
+      AlignedBuf<float, 23> a_neon;
-      for (int j = 0; j < 4; j++) {
+      matcopy(a_neon.get(), a.get() + 4, 16);
-        tx[j] = sigmoid(x[j], overdrive);
+      matcopy(a_neon.get() + 16, a.get(), 4);
-      }
+      a_neon.get()[20] = k;
-      matvec4(tmp, a.get() + 4, tx);
+      a_neon.get()[21] = overdrive * (1.0 / (1 << 24));
-      float xin = sigmoid(signal - k * x[3], overdrive);
+      a_neon.get()[22] = ogain;
-      for (int j = 0; j < 4; j++) {
+      neon_ladder_nl(ibuf, a_neon.get(), obuf, n, x.get());
-        x[j] += tmp[j] + xin * a.get()[j];
+#endif
-        obuf[i] = x[3] * ogain;
+    } else {
      for (int i = 0; i < n; i++) {
        float signal = ibuf[i];
        float tmp[4];
        float tx[4];
        for (int j = 0; j < 4; j++) {
          tx[j] = sigmoid(x.get()[j], overdrive);
        }
        matvec4(tmp, a.get() + 4, tx);
        float xin = sigmoid(signal - k * x.get()[3], overdrive);
        for (int j = 0; j < 4; j++) {
          x.get()[j] += tmp[j] + xin * a.get()[j];
          obuf[i] = x.get()[3] * ogain;
        }
      }
    }
  }
@ -288,3 +324,25 @@ void ResoFilter::process(const int32_t **inbufs, const int32_t *control_in,
 #endif
 }
 #endif  // USE_MATRIX
 // Benchmark driver for the Neon ladder filter kernels.
 //
 // Runs the selected kernel `niter` times over a 64-sample block.
 //   niter     - number of kernel invocations
 //   nonlinear - true: neon_ladder_nl, false: neon_ladder_lin
 //
 // The filter record is 23 floats: entries 0, 5, 10 and 15 are 1.0 (the
 // diagonal of the 4x4 A matrix) and everything else, including B, k,
 // overdrive and gain, is 0.  The input block and the filter state are
 // zero-filled so that every run operates on the same, well-defined data
 // instead of whatever happened to be on the stack.
 // Without HAVE_NEON the function does nothing.
 void reso_benchmark(int niter, bool nonlinear) {
 #ifdef HAVE_NEON
  AlignedBuf<float, 23> a_neon;
  for (int i = 0; i < 23; i++) {
    float y = 0.0;
    if (i < 20 && (i % 5) == 0) y = 1.0;
    a_neon.get()[i] = y;
  }
  const int n = 64;  // multiple of 4, as the unrolled linear loop consumes 4 samples per pass
  AlignedBuf<int32_t, n> inbuf;
  AlignedBuf<int32_t, n> outbuf;
  AlignedBuf<float, 4> x;
  // The kernels read the whole input block and the previous state; give
  // both a defined value before the first call.
  for (int i = 0; i < n; i++) {
    inbuf.get()[i] = 0;
  }
  for (int i = 0; i < 4; i++) {
    x.get()[i] = 0;
  }
  for (int i = 0; i < niter; i++) {
    if (nonlinear) {
      neon_ladder_nl(inbuf.get(), a_neon.get(), outbuf.get(), n, x.get());
    } else {
      neon_ladder_lin(inbuf.get(), a_neon.get(), outbuf.get(), n, x.get());
    }
  }
 #else
  // Nothing to benchmark without the Neon kernels.
  (void)niter;
  (void)nonlinear;
 #endif
 }
--- a/cpp/src/resofilter.h
+++ b/cpp/src/resofilter.h
@ -31,7 +31,7 @@ class ResoFilter : Module {
 			   const int32_t *control_last, int32_t **outbufs);
 private:
 #if defined(USE_MATRIX)
-  float x[4];
+  AlignedBuf<float, 4> x;
 #else
  int32_t x[4];
 #if defined(NONLINEARITY)
--- a/cpp/src/synth_unit.cc
+++ b/cpp/src/synth_unit.cc
@ -250,7 +250,7 @@ void SynthUnit::GetSamples(int n_samples, int16_t *buffer) {
  for (; i < n_samples; i += N) {
    AlignedBuf<int32_t, N> audiobuf;
-    int32_t audiobuf2[N];
+    AlignedBuf<int32_t, N> audiobuf2;
    for (int j = 0; j < N; ++j) {
      audiobuf.get()[j] = 0;
    }
@ -263,11 +263,11 @@ void SynthUnit::GetSamples(int n_samples, int16_t *buffer) {
      }
    }
    const int32_t *bufs[] = { audiobuf.get() };
-    int32_t *bufs2[] = { audiobuf2 };
+    int32_t *bufs2[] = { audiobuf2.get() };
    filter_.process(bufs, filter_control_, filter_control_, bufs2);
    int jmax = n_samples - i;
    for (int j = 0; j < N; ++j) {
-      int32_t val = audiobuf2[j] >> 4;
+      int32_t val = audiobuf2.get()[j] >> 4;
      int clip_val = val < -(1 << 24) ? 0x8000 : val >= (1 << 24) ? 0x7fff :
        val >> 9;
      // TODO: maybe some dithering?
--- a/cpp/src/test_neon.cc
+++ b/cpp/src/test_neon.cc
@ -0,0 +1,28 @@
 #include <stdio.h>
 #include "aligned_buf.h"
 void reso_benchmark(int n, bool nonlinear);
 #ifdef HAVE_NEON
 extern "C" void neon_ladder_mkmatrix(const float *in, float *out);
 #endif
 int main(int argc, char** argv) {
 #ifdef HAVE_NEON
 	// Time the Neon matrix generator by calling it repeatedly on fixed
 	// parameters, then dump the resulting 20 coefficients as 4 rows of
 	// 5 values (element [col * 4 + row] of the output buffer).
 	const int kIterations = 10000000;
 	float params[2] = {0.1, 3.9};
 	AlignedBuf<float, 20> matrix;
 	int remaining = kIterations;
 	while (remaining > 0) {
 		neon_ladder_mkmatrix(params, matrix.get());
 		--remaining;
 	}
 	for (int row = 0; row < 4; ++row) {
 		for (int col = 0; col < 5; ++col) {
 			printf("%6f ", matrix.get()[col * 4 + row]);
 		}
 		printf("\n");
 	}
 #endif
 	//reso_benchmark(1000000, false);
 	return 0;
 }