Neon-optimized ladder filter

Extremely highly optimized implementations of the ladder filter in Neon assembler. The audio loops (both linear and non-linear) are hooked up and running, while the matrix generation is still running in scalar code, though the Neon version has been tested and benchmarked. Performance numbers on Nexus 10: linear audio loop = 22.5 cycles. Nonlinear audio loop = 62 cycles. Matrix generation = 580 cycles. Note that the current code will crash on ARM v7 devices without Neon (for example, Motorola Xoom).
12 years ago · cc93548845
parent e3b5ceebfc
commit cc93548845
6 changed files with 453 additions and 26 deletions
--- a/android/jni/Android.mk
+++ b/android/jni/Android.mk
@ -22,7 +22,8 @@ LOCAL_SRC_FILES := android_glue.cc \
 ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
    LOCAL_ARM_NEON := true
    LOCAL_CFLAGS := -DHAVE_NEON=1
-    LOCAL_SRC_FILES += neon_fm_kernel.s
+    LOCAL_SRC_FILES += neon_fm_kernel.s \
+                       neon_ladder.s
 endif

 # for native audio
@ -36,4 +37,23 @@ LOCAL_CFLAGS += -O3

 include $(BUILD_SHARED_LIBRARY)

+include $(CLEAR_VARS)
+LOCAL_SRC_FILES := test_neon.cc \
+  resofilter.cc
+
+ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
+    LOCAL_ARM_NEON := true
+    LOCAL_CFLAGS := -DHAVE_NEON=1
+    LOCAL_SRC_FILES += neon_fm_kernel.s \
+                       neon_ladder.s
+endif
+
+LOCAL_CFLAGS += -O3
+
+LOCAL_STATIC_LIBRARIES += cpufeatures
+
+LOCAL_MODULE := test_neon
+
+include $(BUILD_EXECUTABLE)
+
 $(call import-module,android/cpufeatures)
--- a/cpp/src/neon_ladder.s
+++ b/cpp/src/neon_ladder.s
@ -0,0 +1,321 @@
+    .text
+
+    .align  2
+    .global neon_ladder_nl
+    .type   neon_ladder_nl, %function
+neon_ladder_nl:
+@ r0 = pointer to input
+@ r1 = pointer to filter (A then B then k then overdrive)
+@ r2 = pointer to output
+@ r3 = number of samples
+@ stack[0] = pointer to state
+@q12 - q15 matrix (A)
+@q11 B
+
+@q0 input
+@q1 state
+@q3 const 1.0
+@q6 overdrive, output gain
+@q10 resonance (k)
+
+	push {r4-r5}
+	vpush {q4-q7}
+	ldr r4, [sp, #(4 * 18)]
+	vld1.32 {q1}, [r4:128]
+
+	vld1.32 {q12}, [r1:128]!
+	vld1.32 {q13}, [r1:128]!
+	vld1.32 {q14}, [r1:128]!
+	vld1.32 {q15}, [r1:128]!
+	vld1.32 {q11}, [r1:128]!
+	vld1.32 {d20[], d21[]}, [r1]!
+	vld1.32 {d12[0]}, [r1]!
+	vld1.32 {d12[1]}, [r1]!
+
+	adr r5, neon_ladder_nl_const
+	vld1.32 {q3}, [r5:128]!
+
+neon_ladder_nl_1:
+	@ cycle counting suggests that careful unrolling would
+	@ save about 10% - we will avoid the complexity for now
+
+	vld1.32 {d0[0]}, [r0]!
+	vcvt.f32.s32 q0, q0
+
+	@ compute resonance
+	vmls.f32 q0, q10, d3[1]
+
+	@ q4 gets q1/sqrt(1 + q1*q1)
+	@ q0 gets q0/sqrt(1 + q0*q0)
+	vmul.f32 q2, q1, d12[0]		@ q1 * overdrive
+	vmul.f32 q7, q0, d12[0]
+	vmul.f32 q2, q2, q2
+	vmul.f32 q7, q7, q7
+	vadd.f32 q2, q3				@ z = 1 + (q1 * overdrive)^2
+	vadd.f32 q7, q3
+	vrsqrte.f32 q4, q2			@ est ~ 1/sqrt(z)
+	vrsqrte.f32 q8, q7
+	vmul.f32 q2, q4, q2			@ z * est
+	vmul.f32 q7, q8, q7
+	vrsqrts.f32 q2, q2, q4 		@ (3 - z * est^2)/2
+	vrsqrts.f32 q7, q7, q8
+	vmul.f32 q4, q1				@ q1 * est
+	vmul.f32 q0, q8
+	vmul.f32 q4, q2 			@ q1 * est * (3 - z * est^2)/2
+	vmul.f32 q0, q7
+
+	vmla.f32 q1, q12, d8[0]  @ sigmoid(y0)
+	vmul.f32 q2, q11, d0[0]  @ sigmod(x - k * y3)
+	vmla.f32 q1, q13, d8[1]  @ sigmoid(y1)
+	vmla.f32 q2, q14, d9[0]  @ sigmoid(y2)
+	vmla.f32 q1, q15, d9[1]  @ sigmoid(y3)
+	vadd.f32 q1, q2
+
+	vmul.f32 q4, q1, d12[1]
+	vcvt.s32.f32 q4, q4
+	vst1.32 {d9[1]}, [r2]!
+	subs r3, #1
+	bne neon_ladder_nl_1
+
+	vst1.32 {q1}, [r4:128]
+	vpop {q4-q7}
+	pop {r4-r5}
+	bx lr
+
+	.size   neon_ladder_nl, .-neon_ladder_nl
+
+	.balign 16
+neon_ladder_nl_const:
+	.float 1.0, 1.0, 1.0, 1.0
+
+    .align  2
+    .global neon_ladder_lin
+    .type   neon_ladder_lin, %function
+
+	.if 0
+@ Simpler, not as optimized approach (26.3 cycles on N10)
+
+neon_ladder_lin:
+
+@ r0 = pointer to input
+@ r1 = pointer to filter (A then B)
+@ r2 = pointer to output
+@ r3 = number of samples
+@ stack[0] = pointer to state
+@q12 - q15 matrix (A)
+@q11 B
+
+@q0 input
+@q1 state
+
+	push {r4-r5}
+	ldr r4, [sp, #(4 * 2)]
+	vld1.32 {q1}, [r4:128]
+
+	vld1.32 {q12}, [r1:128]!
+	vld1.32 {q13}, [r1:128]!
+	vld1.32 {q14}, [r1:128]!
+	vld1.32 {q15}, [r1:128]!
+	vld1.32 {q11}, [r1:128]!
+
+neon_ladder_lin_1:
+	vld1.32 {d0[0]}, [r0]!
+	vmov q3, q1
+	vcvt.f32.s32 q0, q0
+
+	vmul.f32 q1, q12, d6[0]  @ y0
+	vmul.f32 q2, q13, d6[1]  @ y1
+	vmla.f32 q1, q11, d0[0]  @ x
+	vmla.f32 q2, q14, d7[0]  @ y2
+	vmla.f32 q1, q15, d7[1]  @ y3
+	vadd.f32 q1, q2
+
+	vcvt.s32.f32 q3, q1
+	vst1.32 {d7[1]}, [r2]!
+	subs r3, #1
+	bne neon_ladder_lin_1
+
+	vst1.32 {q1}, [r4:128]
+	pop {r4-r5}
+	bx lr
+.else
+@ Unrolled, more highly optimized loop (22.5 cycles on N10)
+
+neon_ladder_lin:
+@ r0 = pointer to input
+@ r1 = pointer to filter (A then B)
+@ r2 = pointer to output
+@ r3 = number of samples
+@ stack[0] = pointer to state
+@q12 - q15 matrix (A)
+@q11 B
+
+@q0 input
+@q1 state
+
+	push {r4-r5}
+	ldr r4, [sp, #(4 * 2)]
+	vld1.32 {q1}, [r4:128]
+
+	vld1.32 {q12}, [r1:128]!
+	vld1.32 {q13}, [r1:128]!
+	vld1.32 {q14}, [r1:128]!
+	vld1.32 {q15}, [r1:128]!
+	vld1.32 {q11}, [r1:128]!
+
+neon_ladder_lin_1:
+	vld1.32 {q0}, [r0:128]!
+	vcvt.f32.s32 q0, q0
+
+	vmul.f32 q9, q12, d2[0]  @ y0
+	vmul.f32 q2, q13, d2[1]  @ y1
+	vmla.f32 q9, q11, d0[0]  @ x
+	vmla.f32 q2, q14, d3[0]  @ y2
+	vmla.f32 q9, q15, d3[1]  @ y3
+	vadd.f32 q2, q9
+
+	vmul.f32 q1, q11, d0[1]  @ x
+	vmul.f32 q9, q12, d4[0]  @ y0
+	vmla.f32 q1, q13, d4[1]  @ y1
+	vmla.f32 q9, q14, d5[0]  @ y2
+	vcvt.s32.f32 q3, q2
+	vmla.f32 q1, q15, d5[1]  @ y3
+	vst1.32 {d7[1]}, [r2]!
+	vadd.f32 q1, q9
+
+	vmul.f32 q9, q11, d1[0]  @ x
+	vmul.f32 q2, q12, d2[0]  @ y0
+	vmla.f32 q9, q13, d2[1]  @ y1
+	vmla.f32 q2, q14, d3[0]  @ y2
+	vcvt.s32.f32 q3, q1
+	vmla.f32 q9, q15, d3[1]  @ y3
+	vst1.32 {d7[1]}, [r2]!
+	vadd.f32 q2, q9
+
+	vmul.f32 q1, q11, d1[1]  @ x
+	vmul.f32 q9, q12, d4[0]  @ y0
+	vmla.f32 q1, q13, d4[1]  @ y1
+	vmla.f32 q9, q14, d5[0]  @ y2
+	vcvt.s32.f32 q3, q2
+	vmla.f32 q1, q15, d5[1]  @ y3
+	vst1.32 {d7[1]}, [r2]!
+	vadd.f32 q1, q9
+
+	vcvt.s32.f32 q3, q1
+	vst1.32 {d7[1]}, [r2]!
+	subs r3, #4
+	bne neon_ladder_lin_1
+
+	vst1.32 {q1}, [r4:128]
+	pop {r4-r5}
+	bx lr
+.endif
+
+	.size   neon_ladder_lin, .-neon_ladder_lin
+
+    .global neon_ladder_mkmatrix
+    .type   neon_ladder_mkmatrix, %function
+neon_ladder_mkmatrix:
+@ r0 = pointer to params (a, k)
+@ r1 = out pointer to matrix (A then B, just like consumer)
+	vpush {q4-q7}
+	vld1.32 {d0[]}, [r0]!  @ a
+	vmov.i32 d1, #0
+	vneg.f32 d2, d0
+	vmov.f32 s0, s4
+
+	vld1.32 {d6[0]}, [r0]  @ k
+	vmov.i32 q2, #0
+	vmul.f32 s9, s12, s0
+
+	adr r2, neon_ladder_mkmatrix_const
+	vmov.i32 q1, #0
+	vld1.32 {d2[0]}, [r2]!
+	vadd.f32 q14, q1, q0
+	vmov q15, q2
+	vmov q6, q0
+	vmov q7, q2
+
+	@ (q0, q2) is jacobian matrix, (q14, q15) is series accum
+	@ (q6, q7) is jacobian ^ i
+
+	mov r3, #3
+neon_ladder_mkmatrix1:
+	vext.32 q8, q7, q6, #3
+	vmul.f32 q12, q6, d0[0]
+	vmul.f32 q13, q8, d4[1]
+	vext.32 q9, q7, q6, #2
+	vmla.f32 q12, q8, d0[1]
+	vmla.f32 q13, q9, d5[0]
+	vext.32 q10, q6, q6, #1
+	vmla.f32 q12, q9, d1[0]
+	vmla.f32 q13, q10, d5[1]
+	vmla.f32 q12, q10, d1[1]
+	vmla.f32 q13, q7, d0[0]
+
+	vld1.32 {d6[0]}, [r2]!
+	vmla.f32 q14, q12, d6[0]
+	vmov q6, q12
+	vmla.f32 q15, q13, d6[0]
+	vmov q7, q13
+	subs r3, #1
+	bne neon_ladder_mkmatrix1
+
+	vmov q0, q14
+	vmov q2, q15
+
+	mov r3, #4
+
+neon_ladder_mkmatrix2:
+	vext.32 q8, q2, q0, #3
+
+	@ q0 = {a11, a21, a31, a41}, q2 = {0, a14, a24, a34}
+
+	@ square the matrix
+	vmul.f32 q12, q0, d0[0]
+	vmul.f32 q13, q8, d4[1]
+	vext.32 q9, q2, q0, #2
+	vmla.f32 q12, q8, d0[1]
+	vmla.f32 q13, q9, d5[0]
+	vext.32 q10, q2, q0, #1
+	vmla.f32 q12, q9, d1[0]
+	vmla.f32 q13, q10, d5[1]
+	vmla.f32 q12, q10, d1[1]
+	vmla.f32 q13, q2, d0[0]
+
+	vmov q0, q12
+	vmov q2, q13
+	subs r3, #1
+	bne neon_ladder_mkmatrix2
+
+	@ unwrap toeplitz matrix into the full form
+	vst1.32 {q0}, [r1]!
+	vext.32 q8, q2, q0, #3
+	vst1.32 {q8}, [r1]!
+	vext.32 q9, q2, q0, #2
+	vst1.32 {q9}, [r1]!
+	vext.32 q10, q2, q0, #1
+	vst1.32 {q10}, [r1]!
+
+	adr r2, neon_ladder_mkmatrix_const
+	vld1.32 {d2[], d3[]}, [r2]
+	vld1.32 {d4[0]}, [r0]  @ k
+	vadd.f32 d4, d2
+	vrecpe.f32 d6, d4
+	vrecps.f32 d2, d6, d4
+
+	vadd.f32 q0, q8
+	vadd.f32 q9, q10
+	vsub.f32 q15, q1, q0
+	vmul.f32 d6, d2  @ 1 / (1 + k)
+	vsub.f32 q15, q9
+	vmul.f32 q15, d6[0]
+	vst1.32 {q15}, [r1]!
+
+	vpop {q4-q7}
+	bx lr
+
+	.size   neon_ladder_mkmatrix, .-neon_ladder_mkmatrix
+	.balign 16
+neon_ladder_mkmatrix_const:
+	.float 1.0, 0.5, .16666667, .041666667
--- a/cpp/src/resofilter.cc
+++ b/cpp/src/resofilter.cc
@ -31,6 +31,15 @@
 #include "aligned_buf.h"
 #include "resofilter.h"

+#ifdef HAVE_NEON
+extern "C"
+void neon_ladder_nl(const int32_t *in, const float *a, int32_t *out, int count,
+  float *state);
+extern "C"
+void neon_ladder_lin(const int32_t *in, const float *a, int32_t *out, int count,
+  float *state);
+#endif
+
 double this_sample_rate;

 void ResoFilter::init(double sample_rate) {
@ -39,7 +48,7 @@ void ResoFilter::init(double sample_rate) {

 ResoFilter::ResoFilter() {
  for (int i = 0; i < 4; i++) {
-    x[i] = 0;
+    x.get()[i] = 0;
 #if defined(NONLINEARITY)
    w[i] = 0;
 #endif
@ -188,14 +197,28 @@ void ResoFilter::process(const int32_t **inbufs, const int32_t *control_in,
  float overdrive = control_in[2] * (1.0 / (1 << 24));
  const int32_t *ibuf = inbufs[0];
  int32_t *obuf = outbufs[0];
+  bool useneon = false;
+#ifdef HAVE_NEON
+  useneon = true;  // TODO: detect
+#endif
+
  if (overdrive == 0) {
+    if (useneon) {
+#ifdef HAVE_NEON
+      AlignedBuf<float, 20> a_neon;
+      matcopy(a_neon.get(), a.get() + 4, 16);
+      matcopy(a_neon.get() + 16, a.get(), 4);
+      neon_ladder_lin(ibuf, a_neon.get(), obuf, n, x.get());
+#endif
+    } else {
      for (int i = 0; i < n; i++) {
        float signal = ibuf[i];
        float tmp[4];
-      matvec4(tmp, a.get() + 4, x);
+        matvec4(tmp, a.get() + 4, x.get());
        for (int k = 0; k < 4; k++) {
-        x[k] = tmp[k] + signal * a.get()[k];
-        obuf[i] = x[3];
+          x.get()[k] = tmp[k] + signal * a.get()[k];
+          obuf[i] = x.get()[3];
+        }
      }
    }
  } else {
@ -205,18 +228,31 @@ void ResoFilter::process(const int32_t **inbufs, const int32_t *control_in,
      a.get()[4 + 5 * i] -= 1.0;
      a.get()[16 + i] += k * a.get()[i];
    }
+    if (useneon) {
+#ifdef HAVE_NEON
+      // Neon implementation has A first, then B
+      AlignedBuf<float, 23> a_neon;
+      matcopy(a_neon.get(), a.get() + 4, 16);
+      matcopy(a_neon.get() + 16, a.get(), 4);
+      a_neon.get()[20] = k;
+      a_neon.get()[21] = overdrive * (1.0 / (1 << 24));
+      a_neon.get()[22] = ogain;
+      neon_ladder_nl(ibuf, a_neon.get(), obuf, n, x.get());
+#endif
+    } else {
      for (int i = 0; i < n; i++) {
        float signal = ibuf[i];
        float tmp[4];
        float tx[4];
        for (int j = 0; j < 4; j++) {
-        tx[j] = sigmoid(x[j], overdrive);
+          tx[j] = sigmoid(x.get()[j], overdrive);
        }
        matvec4(tmp, a.get() + 4, tx);
-      float xin = sigmoid(signal - k * x[3], overdrive);
+        float xin = sigmoid(signal - k * x.get()[3], overdrive);
        for (int j = 0; j < 4; j++) {
-        x[j] += tmp[j] + xin * a.get()[j];
-        obuf[i] = x[3] * ogain;
+          x.get()[j] += tmp[j] + xin * a.get()[j];
+          obuf[i] = x.get()[3] * ogain;
+        }
      }
    }
  }
@ -288,3 +324,25 @@ void ResoFilter::process(const int32_t **inbufs, const int32_t *control_in,
 #endif
 }
 #endif  // USE_MATRIX
+
+void reso_benchmark(int niter, bool nonlinear) {
+#ifdef HAVE_NEON
+  AlignedBuf<float, 23> a_neon;
+  for (int i = 0; i < 23; i++) {
+    float y = 0.0;
+    if (i < 20 && (i % 5) == 0) y = 1.0;
+    a_neon.get()[i] = y;
+  }
+  const int n = 64;
+  AlignedBuf<int32_t, n> inbuf;
+  AlignedBuf<int32_t, n> outbuf;
+  AlignedBuf<float, 4> x;
+  for (int i = 0; i < niter; i++) {
+    if (nonlinear) {
+      neon_ladder_nl(inbuf.get(), a_neon.get(), outbuf.get(), n, x.get());
+    } else {
+      neon_ladder_lin(inbuf.get(), a_neon.get(), outbuf.get(), n, x.get());
+    }
+  }
+#endif
+}
--- a/cpp/src/resofilter.h
+++ b/cpp/src/resofilter.h
@ -31,7 +31,7 @@ class ResoFilter : Module {
 			   const int32_t *control_last, int32_t **outbufs);
 private:
 #if defined(USE_MATRIX)
-  float x[4];
+  AlignedBuf<float, 4> x;
 #else
  int32_t x[4];
 #if defined(NONLINEARITY)
--- a/cpp/src/synth_unit.cc
+++ b/cpp/src/synth_unit.cc
@ -250,7 +250,7 @@ void SynthUnit::GetSamples(int n_samples, int16_t *buffer) {

  for (; i < n_samples; i += N) {
    AlignedBuf<int32_t, N> audiobuf;
-    int32_t audiobuf2[N];
+    AlignedBuf<int32_t, N> audiobuf2;
    for (int j = 0; j < N; ++j) {
      audiobuf.get()[j] = 0;
    }
@ -263,11 +263,11 @@ void SynthUnit::GetSamples(int n_samples, int16_t *buffer) {
      }
    }
    const int32_t *bufs[] = { audiobuf.get() };
-    int32_t *bufs2[] = { audiobuf2 };
+    int32_t *bufs2[] = { audiobuf2.get() };
    filter_.process(bufs, filter_control_, filter_control_, bufs2);
    int jmax = n_samples - i;
    for (int j = 0; j < N; ++j) {
-      int32_t val = audiobuf2[j] >> 4;
+      int32_t val = audiobuf2.get()[j] >> 4;
      int clip_val = val < -(1 << 24) ? 0x8000 : val >= (1 << 24) ? 0x7fff :
        val >> 9;
      // TODO: maybe some dithering?
--- a/cpp/src/test_neon.cc
+++ b/cpp/src/test_neon.cc
@ -0,0 +1,28 @@
+#include <stdio.h>
+
+#include "aligned_buf.h"
+
+void reso_benchmark(int n, bool nonlinear);
+
+#ifdef HAVE_NEON
+extern "C" void neon_ladder_mkmatrix(const float *in, float *out);
+#endif
+
+int main(int argc, char** argv) {
+#ifdef HAVE_NEON
+	float in[2] = {0.1, 3.9};
+	AlignedBuf<float, 20> a;
+	for (int i = 0; i < 10000000; i++) {
+		neon_ladder_mkmatrix(in, a.get());
+	}
+	for (int i = 0; i < 4; i++) {
+		for (int j = 0; j < 5; j++) {
+			printf("%6f ", a.get()[j * 4 + i]);
+		}
+		printf("\n");
+	}
+#endif
+
+	//reso_benchmark(1000000, false);
+	return 0;
+}