From 34b70e38d29d1a54ef36d28616e68583332f488e Mon Sep 17 00:00:00 2001
From: Raph Levien <raph@google.com>
Date: Sun, 20 Apr 2014 21:33:19 -0700
Subject: [PATCH] Fix and enable NEON ladder matrix generation

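The NEON code for computing the matrix for the ladder filter was almost
correct but had some bugs; most notably, the reciprocal refinement step
(vrecps) wrote its result into d2, clobbering half of the q1 constant
that the following vsub reads. This patch fixes those and enables the
NEON path in ResoFilter::process, for a nice speed improvement
(processing time was dominated by the scalar matrix generation). The
routine currently writes its output as B then A; see the TODOs in
neon_ladder.s.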
---
 cpp/src/neon_ladder.s | 25 +++++++++++++------------
 cpp/src/resofilter.cc | 25 ++++++++++++++++++++-----
 2 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/cpp/src/neon_ladder.s b/cpp/src/neon_ladder.s
index 565e50c..aec0697 100644
--- a/cpp/src/neon_ladder.s
+++ b/cpp/src/neon_ladder.s
@@ -233,12 +233,12 @@ neon_ladder_lin_1:
     .type   neon_ladder_mkmatrix, %function
 neon_ladder_mkmatrix:
 @ r0 = pointer to params (a, k)
-@ r1 = out pointer to matrix (A then B, just like consumer)
+@ r1 = out pointer to matrix (B then A)
+@ TODO: produce output as A then B, avoiding copies
 	vpush {q4-q7}
 	vld1.32 {d0[]}, [r0]!  @ a
 	vmov.i32 d1, #0
-	vneg.f32 d2, d0
-	vmov.f32 s0, s4
+	vneg.f32 s0, s0
 
 	vld1.32 {d6[0]}, [r0]  @ k
 	vmov.i32 q2, #0
@@ -305,28 +305,29 @@ neon_ladder_mkmatrix2:
 	bne neon_ladder_mkmatrix2
 
 	@ unwrap toeplitz matrix into the full form
-	vst1.32 {q0}, [r1]!
+	add r1, #16  @ TODO: remove this for A then B output
+	vst1.32 {q0}, [r1,:128]!
 	vext.32 q8, q2, q0, #3
-	vst1.32 {q8}, [r1]!
+	vst1.32 {q8}, [r1,:128]!
 	vext.32 q9, q2, q0, #2
-	vst1.32 {q9}, [r1]!
+	vst1.32 {q9}, [r1,:128]!
 	vext.32 q10, q2, q0, #1
-	vst1.32 {q10}, [r1]!
+	vst1.32 {q10}, [r1,:128]!
+	sub r1, #80  @ TODO: remove this for A then B output
 
-	adr r2, neon_ladder_mkmatrix_const
-	vld1.32 {d2[], d3[]}, [r2]
+	vmov.f32 q1, 1.0
 	vld1.32 {d4[0]}, [r0]  @ k
 	vadd.f32 d4, d2
 	vrecpe.f32 d6, d4
-	vrecps.f32 d2, d6, d4
+	vrecps.f32 d5, d6, d4
 
 	vadd.f32 q0, q8
 	vadd.f32 q9, q10
 	vsub.f32 q15, q1, q0
-	vmul.f32 d6, d2  @ 1 / (1 + k)
+	vmul.f32 d6, d5  @ 1 / (1 + k)
 	vsub.f32 q15, q9
 	vmul.f32 q15, d6[0]
-	vst1.32 {q15}, [r1]!
+	vst1.32 {q15}, [r1,:128]!
 
 	vpop {q4-q7}
 	bx lr
diff --git a/cpp/src/resofilter.cc b/cpp/src/resofilter.cc
index 80aaffc..7db35bb 100644
--- a/cpp/src/resofilter.cc
+++ b/cpp/src/resofilter.cc
@@ -38,6 +38,7 @@ void neon_ladder_nl(const int32_t *in, const float *a, int32_t *out, int count,
 extern "C"
 void neon_ladder_lin(const int32_t *in, const float *a, int32_t *out, int count,
   float *state);
+extern "C" void neon_ladder_mkmatrix(const float *in, float *out);
 #endif
 
 double this_sample_rate;
@@ -56,7 +57,6 @@ ResoFilter::ResoFilter() {
 }
 
 int32_t compute_alpha(int32_t logf) {
-  // TODO: better tuning
   return min(1 << 24, Freqlut::lookup(logf));
 }
 
@@ -178,9 +178,15 @@ static void make_state_transition(float result[20], int32_t f0, int32_t k) {
 }
 
 void test_matrix() {
-  float a[20];
-  make_state_transition(a, 1.0 * (1 << 24), 3.99 * (1 << 24));
-  dump_matrix(a);
+  float params[2] = {1.0, 3.99};
+  AlignedBuf<float, 20> a;
+  make_state_transition(a.get(), params[0] * (1 << 24), params[1] * (1 << 24));
+  dump_matrix(a.get());
+#ifdef HAVE_NEON
+  params[0] /= 16;
+  neon_ladder_mkmatrix(params, a.get());
+  dump_matrix(a.get());
+#endif
 }
 
 #if defined(USE_MATRIX)
@@ -193,11 +199,20 @@ static float sigmoid(float x, float overdrive) {
 void ResoFilter::process(const int32_t **inbufs, const int32_t *control_in,
                          const int32_t *control_last, int32_t **outbufs) {
   AlignedBuf<float, 20> a;
-  make_state_transition(a.get(), compute_alpha(control_in[0]), control_in[1]);
   float overdrive = control_in[2] * (1.0 / (1 << 24));
   const int32_t *ibuf = inbufs[0];
   int32_t *obuf = outbufs[0];
   bool useneon = hasNeon();
+  if (useneon) {
+#ifdef HAVE_NEON
+    float params[2];
+    params[0] = compute_alpha(control_in[0]) * (1.0 / (1 << 28));
+    params[1] = control_in[1] * (1.0 / (1 << 24));
+    neon_ladder_mkmatrix(params, a.get());
+#endif
+  } else {
+    make_state_transition(a.get(), compute_alpha(control_in[0]), control_in[1]);
+  }
 
   if (overdrive < 0.01) {
     if (useneon) {