Fix and enable NEON ladder matrix generation

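The NEON code for computing the matrix for the ladder filter was almost correct but had some bugs. Most notably, the reciprocal refinement step (vrecps) wrote its result into d2, overwriting half of the constant held in q1 that the following vsub still needs; it now uses d5 instead. The routine also now stores its output as B then A, using 128-bit-aligned stores. This patch fixes those bugs and enables the routine (declared in resofilter.cc and called from ResoFilter::process when NEON is available), for a nice speed improvement (processing time was dominated by the scalar matrix generation).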
11 years ago · 34b70e38d2
parent 102484e439
commit 34b70e38d2
2 changed files with 33 additions and 17 deletions
--- a/cpp/src/neon_ladder.s
+++ b/cpp/src/neon_ladder.s
@ -233,12 +233,12 @@ neon_ladder_lin_1:
    .type   neon_ladder_mkmatrix, %function
 neon_ladder_mkmatrix:
@ r0 = pointer to params (a, k)
-@ r1 = out pointer to matrix (A then B, just like consumer)
+@ r1 = out pointer to matrix (B then A)
+@ TODO: produce output as A then B, avoiding copies
 	vpush {q4-q7}
 	vld1.32 {d0[]}, [r0]!  @ a
 	vmov.i32 d1, #0
-	vneg.f32 d2, d0
-	vmov.f32 s0, s4
+	vneg.f32 s0, s0

 	vld1.32 {d6[0]}, [r0]  @ k
 	vmov.i32 q2, #0
@ -305,28 +305,29 @@ neon_ladder_mkmatrix2:
 	bne neon_ladder_mkmatrix2

 	@ unwrap toeplitz matrix into the full form
-	vst1.32 {q0}, [r1]!
+	add r1, #16  @ TODO: remove this for A then B output
+	vst1.32 {q0}, [r1,:128]!
 	vext.32 q8, q2, q0, #3
-	vst1.32 {q8}, [r1]!
+	vst1.32 {q8}, [r1,:128]!
 	vext.32 q9, q2, q0, #2
-	vst1.32 {q9}, [r1]!
+	vst1.32 {q9}, [r1,:128]!
 	vext.32 q10, q2, q0, #1
-	vst1.32 {q10}, [r1]!
+	vst1.32 {q10}, [r1,:128]!
+	sub r1, #80  @ TODO: remove this for A then B output

-	adr r2, neon_ladder_mkmatrix_const
-	vld1.32 {d2[], d3[]}, [r2]
+	vmov.f32 q1, 1.0
 	vld1.32 {d4[0]}, [r0]  @ k
 	vadd.f32 d4, d2
 	vrecpe.f32 d6, d4
-	vrecps.f32 d2, d6, d4
+	vrecps.f32 d5, d6, d4

 	vadd.f32 q0, q8
 	vadd.f32 q9, q10
 	vsub.f32 q15, q1, q0
-	vmul.f32 d6, d2  @ 1 / (1 + k)
+	vmul.f32 d6, d5  @ 1 / (1 + k)
 	vsub.f32 q15, q9
 	vmul.f32 q15, d6[0]
-	vst1.32 {q15}, [r1]!
+	vst1.32 {q15}, [r1,:128]!

 	vpop {q4-q7}
 	bx lr
--- a/cpp/src/resofilter.cc
+++ b/cpp/src/resofilter.cc
@ -38,6 +38,7 @@ void neon_ladder_nl(const int32_t *in, const float *a, int32_t *out, int count,
 extern "C"
 void neon_ladder_lin(const int32_t *in, const float *a, int32_t *out, int count,
  float *state);
+extern "C" void neon_ladder_mkmatrix(const float *in, float *out);
 #endif

 double this_sample_rate;
@ -56,7 +57,6 @@ ResoFilter::ResoFilter() {
 }

 int32_t compute_alpha(int32_t logf) {
-  // TODO: better tuning
  return min(1 << 24, Freqlut::lookup(logf));
 }

@ -178,9 +178,15 @@ static void make_state_transition(float result[20], int32_t f0, int32_t k) {
 }

 void test_matrix() {
-  float a[20];
-  make_state_transition(a, 1.0 * (1 << 24), 3.99 * (1 << 24));
-  dump_matrix(a);
+  float params[2] = {1.0, 3.99};
+  AlignedBuf<float, 20> a;
+  make_state_transition(a.get(), params[0] * (1 << 24), params[1] * (1 << 24));
+  dump_matrix(a.get());
+#ifdef HAVE_NEON
+  params[0] /= 16;
+  neon_ladder_mkmatrix(params, a.get());
+  dump_matrix(a.get());
+#endif
 }

 #if defined(USE_MATRIX)
@ -193,11 +199,20 @@ static float sigmoid(float x, float overdrive) {
 void ResoFilter::process(const int32_t **inbufs, const int32_t *control_in,
                         const int32_t *control_last, int32_t **outbufs) {
  AlignedBuf<float, 20> a;
-  make_state_transition(a.get(), compute_alpha(control_in[0]), control_in[1]);
  float overdrive = control_in[2] * (1.0 / (1 << 24));
  const int32_t *ibuf = inbufs[0];
  int32_t *obuf = outbufs[0];
  bool useneon = hasNeon();
+  if (useneon) {
+#ifdef HAVE_NEON
+    float params[2];
+    params[0] = compute_alpha(control_in[0]) * (1.0 / (1 << 28));
+    params[1] = control_in[1] * (1.0 / (1 << 24));
+    neon_ladder_mkmatrix(params, a.get());
+#endif
+  } else {
+    make_state_transition(a.get(), compute_alpha(control_in[0]), control_in[1]);
+  }

  if (overdrive < 0.01) {
    if (useneon) {