// NOTE: this file is a unified diff (git patch), not C++ source, and its hunks
// appear below with their line breaks collapsed. This summary is free text placed
// ahead of the first "diff --git" header; the patch text itself is unchanged.
//
// cpp/src/neon_ladder.s -- neon_ladder_mkmatrix (r0 = params a, k; r1 = output):
//   * Output order becomes "B then A". r1 is advanced by 16 bytes before the four
//     unwrapped Toeplitz rows (q0, q8, q9, q10) are stored with post-increment,
//     then rewound by 80 (16 + 4 * 16), and q15 is stored at the start of the
//     buffer. Both pointer adjustments carry TODOs to remove them once the
//     routine emits A then B directly.
//     NOTE(review): the removed header comment said the consumer takes
//     "A then B"; the consumer is not part of these hunks -- confirm it agrees
//     with the new order.
//   * All five stores gain a :128 alignment qualifier on [r1].
//     NOTE(review): presumably this requires a 16-byte-aligned output buffer;
//     the C++ callers in this patch pass AlignedBuf storage -- confirm its
//     alignment.
//   * s0 is negated in place ("vneg.f32 s0, s0") instead of negating d0 into d2
//     and copying s4 back into s0.
//   * The 1.0 constant is set with "vmov.f32 q1, 1.0" rather than loaded from
//     neon_ladder_mkmatrix_const, and the vrecps result goes to d5 instead of
//     d2, so q1 is not overwritten before "vsub.f32 q15, q1, q0" reads it.
//
// cpp/src/resofilter.cc:
//   * Adds an extern "C" declaration of
//     neon_ladder_mkmatrix(const float *in, float *out).
//   * Removes the "TODO: better tuning" comment from compute_alpha.
//   * test_matrix builds the matrix for params {1.0, 3.99} with
//     make_state_transition into an AlignedBuf and dumps it. Under HAVE_NEON it
//     then divides params[0] by 16, calls neon_ladder_mkmatrix on the same
//     buffer and dumps it again.
//   * ResoFilter::process fills the matrix with neon_ladder_mkmatrix when
//     hasNeon() is true, passing compute_alpha(...) scaled by 1 / (1 << 28) and
//     control_in[1] scaled by 1 / (1 << 24); otherwise it calls
//     make_state_transition as before.
//     NOTE(review): the NEON call sits inside #ifdef HAVE_NEON within the
//     useneon branch, so if hasNeon() could return true in a build without
//     HAVE_NEON the buffer would not be filled here -- confirm against hasNeon().
diff --git a/cpp/src/neon_ladder.s b/cpp/src/neon_ladder.s index 565e50c..aec0697 100644 --- a/cpp/src/neon_ladder.s +++ b/cpp/src/neon_ladder.s @@ -233,12 +233,12 @@ neon_ladder_lin_1: .type neon_ladder_mkmatrix, %function neon_ladder_mkmatrix: @ r0 = pointer to params (a, k) -@ r1 = out pointer to matrix (A then B, just like consumer) +@ r1 = out pointer to matrix (B then A) +@ TODO: produce output as A then B, avoiding copies vpush {q4-q7} vld1.32 {d0[]}, [r0]! @ a vmov.i32 d1, #0 - vneg.f32 d2, d0 - vmov.f32 s0, s4 + vneg.f32 s0, s0 vld1.32 {d6[0]}, [r0] @ k vmov.i32 q2, #0 @@ -305,28 +305,29 @@ neon_ladder_mkmatrix2: bne neon_ladder_mkmatrix2 @ unwrap toeplitz matrix into the full form - vst1.32 {q0}, [r1]! + add r1, #16 @ TODO: remove this for A then B output + vst1.32 {q0}, [r1,:128]! vext.32 q8, q2, q0, #3 - vst1.32 {q8}, [r1]! + vst1.32 {q8}, [r1,:128]! vext.32 q9, q2, q0, #2 - vst1.32 {q9}, [r1]! + vst1.32 {q9}, [r1,:128]! vext.32 q10, q2, q0, #1 - vst1.32 {q10}, [r1]! + vst1.32 {q10}, [r1,:128]! + sub r1, #80 @ TODO: remove this for A then B output - adr r2, neon_ladder_mkmatrix_const - vld1.32 {d2[], d3[]}, [r2] + vmov.f32 q1, 1.0 vld1.32 {d4[0]}, [r0] @ k vadd.f32 d4, d2 vrecpe.f32 d6, d4 - vrecps.f32 d2, d6, d4 + vrecps.f32 d5, d6, d4 vadd.f32 q0, q8 vadd.f32 q9, q10 vsub.f32 q15, q1, q0 - vmul.f32 d6, d2 @ 1 / (1 + k) + vmul.f32 d6, d5 @ 1 / (1 + k) vsub.f32 q15, q9 vmul.f32 q15, d6[0] - vst1.32 {q15}, [r1]! + vst1.32 {q15}, [r1,:128]! 
vpop {q4-q7} bx lr diff --git a/cpp/src/resofilter.cc b/cpp/src/resofilter.cc index 80aaffc..7db35bb 100644 --- a/cpp/src/resofilter.cc +++ b/cpp/src/resofilter.cc @@ -38,6 +38,7 @@ void neon_ladder_nl(const int32_t *in, const float *a, int32_t *out, int count, extern "C" void neon_ladder_lin(const int32_t *in, const float *a, int32_t *out, int count, float *state); +extern "C" void neon_ladder_mkmatrix(const float *in, float *out); #endif double this_sample_rate; @@ -56,7 +57,6 @@ ResoFilter::ResoFilter() { } int32_t compute_alpha(int32_t logf) { - // TODO: better tuning return min(1 << 24, Freqlut::lookup(logf)); } @@ -178,9 +178,15 @@ static void make_state_transition(float result[20], int32_t f0, int32_t k) { } void test_matrix() { - float a[20]; - make_state_transition(a, 1.0 * (1 << 24), 3.99 * (1 << 24)); - dump_matrix(a); + float params[2] = {1.0, 3.99}; + AlignedBuf a; + make_state_transition(a.get(), params[0] * (1 << 24), params[1] * (1 << 24)); + dump_matrix(a.get()); +#ifdef HAVE_NEON + params[0] /= 16; + neon_ladder_mkmatrix(params, a.get()); + dump_matrix(a.get()); +#endif } #if defined(USE_MATRIX) @@ -193,11 +199,20 @@ static float sigmoid(float x, float overdrive) { void ResoFilter::process(const int32_t **inbufs, const int32_t *control_in, const int32_t *control_last, int32_t **outbufs) { AlignedBuf a; - make_state_transition(a.get(), compute_alpha(control_in[0]), control_in[1]); float overdrive = control_in[2] * (1.0 / (1 << 24)); const int32_t *ibuf = inbufs[0]; int32_t *obuf = outbufs[0]; bool useneon = hasNeon(); + if (useneon) { +#ifdef HAVE_NEON + float params[2]; + params[0] = compute_alpha(control_in[0]) * (1.0 / (1 << 28)); + params[1] = control_in[1] * (1.0 / (1 << 24)); + neon_ladder_mkmatrix(params, a.get()); +#endif + } else { + make_state_transition(a.get(), compute_alpha(control_in[0]), control_in[1]); + } if (overdrive < 0.01) { if (useneon) {