Fix and enable NEON ladder matrix generation

The NEON code for computing the ladder filter's matrix was almost
correct but had some bugs. This patch fixes those bugs and enables the
NEON path, giving a nice speed improvement (processing time was
previously dominated by the scalar matrix generation).
master
Raph Levien 10 years ago
parent 102484e439
commit 34b70e38d2
  1. 25
      cpp/src/neon_ladder.s
  2. 25
      cpp/src/resofilter.cc

@ -233,12 +233,12 @@ neon_ladder_lin_1:
.type neon_ladder_mkmatrix, %function
neon_ladder_mkmatrix:
@ r0 = pointer to params (a, k)
@ r1 = out pointer to matrix (A then B, just like consumer)
@ r1 = out pointer to matrix (B then A)
@ TODO: produce output as A then B, avoiding copies
vpush {q4-q7}
vld1.32 {d0[]}, [r0]! @ a
vmov.i32 d1, #0
vneg.f32 d2, d0
vmov.f32 s0, s4
vneg.f32 s0, s0
vld1.32 {d6[0]}, [r0] @ k
vmov.i32 q2, #0
@ -305,28 +305,29 @@ neon_ladder_mkmatrix2:
bne neon_ladder_mkmatrix2
@ unwrap toeplitz matrix into the full form
vst1.32 {q0}, [r1]!
add r1, #16 @ TODO: remove this for A then B output
vst1.32 {q0}, [r1,:128]!
vext.32 q8, q2, q0, #3
vst1.32 {q8}, [r1]!
vst1.32 {q8}, [r1,:128]!
vext.32 q9, q2, q0, #2
vst1.32 {q9}, [r1]!
vst1.32 {q9}, [r1,:128]!
vext.32 q10, q2, q0, #1
vst1.32 {q10}, [r1]!
vst1.32 {q10}, [r1,:128]!
sub r1, #80 @ TODO: remove this for A then B output
adr r2, neon_ladder_mkmatrix_const
vld1.32 {d2[], d3[]}, [r2]
vmov.f32 q1, 1.0
vld1.32 {d4[0]}, [r0] @ k
vadd.f32 d4, d2
vrecpe.f32 d6, d4
vrecps.f32 d2, d6, d4
vrecps.f32 d5, d6, d4
vadd.f32 q0, q8
vadd.f32 q9, q10
vsub.f32 q15, q1, q0
vmul.f32 d6, d2 @ 1 / (1 + k)
vmul.f32 d6, d5 @ 1 / (1 + k)
vsub.f32 q15, q9
vmul.f32 q15, d6[0]
vst1.32 {q15}, [r1]!
vst1.32 {q15}, [r1,:128]!
vpop {q4-q7}
bx lr

@ -38,6 +38,7 @@ void neon_ladder_nl(const int32_t *in, const float *a, int32_t *out, int count,
extern "C"
void neon_ladder_lin(const int32_t *in, const float *a, int32_t *out, int count,
float *state);
extern "C" void neon_ladder_mkmatrix(const float *in, float *out);
#endif
double this_sample_rate;
@ -56,7 +57,6 @@ ResoFilter::ResoFilter() {
}
int32_t compute_alpha(int32_t logf) {
// TODO: better tuning
return min(1 << 24, Freqlut::lookup(logf));
}
@ -178,9 +178,15 @@ static void make_state_transition(float result[20], int32_t f0, int32_t k) {
}
void test_matrix() {
float a[20];
make_state_transition(a, 1.0 * (1 << 24), 3.99 * (1 << 24));
dump_matrix(a);
float params[2] = {1.0, 3.99};
AlignedBuf<float, 20> a;
make_state_transition(a.get(), params[0] * (1 << 24), params[1] * (1 << 24));
dump_matrix(a.get());
#ifdef HAVE_NEON
params[0] /= 16;
neon_ladder_mkmatrix(params, a.get());
dump_matrix(a.get());
#endif
}
#if defined(USE_MATRIX)
@ -193,11 +199,20 @@ static float sigmoid(float x, float overdrive) {
void ResoFilter::process(const int32_t **inbufs, const int32_t *control_in,
const int32_t *control_last, int32_t **outbufs) {
AlignedBuf<float, 20> a;
make_state_transition(a.get(), compute_alpha(control_in[0]), control_in[1]);
float overdrive = control_in[2] * (1.0 / (1 << 24));
const int32_t *ibuf = inbufs[0];
int32_t *obuf = outbufs[0];
bool useneon = hasNeon();
if (useneon) {
#ifdef HAVE_NEON
float params[2];
params[0] = compute_alpha(control_in[0]) * (1.0 / (1 << 28));
params[1] = control_in[1] * (1.0 / (1 << 24));
neon_ladder_mkmatrix(params, a.get());
#endif
} else {
make_state_transition(a.get(), compute_alpha(control_in[0]), control_in[1]);
}
if (overdrive < 0.01) {
if (useneon) {

Loading…
Cancel
Save