Neon-optimized ladder filter

Extremely highly optimized implementations of the ladder filter in Neon
assembler. The audio loops (both linear and non-linear) are hooked up
and running, while the matrix generation is still running in scalar
code, though the Neon version has been tested and benchmarked.

Performance numbers on Nexus 10: linear audio loop = 22.5 cycles.
Nonlinear audio loop = 62 cycles. Matrix generation = 580 cycles.

Note that the current code will crash on ARM v7 devices without Neon
(for example, Motorola Xoom).
master
Raph Levien 12 years ago
parent e3b5ceebfc
commit cc93548845
  1. 22
      android/jni/Android.mk
  2. 321
      cpp/src/neon_ladder.s
  3. 74
      cpp/src/resofilter.cc
  4. 2
      cpp/src/resofilter.h
  5. 6
      cpp/src/synth_unit.cc
  6. 28
      cpp/src/test_neon.cc

@ -22,7 +22,8 @@ LOCAL_SRC_FILES := android_glue.cc \
ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
LOCAL_ARM_NEON := true
LOCAL_CFLAGS := -DHAVE_NEON=1
LOCAL_SRC_FILES += neon_fm_kernel.s
LOCAL_SRC_FILES += neon_fm_kernel.s \
neon_ladder.s
endif
# for native audio
@ -36,4 +37,23 @@ LOCAL_CFLAGS += -O3
include $(BUILD_SHARED_LIBRARY)
include $(CLEAR_VARS)
LOCAL_SRC_FILES := test_neon.cc \
resofilter.cc
ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
LOCAL_ARM_NEON := true
LOCAL_CFLAGS := -DHAVE_NEON=1
LOCAL_SRC_FILES += neon_fm_kernel.s \
neon_ladder.s
endif
LOCAL_CFLAGS += -O3
LOCAL_STATIC_LIBRARIES += cpufeatures
LOCAL_MODULE := test_neon
include $(BUILD_EXECUTABLE)
$(call import-module,android/cpufeatures)

@ -0,0 +1,321 @@
.text
.align 2
.global neon_ladder_nl
.type neon_ladder_nl, %function
neon_ladder_nl:
@ r0 = pointer to input
@ r1 = pointer to filter (A then B then k then overdrive)
@ r2 = pointer to output
@ r3 = number of samples
@ stack[0] = pointer to state
@q12 - q15 matrix (A)
@q11 B
@q0 input
@q1 state
@q3 const 1.0
@q6 overdrive, output gain
@q10 resonance (k)
push {r4-r5}
vpush {q4-q7}
ldr r4, [sp, #(4 * 18)]
vld1.32 {q1}, [r4:128]
vld1.32 {q12}, [r1:128]!
vld1.32 {q13}, [r1:128]!
vld1.32 {q14}, [r1:128]!
vld1.32 {q15}, [r1:128]!
vld1.32 {q11}, [r1:128]!
vld1.32 {d20[], d21[]}, [r1]!
vld1.32 {d12[0]}, [r1]!
vld1.32 {d12[1]}, [r1]!
adr r5, neon_ladder_nl_const
vld1.32 {q3}, [r5:128]!
neon_ladder_nl_1:
@ cycle counting suggests that careful unrolling would
@ save about 10% - we will avoid the complexity for now
vld1.32 {d0[0]}, [r0]!
vcvt.f32.s32 q0, q0
@ compute resonance
vmls.f32 q0, q10, d3[1]
@ q4 gets q1/sqrt(1 + q1*q1)
@ q0 gets q0/sqrt(1 + q0*q0)
vmul.f32 q2, q1, d12[0] @ q1 * overdrive
vmul.f32 q7, q0, d12[0]
vmul.f32 q2, q2, q2
vmul.f32 q7, q7, q7
vadd.f32 q2, q3 @ z = 1 + (q1 * overdrive)^2
vadd.f32 q7, q3
vrsqrte.f32 q4, q2 @ est ~ 1/sqrt(z)
vrsqrte.f32 q8, q7
vmul.f32 q2, q4, q2 @ z * est
vmul.f32 q7, q8, q7
vrsqrts.f32 q2, q2, q4 @ (3 - z * est^2)/2
vrsqrts.f32 q7, q7, q8
vmul.f32 q4, q1 @ q1 * est
vmul.f32 q0, q8
vmul.f32 q4, q2 @ q1 * est * (3 - z * est^2)/2
vmul.f32 q0, q7
vmla.f32 q1, q12, d8[0] @ sigmoid(y0)
vmul.f32 q2, q11, d0[0] @ sigmod(x - k * y3)
vmla.f32 q1, q13, d8[1] @ sigmoid(y1)
vmla.f32 q2, q14, d9[0] @ sigmoid(y2)
vmla.f32 q1, q15, d9[1] @ sigmoid(y3)
vadd.f32 q1, q2
vmul.f32 q4, q1, d12[1]
vcvt.s32.f32 q4, q4
vst1.32 {d9[1]}, [r2]!
subs r3, #1
bne neon_ladder_nl_1
vst1.32 {q1}, [r4:128]
vpop {q4-q7}
pop {r4-r5}
bx lr
.size neon_ladder_nl, .-neon_ladder_nl
.balign 16
neon_ladder_nl_const:
.float 1.0, 1.0, 1.0, 1.0
.align 2
.global neon_ladder_lin
.type neon_ladder_lin, %function
.if 0
@ Simpler, not as optimized approach (26.3 cycles on N10)
neon_ladder_lin:
@ r0 = pointer to input
@ r1 = pointer to filter (A then B)
@ r2 = pointer to output
@ r3 = number of samples
@ stack[0] = pointer to state
@q12 - q15 matrix (A)
@q11 B
@q0 input
@q1 state
push {r4-r5}
ldr r4, [sp, #(4 * 2)]
vld1.32 {q1}, [r4:128]
vld1.32 {q12}, [r1:128]!
vld1.32 {q13}, [r1:128]!
vld1.32 {q14}, [r1:128]!
vld1.32 {q15}, [r1:128]!
vld1.32 {q11}, [r1:128]!
neon_ladder_lin_1:
vld1.32 {d0[0]}, [r0]!
vmov q3, q1
vcvt.f32.s32 q0, q0
vmul.f32 q1, q12, d6[0] @ y0
vmul.f32 q2, q13, d6[1] @ y1
vmla.f32 q1, q11, d0[0] @ x
vmla.f32 q2, q14, d7[0] @ y2
vmla.f32 q1, q15, d7[1] @ y3
vadd.f32 q1, q2
vcvt.s32.f32 q3, q1
vst1.32 {d7[1]}, [r2]!
subs r3, #1
bne neon_ladder_lin_1
vst1.32 {q1}, [r4:128]
pop {r4-r5}
bx lr
.else
@ Unrolled, more highly optimized loop (22.5 cycles on N10)
neon_ladder_lin:
@ r0 = pointer to input
@ r1 = pointer to filter (A then B)
@ r2 = pointer to output
@ r3 = number of samples
@ stack[0] = pointer to state
@q12 - q15 matrix (A)
@q11 B
@q0 input
@q1 state
push {r4-r5}
ldr r4, [sp, #(4 * 2)]
vld1.32 {q1}, [r4:128]
vld1.32 {q12}, [r1:128]!
vld1.32 {q13}, [r1:128]!
vld1.32 {q14}, [r1:128]!
vld1.32 {q15}, [r1:128]!
vld1.32 {q11}, [r1:128]!
neon_ladder_lin_1:
vld1.32 {q0}, [r0:128]!
vcvt.f32.s32 q0, q0
vmul.f32 q9, q12, d2[0] @ y0
vmul.f32 q2, q13, d2[1] @ y1
vmla.f32 q9, q11, d0[0] @ x
vmla.f32 q2, q14, d3[0] @ y2
vmla.f32 q9, q15, d3[1] @ y3
vadd.f32 q2, q9
vmul.f32 q1, q11, d0[1] @ x
vmul.f32 q9, q12, d4[0] @ y0
vmla.f32 q1, q13, d4[1] @ y1
vmla.f32 q9, q14, d5[0] @ y2
vcvt.s32.f32 q3, q2
vmla.f32 q1, q15, d5[1] @ y3
vst1.32 {d7[1]}, [r2]!
vadd.f32 q1, q9
vmul.f32 q9, q11, d1[0] @ x
vmul.f32 q2, q12, d2[0] @ y0
vmla.f32 q9, q13, d2[1] @ y1
vmla.f32 q2, q14, d3[0] @ y2
vcvt.s32.f32 q3, q1
vmla.f32 q9, q15, d3[1] @ y3
vst1.32 {d7[1]}, [r2]!
vadd.f32 q2, q9
vmul.f32 q1, q11, d1[1] @ x
vmul.f32 q9, q12, d4[0] @ y0
vmla.f32 q1, q13, d4[1] @ y1
vmla.f32 q9, q14, d5[0] @ y2
vcvt.s32.f32 q3, q2
vmla.f32 q1, q15, d5[1] @ y3
vst1.32 {d7[1]}, [r2]!
vadd.f32 q1, q9
vcvt.s32.f32 q3, q1
vst1.32 {d7[1]}, [r2]!
subs r3, #4
bne neon_ladder_lin_1
vst1.32 {q1}, [r4:128]
pop {r4-r5}
bx lr
.endif
.size neon_ladder_lin, .-neon_ladder_lin
.global neon_ladder_mkmatrix
.type neon_ladder_mkmatrix, %function
neon_ladder_mkmatrix:
@ r0 = pointer to params (a, k)
@ r1 = out pointer to matrix (A then B, just like consumer)
vpush {q4-q7}
vld1.32 {d0[]}, [r0]! @ a
vmov.i32 d1, #0
vneg.f32 d2, d0
vmov.f32 s0, s4
vld1.32 {d6[0]}, [r0] @ k
vmov.i32 q2, #0
vmul.f32 s9, s12, s0
adr r2, neon_ladder_mkmatrix_const
vmov.i32 q1, #0
vld1.32 {d2[0]}, [r2]!
vadd.f32 q14, q1, q0
vmov q15, q2
vmov q6, q0
vmov q7, q2
@ (q0, q2) is jacobian matrix, (q14, q15) is series accum
@ (q6, q7) is jacobian ^ i
mov r3, #3
neon_ladder_mkmatrix1:
vext.32 q8, q7, q6, #3
vmul.f32 q12, q6, d0[0]
vmul.f32 q13, q8, d4[1]
vext.32 q9, q7, q6, #2
vmla.f32 q12, q8, d0[1]
vmla.f32 q13, q9, d5[0]
vext.32 q10, q6, q6, #1
vmla.f32 q12, q9, d1[0]
vmla.f32 q13, q10, d5[1]
vmla.f32 q12, q10, d1[1]
vmla.f32 q13, q7, d0[0]
vld1.32 {d6[0]}, [r2]!
vmla.f32 q14, q12, d6[0]
vmov q6, q12
vmla.f32 q15, q13, d6[0]
vmov q7, q13
subs r3, #1
bne neon_ladder_mkmatrix1
vmov q0, q14
vmov q2, q15
mov r3, #4
neon_ladder_mkmatrix2:
vext.32 q8, q2, q0, #3
@ q0 = {a11, a21, a31, a41}, q2 = {0, a14, a24, a34}
@ square the matrix
vmul.f32 q12, q0, d0[0]
vmul.f32 q13, q8, d4[1]
vext.32 q9, q2, q0, #2
vmla.f32 q12, q8, d0[1]
vmla.f32 q13, q9, d5[0]
vext.32 q10, q2, q0, #1
vmla.f32 q12, q9, d1[0]
vmla.f32 q13, q10, d5[1]
vmla.f32 q12, q10, d1[1]
vmla.f32 q13, q2, d0[0]
vmov q0, q12
vmov q2, q13
subs r3, #1
bne neon_ladder_mkmatrix2
@ unwrap toeplitz matrix into the full form
vst1.32 {q0}, [r1]!
vext.32 q8, q2, q0, #3
vst1.32 {q8}, [r1]!
vext.32 q9, q2, q0, #2
vst1.32 {q9}, [r1]!
vext.32 q10, q2, q0, #1
vst1.32 {q10}, [r1]!
adr r2, neon_ladder_mkmatrix_const
vld1.32 {d2[], d3[]}, [r2]
vld1.32 {d4[0]}, [r0] @ k
vadd.f32 d4, d2
vrecpe.f32 d6, d4
vrecps.f32 d2, d6, d4
vadd.f32 q0, q8
vadd.f32 q9, q10
vsub.f32 q15, q1, q0
vmul.f32 d6, d2 @ 1 / (1 + k)
vsub.f32 q15, q9
vmul.f32 q15, d6[0]
vst1.32 {q15}, [r1]!
vpop {q4-q7}
bx lr
.size neon_ladder_mkmatrix, .-neon_ladder_mkmatrix
.balign 16
neon_ladder_mkmatrix_const:
.float 1.0, 0.5, .16666667, .041666667

@ -31,6 +31,15 @@
#include "aligned_buf.h"
#include "resofilter.h"
#ifdef HAVE_NEON
extern "C"
void neon_ladder_nl(const int32_t *in, const float *a, int32_t *out, int count,
float *state);
extern "C"
void neon_ladder_lin(const int32_t *in, const float *a, int32_t *out, int count,
float *state);
#endif
double this_sample_rate;
void ResoFilter::init(double sample_rate) {
@ -39,7 +48,7 @@ void ResoFilter::init(double sample_rate) {
ResoFilter::ResoFilter() {
for (int i = 0; i < 4; i++) {
x[i] = 0;
x.get()[i] = 0;
#if defined(NONLINEARITY)
w[i] = 0;
#endif
@ -188,14 +197,28 @@ void ResoFilter::process(const int32_t **inbufs, const int32_t *control_in,
float overdrive = control_in[2] * (1.0 / (1 << 24));
const int32_t *ibuf = inbufs[0];
int32_t *obuf = outbufs[0];
bool useneon = false;
#ifdef HAVE_NEON
useneon = true; // TODO: detect
#endif
if (overdrive == 0) {
if (useneon) {
#ifdef HAVE_NEON
AlignedBuf<float, 20> a_neon;
matcopy(a_neon.get(), a.get() + 4, 16);
matcopy(a_neon.get() + 16, a.get(), 4);
neon_ladder_lin(ibuf, a_neon.get(), obuf, n, x.get());
#endif
} else {
for (int i = 0; i < n; i++) {
float signal = ibuf[i];
float tmp[4];
matvec4(tmp, a.get() + 4, x);
matvec4(tmp, a.get() + 4, x.get());
for (int k = 0; k < 4; k++) {
x[k] = tmp[k] + signal * a.get()[k];
obuf[i] = x[3];
x.get()[k] = tmp[k] + signal * a.get()[k];
obuf[i] = x.get()[3];
}
}
}
} else {
@ -205,18 +228,31 @@ void ResoFilter::process(const int32_t **inbufs, const int32_t *control_in,
a.get()[4 + 5 * i] -= 1.0;
a.get()[16 + i] += k * a.get()[i];
}
if (useneon) {
#ifdef HAVE_NEON
// Neon implementation has A first, then B
AlignedBuf<float, 23> a_neon;
matcopy(a_neon.get(), a.get() + 4, 16);
matcopy(a_neon.get() + 16, a.get(), 4);
a_neon.get()[20] = k;
a_neon.get()[21] = overdrive * (1.0 / (1 << 24));
a_neon.get()[22] = ogain;
neon_ladder_nl(ibuf, a_neon.get(), obuf, n, x.get());
#endif
} else {
for (int i = 0; i < n; i++) {
float signal = ibuf[i];
float tmp[4];
float tx[4];
for (int j = 0; j < 4; j++) {
tx[j] = sigmoid(x[j], overdrive);
tx[j] = sigmoid(x.get()[j], overdrive);
}
matvec4(tmp, a.get() + 4, tx);
float xin = sigmoid(signal - k * x[3], overdrive);
float xin = sigmoid(signal - k * x.get()[3], overdrive);
for (int j = 0; j < 4; j++) {
x[j] += tmp[j] + xin * a.get()[j];
obuf[i] = x[3] * ogain;
x.get()[j] += tmp[j] + xin * a.get()[j];
obuf[i] = x.get()[3] * ogain;
}
}
}
}
@ -288,3 +324,25 @@ void ResoFilter::process(const int32_t **inbufs, const int32_t *control_in,
#endif
}
#endif // USE_MATRIX
void reso_benchmark(int niter, bool nonlinear) {
#ifdef HAVE_NEON
AlignedBuf<float, 23> a_neon;
for (int i = 0; i < 23; i++) {
float y = 0.0;
if (i < 20 && (i % 5) == 0) y = 1.0;
a_neon.get()[i] = y;
}
const int n = 64;
AlignedBuf<int32_t, n> inbuf;
AlignedBuf<int32_t, n> outbuf;
AlignedBuf<float, 4> x;
for (int i = 0; i < niter; i++) {
if (nonlinear) {
neon_ladder_nl(inbuf.get(), a_neon.get(), outbuf.get(), n, x.get());
} else {
neon_ladder_lin(inbuf.get(), a_neon.get(), outbuf.get(), n, x.get());
}
}
#endif
}

@ -31,7 +31,7 @@ class ResoFilter : Module {
const int32_t *control_last, int32_t **outbufs);
private:
#if defined(USE_MATRIX)
float x[4];
AlignedBuf<float, 4> x;
#else
int32_t x[4];
#if defined(NONLINEARITY)

@ -250,7 +250,7 @@ void SynthUnit::GetSamples(int n_samples, int16_t *buffer) {
for (; i < n_samples; i += N) {
AlignedBuf<int32_t, N> audiobuf;
int32_t audiobuf2[N];
AlignedBuf<int32_t, N> audiobuf2;
for (int j = 0; j < N; ++j) {
audiobuf.get()[j] = 0;
}
@ -263,11 +263,11 @@ void SynthUnit::GetSamples(int n_samples, int16_t *buffer) {
}
}
const int32_t *bufs[] = { audiobuf.get() };
int32_t *bufs2[] = { audiobuf2 };
int32_t *bufs2[] = { audiobuf2.get() };
filter_.process(bufs, filter_control_, filter_control_, bufs2);
int jmax = n_samples - i;
for (int j = 0; j < N; ++j) {
int32_t val = audiobuf2[j] >> 4;
int32_t val = audiobuf2.get()[j] >> 4;
int clip_val = val < -(1 << 24) ? 0x8000 : val >= (1 << 24) ? 0x7fff :
val >> 9;
// TODO: maybe some dithering?

@ -0,0 +1,28 @@
#include <stdio.h>
#include "aligned_buf.h"
void reso_benchmark(int n, bool nonlinear);
#ifdef HAVE_NEON
extern "C" void neon_ladder_mkmatrix(const float *in, float *out);
#endif
int main(int argc, char** argv) {
#ifdef HAVE_NEON
float in[2] = {0.1, 3.9};
AlignedBuf<float, 20> a;
for (int i = 0; i < 10000000; i++) {
neon_ladder_mkmatrix(in, a.get());
}
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 5; j++) {
printf("%6f ", a.get()[j * 4 + i]);
}
printf("\n");
}
#endif
//reso_benchmark(1000000, false);
return 0;
}
Loading…
Cancel
Save