diff --git a/android/jni/Android.mk b/android/jni/Android.mk index 9c3b5d6..b8057e0 100644 --- a/android/jni/Android.mk +++ b/android/jni/Android.mk @@ -22,7 +22,8 @@ LOCAL_SRC_FILES := android_glue.cc \ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) LOCAL_ARM_NEON := true LOCAL_CFLAGS := -DHAVE_NEON=1 - LOCAL_SRC_FILES += neon_fm_kernel.s + LOCAL_SRC_FILES += neon_fm_kernel.s \ + neon_ladder.s endif # for native audio @@ -36,4 +37,23 @@ LOCAL_CFLAGS += -O3 include $(BUILD_SHARED_LIBRARY) -$(call import-module,android/cpufeatures) \ No newline at end of file +include $(CLEAR_VARS) +LOCAL_SRC_FILES := test_neon.cc \ + resofilter.cc + +ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) + LOCAL_ARM_NEON := true + LOCAL_CFLAGS := -DHAVE_NEON=1 + LOCAL_SRC_FILES += neon_fm_kernel.s \ + neon_ladder.s +endif + +LOCAL_CFLAGS += -O3 + +LOCAL_STATIC_LIBRARIES += cpufeatures + +LOCAL_MODULE := test_neon + +include $(BUILD_EXECUTABLE) + +$(call import-module,android/cpufeatures) diff --git a/cpp/src/neon_ladder.s b/cpp/src/neon_ladder.s new file mode 100644 index 0000000..9a9a6dc --- /dev/null +++ b/cpp/src/neon_ladder.s @@ -0,0 +1,321 @@ + .text + + .align 2 + .global neon_ladder_nl + .type neon_ladder_nl, %function +neon_ladder_nl: +@ r0 = pointer to input +@ r1 = pointer to filter (A then B then k then overdrive) +@ r2 = pointer to output +@ r3 = number of samples +@ stack[0] = pointer to state +@q12 - q15 matrix (A) +@q11 B + +@q0 input +@q1 state +@q3 const 1.0 +@q6 overdrive, output gain +@q10 resonance (k) + + push {r4-r5} + vpush {q4-q7} + ldr r4, [sp, #(4 * 18)] + vld1.32 {q1}, [r4:128] + + vld1.32 {q12}, [r1:128]! + vld1.32 {q13}, [r1:128]! + vld1.32 {q14}, [r1:128]! + vld1.32 {q15}, [r1:128]! + vld1.32 {q11}, [r1:128]! + vld1.32 {d20[], d21[]}, [r1]! + vld1.32 {d12[0]}, [r1]! + vld1.32 {d12[1]}, [r1]! + + adr r5, neon_ladder_nl_const + vld1.32 {q3}, [r5:128]! + +neon_ladder_nl_1: + @ cycle counting suggests that careful unrolling would + @ save about 10% - we will avoid the complexity for now + + vld1.32 {d0[0]}, [r0]! + vcvt.f32.s32 q0, q0 + + @ compute resonance + vmls.f32 q0, q10, d3[1] + + @ q4 gets q1/sqrt(1 + q1*q1) + @ q0 gets q0/sqrt(1 + q0*q0) + vmul.f32 q2, q1, d12[0] @ q1 * overdrive + vmul.f32 q7, q0, d12[0] + vmul.f32 q2, q2, q2 + vmul.f32 q7, q7, q7 + vadd.f32 q2, q3 @ z = 1 + (q1 * overdrive)^2 + vadd.f32 q7, q3 + vrsqrte.f32 q4, q2 @ est ~ 1/sqrt(z) + vrsqrte.f32 q8, q7 + vmul.f32 q2, q4, q2 @ z * est + vmul.f32 q7, q8, q7 + vrsqrts.f32 q2, q2, q4 @ (3 - z * est^2)/2 + vrsqrts.f32 q7, q7, q8 + vmul.f32 q4, q1 @ q1 * est + vmul.f32 q0, q8 + vmul.f32 q4, q2 @ q1 * est * (3 - z * est^2)/2 + vmul.f32 q0, q7 + + vmla.f32 q1, q12, d8[0] @ sigmoid(y0) + vmul.f32 q2, q11, d0[0] @ sigmod(x - k * y3) + vmla.f32 q1, q13, d8[1] @ sigmoid(y1) + vmla.f32 q2, q14, d9[0] @ sigmoid(y2) + vmla.f32 q1, q15, d9[1] @ sigmoid(y3) + vadd.f32 q1, q2 + + vmul.f32 q4, q1, d12[1] + vcvt.s32.f32 q4, q4 + vst1.32 {d9[1]}, [r2]! + subs r3, #1 + bne neon_ladder_nl_1 + + vst1.32 {q1}, [r4:128] + vpop {q4-q7} + pop {r4-r5} + bx lr + + .size neon_ladder_nl, .-neon_ladder_nl + + .balign 16 +neon_ladder_nl_const: + .float 1.0, 1.0, 1.0, 1.0 + + .align 2 + .global neon_ladder_lin + .type neon_ladder_lin, %function + + .if 0 +@ Simpler, not as optimized approach (26.3 cycles on N10) + +neon_ladder_lin: + +@ r0 = pointer to input +@ r1 = pointer to filter (A then B) +@ r2 = pointer to output +@ r3 = number of samples +@ stack[0] = pointer to state +@q12 - q15 matrix (A) +@q11 B + +@q0 input +@q1 state + + push {r4-r5} + ldr r4, [sp, #(4 * 2)] + vld1.32 {q1}, [r4:128] + + vld1.32 {q12}, [r1:128]! + vld1.32 {q13}, [r1:128]! + vld1.32 {q14}, [r1:128]! + vld1.32 {q15}, [r1:128]! + vld1.32 {q11}, [r1:128]! + +neon_ladder_lin_1: + vld1.32 {d0[0]}, [r0]! + vmov q3, q1 + vcvt.f32.s32 q0, q0 + + vmul.f32 q1, q12, d6[0] @ y0 + vmul.f32 q2, q13, d6[1] @ y1 + vmla.f32 q1, q11, d0[0] @ x + vmla.f32 q2, q14, d7[0] @ y2 + vmla.f32 q1, q15, d7[1] @ y3 + vadd.f32 q1, q2 + + vcvt.s32.f32 q3, q1 + vst1.32 {d7[1]}, [r2]! + subs r3, #1 + bne neon_ladder_lin_1 + + vst1.32 {q1}, [r4:128] + pop {r4-r5} + bx lr +.else +@ Unrolled, more highly optimized loop (22.5 cycles on N10) + +neon_ladder_lin: +@ r0 = pointer to input +@ r1 = pointer to filter (A then B) +@ r2 = pointer to output +@ r3 = number of samples +@ stack[0] = pointer to state +@q12 - q15 matrix (A) +@q11 B + +@q0 input +@q1 state + + push {r4-r5} + ldr r4, [sp, #(4 * 2)] + vld1.32 {q1}, [r4:128] + + vld1.32 {q12}, [r1:128]! + vld1.32 {q13}, [r1:128]! + vld1.32 {q14}, [r1:128]! + vld1.32 {q15}, [r1:128]! + vld1.32 {q11}, [r1:128]! + +neon_ladder_lin_1: + vld1.32 {q0}, [r0:128]! + vcvt.f32.s32 q0, q0 + + vmul.f32 q9, q12, d2[0] @ y0 + vmul.f32 q2, q13, d2[1] @ y1 + vmla.f32 q9, q11, d0[0] @ x + vmla.f32 q2, q14, d3[0] @ y2 + vmla.f32 q9, q15, d3[1] @ y3 + vadd.f32 q2, q9 + + vmul.f32 q1, q11, d0[1] @ x + vmul.f32 q9, q12, d4[0] @ y0 + vmla.f32 q1, q13, d4[1] @ y1 + vmla.f32 q9, q14, d5[0] @ y2 + vcvt.s32.f32 q3, q2 + vmla.f32 q1, q15, d5[1] @ y3 + vst1.32 {d7[1]}, [r2]! + vadd.f32 q1, q9 + + vmul.f32 q9, q11, d1[0] @ x + vmul.f32 q2, q12, d2[0] @ y0 + vmla.f32 q9, q13, d2[1] @ y1 + vmla.f32 q2, q14, d3[0] @ y2 + vcvt.s32.f32 q3, q1 + vmla.f32 q9, q15, d3[1] @ y3 + vst1.32 {d7[1]}, [r2]! + vadd.f32 q2, q9 + + vmul.f32 q1, q11, d1[1] @ x + vmul.f32 q9, q12, d4[0] @ y0 + vmla.f32 q1, q13, d4[1] @ y1 + vmla.f32 q9, q14, d5[0] @ y2 + vcvt.s32.f32 q3, q2 + vmla.f32 q1, q15, d5[1] @ y3 + vst1.32 {d7[1]}, [r2]! + vadd.f32 q1, q9 + + vcvt.s32.f32 q3, q1 + vst1.32 {d7[1]}, [r2]! + subs r3, #4 + bne neon_ladder_lin_1 + + vst1.32 {q1}, [r4:128] + pop {r4-r5} + bx lr +.endif + + .size neon_ladder_lin, .-neon_ladder_lin + + .global neon_ladder_mkmatrix + .type neon_ladder_mkmatrix, %function +neon_ladder_mkmatrix: +@ r0 = pointer to params (a, k) +@ r1 = out pointer to matrix (A then B, just like consumer) + vpush {q4-q7} + vld1.32 {d0[]}, [r0]! @ a + vmov.i32 d1, #0 + vneg.f32 d2, d0 + vmov.f32 s0, s4 + + vld1.32 {d6[0]}, [r0] @ k + vmov.i32 q2, #0 + vmul.f32 s9, s12, s0 + + adr r2, neon_ladder_mkmatrix_const + vmov.i32 q1, #0 + vld1.32 {d2[0]}, [r2]! + vadd.f32 q14, q1, q0 + vmov q15, q2 + vmov q6, q0 + vmov q7, q2 + + @ (q0, q2) is jacobian matrix, (q14, q15) is series accum + @ (q6, q7) is jacobian ^ i + + mov r3, #3 +neon_ladder_mkmatrix1: + vext.32 q8, q7, q6, #3 + vmul.f32 q12, q6, d0[0] + vmul.f32 q13, q8, d4[1] + vext.32 q9, q7, q6, #2 + vmla.f32 q12, q8, d0[1] + vmla.f32 q13, q9, d5[0] + vext.32 q10, q6, q6, #1 + vmla.f32 q12, q9, d1[0] + vmla.f32 q13, q10, d5[1] + vmla.f32 q12, q10, d1[1] + vmla.f32 q13, q7, d0[0] + + vld1.32 {d6[0]}, [r2]! + vmla.f32 q14, q12, d6[0] + vmov q6, q12 + vmla.f32 q15, q13, d6[0] + vmov q7, q13 + subs r3, #1 + bne neon_ladder_mkmatrix1 + + vmov q0, q14 + vmov q2, q15 + + mov r3, #4 + +neon_ladder_mkmatrix2: + vext.32 q8, q2, q0, #3 + + @ q0 = {a11, a21, a31, a41}, q2 = {0, a14, a24, a34} + + @ square the matrix + vmul.f32 q12, q0, d0[0] + vmul.f32 q13, q8, d4[1] + vext.32 q9, q2, q0, #2 + vmla.f32 q12, q8, d0[1] + vmla.f32 q13, q9, d5[0] + vext.32 q10, q2, q0, #1 + vmla.f32 q12, q9, d1[0] + vmla.f32 q13, q10, d5[1] + vmla.f32 q12, q10, d1[1] + vmla.f32 q13, q2, d0[0] + + vmov q0, q12 + vmov q2, q13 + subs r3, #1 + bne neon_ladder_mkmatrix2 + + @ unwrap toeplitz matrix into the full form + vst1.32 {q0}, [r1]! + vext.32 q8, q2, q0, #3 + vst1.32 {q8}, [r1]! + vext.32 q9, q2, q0, #2 + vst1.32 {q9}, [r1]! + vext.32 q10, q2, q0, #1 + vst1.32 {q10}, [r1]! + + adr r2, neon_ladder_mkmatrix_const + vld1.32 {d2[], d3[]}, [r2] + vld1.32 {d4[0]}, [r0] @ k + vadd.f32 d4, d2 + vrecpe.f32 d6, d4 + vrecps.f32 d2, d6, d4 + + vadd.f32 q0, q8 + vadd.f32 q9, q10 + vsub.f32 q15, q1, q0 + vmul.f32 d6, d2 @ 1 / (1 + k) + vsub.f32 q15, q9 + vmul.f32 q15, d6[0] + vst1.32 {q15}, [r1]! + + vpop {q4-q7} + bx lr + + .size neon_ladder_mkmatrix, .-neon_ladder_mkmatrix + .balign 16 +neon_ladder_mkmatrix_const: + .float 1.0, 0.5, .16666667, .041666667 diff --git a/cpp/src/resofilter.cc b/cpp/src/resofilter.cc index 43f50af..f0419ef 100644 --- a/cpp/src/resofilter.cc +++ b/cpp/src/resofilter.cc @@ -31,6 +31,15 @@ #include "aligned_buf.h" #include "resofilter.h" +#ifdef HAVE_NEON +extern "C" +void neon_ladder_nl(const int32_t *in, const float *a, int32_t *out, int count, + float *state); +extern "C" +void neon_ladder_lin(const int32_t *in, const float *a, int32_t *out, int count, + float *state); +#endif + double this_sample_rate; void ResoFilter::init(double sample_rate) { @@ -39,7 +48,7 @@ void ResoFilter::init(double sample_rate) { ResoFilter::ResoFilter() { for (int i = 0; i < 4; i++) { - x[i] = 0; + x.get()[i] = 0; #if defined(NONLINEARITY) w[i] = 0; #endif @@ -188,14 +197,28 @@ void ResoFilter::process(const int32_t **inbufs, const int32_t *control_in, float overdrive = control_in[2] * (1.0 / (1 << 24)); const int32_t *ibuf = inbufs[0]; int32_t *obuf = outbufs[0]; + bool useneon = false; +#ifdef HAVE_NEON + useneon = true; // TODO: detect +#endif + if (overdrive == 0) { - for (int i = 0; i < n; i++) { - float signal = ibuf[i]; - float tmp[4]; - matvec4(tmp, a.get() + 4, x); - for (int k = 0; k < 4; k++) { - x[k] = tmp[k] + signal * a.get()[k]; - obuf[i] = x[3]; + if (useneon) { +#ifdef HAVE_NEON + AlignedBuf a_neon; + matcopy(a_neon.get(), a.get() + 4, 16); + matcopy(a_neon.get() + 16, a.get(), 4); + neon_ladder_lin(ibuf, a_neon.get(), obuf, n, x.get()); +#endif + } else { + for (int i = 0; i < n; i++) { + float signal = ibuf[i]; + float tmp[4]; + matvec4(tmp, a.get() + 4, x.get()); + for (int k = 0; k < 4; k++) { + x.get()[k] = tmp[k] + signal * a.get()[k]; + obuf[i] = x.get()[3]; + } } } } else { @@ -205,18 +228,31 @@ void ResoFilter::process(const int32_t **inbufs, const int32_t *control_in, a.get()[4 + 5 * i] -= 1.0; a.get()[16 + i] += k * a.get()[i]; } - for (int i = 0; i < n; i++) { - float signal = ibuf[i]; - float tmp[4]; - float tx[4]; - for (int j = 0; j < 4; j++) { - tx[j] = sigmoid(x[j], overdrive); - } - matvec4(tmp, a.get() + 4, tx); - float xin = sigmoid(signal - k * x[3], overdrive); - for (int j = 0; j < 4; j++) { - x[j] += tmp[j] + xin * a.get()[j]; - obuf[i] = x[3] * ogain; + if (useneon) { +#ifdef HAVE_NEON + // Neon implementation has A first, then B + AlignedBuf a_neon; + matcopy(a_neon.get(), a.get() + 4, 16); + matcopy(a_neon.get() + 16, a.get(), 4); + a_neon.get()[20] = k; + a_neon.get()[21] = overdrive * (1.0 / (1 << 24)); + a_neon.get()[22] = ogain; + neon_ladder_nl(ibuf, a_neon.get(), obuf, n, x.get()); +#endif + } else { + for (int i = 0; i < n; i++) { + float signal = ibuf[i]; + float tmp[4]; + float tx[4]; + for (int j = 0; j < 4; j++) { + tx[j] = sigmoid(x.get()[j], overdrive); + } + matvec4(tmp, a.get() + 4, tx); + float xin = sigmoid(signal - k * x.get()[3], overdrive); + for (int j = 0; j < 4; j++) { + x.get()[j] += tmp[j] + xin * a.get()[j]; + obuf[i] = x.get()[3] * ogain; + } } } } @@ -288,3 +324,25 @@ void ResoFilter::process(const int32_t **inbufs, const int32_t *control_in, #endif } #endif // USE_MATRIX + +void reso_benchmark(int niter, bool nonlinear) { +#ifdef HAVE_NEON + AlignedBuf a_neon; + for (int i = 0; i < 23; i++) { + float y = 0.0; + if (i < 20 && (i % 5) == 0) y = 1.0; + a_neon.get()[i] = y; + } + const int n = 64; + AlignedBuf inbuf; + AlignedBuf outbuf; + AlignedBuf x; + for (int i = 0; i < niter; i++) { + if (nonlinear) { + neon_ladder_nl(inbuf.get(), a_neon.get(), outbuf.get(), n, x.get()); + } else { + neon_ladder_lin(inbuf.get(), a_neon.get(), outbuf.get(), n, x.get()); + } + } +#endif +} diff --git a/cpp/src/resofilter.h b/cpp/src/resofilter.h index c0720bb..3b71d57 100644 --- a/cpp/src/resofilter.h +++ b/cpp/src/resofilter.h @@ -31,7 +31,7 @@ class ResoFilter : Module { const int32_t *control_last, int32_t **outbufs); private: #if defined(USE_MATRIX) - float x[4]; + AlignedBuf x; #else int32_t x[4]; #if defined(NONLINEARITY) diff --git a/cpp/src/synth_unit.cc b/cpp/src/synth_unit.cc index 9a69f5d..4d11d13 100644 --- a/cpp/src/synth_unit.cc +++ b/cpp/src/synth_unit.cc @@ -250,7 +250,7 @@ void SynthUnit::GetSamples(int n_samples, int16_t *buffer) { for (; i < n_samples; i += N) { AlignedBuf audiobuf; - int32_t audiobuf2[N]; + AlignedBuf audiobuf2; for (int j = 0; j < N; ++j) { audiobuf.get()[j] = 0; } @@ -263,11 +263,11 @@ void SynthUnit::GetSamples(int n_samples, int16_t *buffer) { } } const int32_t *bufs[] = { audiobuf.get() }; - int32_t *bufs2[] = { audiobuf2 }; + int32_t *bufs2[] = { audiobuf2.get() }; filter_.process(bufs, filter_control_, filter_control_, bufs2); int jmax = n_samples - i; for (int j = 0; j < N; ++j) { - int32_t val = audiobuf2[j] >> 4; + int32_t val = audiobuf2.get()[j] >> 4; int clip_val = val < -(1 << 24) ? 0x8000 : val >= (1 << 24) ? 0x7fff : val >> 9; // TODO: maybe some dithering? diff --git a/cpp/src/test_neon.cc b/cpp/src/test_neon.cc new file mode 100644 index 0000000..570e8f0 --- /dev/null +++ b/cpp/src/test_neon.cc @@ -0,0 +1,28 @@ +#include + +#include "aligned_buf.h" + +void reso_benchmark(int n, bool nonlinear); + +#ifdef HAVE_NEON +extern "C" void neon_ladder_mkmatrix(const float *in, float *out); +#endif + +int main(int argc, char** argv) { +#ifdef HAVE_NEON + float in[2] = {0.1, 3.9}; + AlignedBuf a; + for (int i = 0; i < 10000000; i++) { + neon_ladder_mkmatrix(in, a.get()); + } + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 5; j++) { + printf("%6f ", a.get()[j * 4 + i]); + } + printf("\n"); + } +#endif + + //reso_benchmark(1000000, false); + return 0; +}