diff --git a/app/build.gradle b/app/build.gradle index d158a9e..a00706a 100644 --- a/app/build.gradle +++ b/app/build.gradle @@ -22,6 +22,11 @@ model { moduleName "synth" ldLibs.addAll(['log', 'OpenSLES']) } + abis { + create("armeabi-v7a") { + cppFlags.add('-mfpu=neon') + } + } sources { main { jni { diff --git a/app/src/main/jni/fm_op_kernel.cc b/app/src/main/jni/fm_op_kernel.cc index 1a7e10b..bf1c8c5 100644 --- a/app/src/main/jni/fm_op_kernel.cc +++ b/app/src/main/jni/fm_op_kernel.cc @@ -21,12 +21,132 @@ #include "sin.h" #include "fm_op_kernel.h" -#ifdef HAVE_NEON +#ifdef HAVE_NEON_INTRINSICS extern "C" void neon_fm_kernel(const int *in, const int *busin, int *out, int count, int32_t phase0, int32_t freq, int32_t gain1, int32_t dgain); +const int32_t __attribute__ ((aligned(16))) const_0_1_2_3_arg[4] = {0, 1, 2, 3}; +const int32_t __attribute__ ((aligned(16))) mask23_arg = 0x7fffff; +const float32_t __attribute__ ((aligned(16))) coeffs_arg[4] = { + -0.01880853017455781, 0.25215252666796095, -1.2333439964934032, 1.0 +}; + +void neon_fm_kernel(const int *in, const int *busin, int *out, int count, + int32_t phase0, int32_t freq_arg, int32_t gain1_arg, int32_t dgain_arg) { + int32x4_t phase = vld1q_dup_s32(&phase0); + int32x4_t freq = vld1q_dup_s32(&freq_arg); + int32x4_t const_0_1_2_3 = vld1q_s32(const_0_1_2_3_arg); + phase = vmlaq_s32(phase, freq, const_0_1_2_3); + int32x4_t gain1 = vld1q_dup_s32(&gain1_arg); + int32x4_t dgain = vld1q_dup_s32(&dgain_arg); + gain1 = vmlaq_s32(gain1, dgain, const_0_1_2_3); + int32x4_t mask23 = vld1q_dup_s32(&mask23_arg); + float32x4_t coeffs = vld1q_f32(coeffs_arg); + float32x4_t gainf = vcvtq_n_f32_s32(gain1, 24); + int32x4_t freq4 = vshlq_n_s32(freq, 2); + float32x4_t dgainf = vcvtq_n_f32_s32(dgain, 22); + + count -= 4; + int32x4_t q15 = vmovq_n_s32(0x800000); + int32x4_t q7 = vmovq_n_s32(0x400000); + while (true) { + int32x4_t phase4 = vaddq_s32(phase, freq4); + int32x4_t phase8 = vaddq_s32(phase4, freq4); + int32x4_t data1a = vld1q_s32(in); + data1a = vaddq_s32(data1a, phase); + int32x4_t data1b = vld1q_s32(in + 4); + data1b = vaddq_s32(data1b, phase4); + int32x4_t data1c = vld1q_s32(in + 8); + data1c = vaddq_s32(data1c, phase8); + phase = vaddq_s32(phase8, freq4); + in += 12; + int32x4_t data4a = (int32x4_t)vtstq_s32(data1a, q15); + int32x4_t data4b = (int32x4_t)vtstq_s32(data1b, q15); + int32x4_t data4c = (int32x4_t)vtstq_s32(data1c, q15); + data1a = vandq_s32(data1a, mask23); + data1b = vandq_s32(data1b, mask23); + data1c = vandq_s32(data1c, mask23); + data1a = vsubq_s32(data1a, q7); + data1b = vsubq_s32(data1b, q7); + data1c = vsubq_s32(data1c, q7); + float32x4_t fdata1a = vcvtq_n_f32_s32(data1a, 22); + float32x4_t fdata1b = vcvtq_n_f32_s32(data1b, 22); + float32x4_t fdata1c = vcvtq_n_f32_s32(data1c, 22); + fdata1a = vmulq_f32(fdata1a, fdata1a); + fdata1b = vmulq_f32(fdata1b, fdata1b); + fdata1c = vmulq_f32(fdata1c, fdata1c); + float32x4_t fdata2a = vdupq_lane_f32(vget_low_f32(coeffs), 1); + float32x4_t fdata2b = vdupq_lane_f32(vget_low_f32(coeffs), 1); + float32x4_t fdata2c = vdupq_lane_f32(vget_low_f32(coeffs), 1); + fdata2a = vmlaq_lane_f32(fdata2a, fdata1a, vget_low_f32(coeffs), 0); + fdata2b = vmlaq_lane_f32(fdata2b, fdata1b, vget_low_f32(coeffs), 0); + fdata2c = vmlaq_lane_f32(fdata2c, fdata1c, vget_low_f32(coeffs), 0); + float32x4_t fdata3a = vdupq_lane_f32(vget_high_f32(coeffs), 0); + float32x4_t fdata3b = vdupq_lane_f32(vget_high_f32(coeffs), 0); + float32x4_t fdata3c = vdupq_lane_f32(vget_high_f32(coeffs), 0); + fdata3a = vmlaq_f32(fdata3a, fdata1a, fdata2a); + fdata3b = vmlaq_f32(fdata3b, fdata1b, fdata2b); + fdata3c = vmlaq_f32(fdata3c, fdata1c, fdata2c); + fdata2a = vdupq_lane_f32(vget_high_f32(coeffs), 1); + fdata2b = vdupq_lane_f32(vget_high_f32(coeffs), 1); + fdata2c = vdupq_lane_f32(vget_high_f32(coeffs), 1); + fdata2a = vmlaq_f32(fdata2a, fdata1a, fdata3a); + fdata2b = vmlaq_f32(fdata2b, fdata1b, fdata3b); + fdata2c = vmlaq_f32(fdata2c, fdata1c, fdata3c); + fdata3a = vaddq_f32(gainf, dgainf); + fdata3b = vaddq_f32(fdata3a, dgainf); + fdata2a = vmulq_f32(fdata2a, gainf); + fdata2b = vmulq_f32(fdata2b, fdata3a); + fdata2c = vmulq_f32(fdata2c, fdata3b); + gainf = vaddq_f32(fdata3b, dgainf); + int32x4_t data3a = vcvtq_n_s32_f32(fdata2a, 24); + int32x4_t data3b = vcvtq_n_s32_f32(fdata2b, 24); + int32x4_t data3c = vcvtq_n_s32_f32(fdata2c, 24); + data1a = vld1q_s32(busin); + data1b = vld1q_s32(busin + 4); + data1c = vld1q_s32(busin + 8); + busin += 12; + data3a = veorq_s32(data3a, data4a); + data3b = veorq_s32(data3b, data4b); + data3c = veorq_s32(data3c, data4c); + data3a = vaddq_s32(data3a, data1a); + data3b = vaddq_s32(data3b, data1b); + data3c = vaddq_s32(data3c, data1c); + vst1q_s32(out, data3a); + vst1q_s32(out + 4, data3b); + vst1q_s32(out + 8, data3c); + out += 12; + count -= 12; + if (count <= 0) { + if (count == 0) { + // finish last chunk of 4 + data1a = vld1q_s32(in); + data1a = vaddq_s32(data1a, phase); + data4a = (int32x4_t)vtstq_s32(data1a, q15); + data1a = vandq_s32(data1a, mask23); + data1a = vsubq_s32(data1a, q7); + fdata1a = vcvtq_n_f32_s32(data1a, 22); + fdata1a = vmulq_f32(fdata1a, fdata1a); + fdata2a = vdupq_lane_f32(vget_low_f32(coeffs), 1); + fdata2a = vmlaq_lane_f32(fdata2a, fdata1a, vget_low_f32(coeffs), 0); + fdata3a = vdupq_lane_f32(vget_high_f32(coeffs), 0); + fdata3a = vmlaq_f32(fdata3a, fdata1a, fdata2a); + fdata2a = vdupq_lane_f32(vget_high_f32(coeffs), 1); + fdata2a = vmlaq_f32(fdata2a, fdata1a, fdata3a); + fdata2a = vmulq_f32(fdata2a, gainf); + data3a = vcvtq_n_s32_f32(fdata2a, 24); + data1a = vld1q_s32(busin); + data3a = veorq_s32(data3a, data4a); + data3a = vaddq_s32(data3a, data1a); + vst1q_s32(out, data3a); + } + break; + } + } +} + const int32_t __attribute__ ((aligned(16))) zeros[N] = {0}; #endif @@ -38,7 +158,7 @@ void FmOpKernel::compute(int32_t *output, const int32_t *input, int32_t gain = gain1; int32_t phase = phase0; if (hasNeon()) { -#ifdef HAVE_NEON +#ifdef HAVE_NEON_INTRINSICS neon_fm_kernel(input, add ? output : zeros, output, N, phase0, freq, gain, dgain); #endif @@ -68,7 +188,7 @@ void FmOpKernel::compute_pure(int32_t *output, int32_t phase0, int32_t freq, int32_t gain = gain1; int32_t phase = phase0; if (hasNeon()) { -#ifdef HAVE_NEON +#ifdef HAVE_NEON_INTRINSICS neon_fm_kernel(zeros, add ? output : zeros, output, N, phase0, freq, gain, dgain); #endif diff --git a/app/src/main/jni/resofilter.cc b/app/src/main/jni/resofilter.cc index 7db35bb..1b89342 100644 --- a/app/src/main/jni/resofilter.cc +++ b/app/src/main/jni/resofilter.cc @@ -202,7 +202,7 @@ void ResoFilter::process(const int32_t **inbufs, const int32_t *control_in, float overdrive = control_in[2] * (1.0 / (1 << 24)); const int32_t *ibuf = inbufs[0]; int32_t *obuf = outbufs[0]; - bool useneon = hasNeon(); + bool useneon = false; // TODO: hasNeon(); if (useneon) { #ifdef HAVE_NEON float params[2]; diff --git a/app/src/main/jni/synth.h b/app/src/main/jni/synth.h index 1addd5d..262a987 100644 --- a/app/src/main/jni/synth.h +++ b/app/src/main/jni/synth.h @@ -51,11 +51,21 @@ inline static T max(const T& a, const T& b) { return a > b ? a : b; } -#ifdef HAVE_NEON -#include +#ifdef __aarch64__ +#define HAVE_NEON_INTRINSICS +#include +static inline bool hasNeon() { + return true; +} +#elif __ARM_NEON__ +#define HAVE_NEON_INTRINSICS +#include +// TODO: get this working (although maybe we don't even care about non-NEON ARM any more) +//#include static inline bool hasNeon() { - return (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) != 0; + return true; + //return (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) != 0; } #else static inline bool hasNeon() { @@ -63,6 +73,4 @@ static inline bool hasNeon() { } #endif - - #endif // __SYNTH_H