From 7f9c21cd2ead9c8b60a9e04c2e3c24f44ff53eb6 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Wed, 24 Apr 2013 00:02:33 -0700 Subject: [PATCH] Add NEON accelerated FM kernel The FM kernel lends itself well to speedup using NEON assembler. This patch contains the NEON assembly code, plus C integration code (including making sure that buffers are aligned to 16 bytes). --- android/jni/Android.mk | 11 ++- cpp/src/aligned_buf.h | 34 +++++++ cpp/src/fm_core.cc | 10 +- cpp/src/fm_core.h | 4 +- cpp/src/fm_op_kernel.cc | 84 ++++++++++++----- cpp/src/neon_fm_kernel.s | 195 +++++++++++++++++++++++++++++++++++++++ cpp/src/synth_unit.cc | 9 +- 7 files changed, 314 insertions(+), 33 deletions(-) create mode 100644 cpp/src/aligned_buf.h create mode 100644 cpp/src/neon_fm_kernel.s diff --git a/android/jni/Android.mk b/android/jni/Android.mk index 92dd881..c113c85 100644 --- a/android/jni/Android.mk +++ b/android/jni/Android.mk @@ -16,12 +16,21 @@ LOCAL_SRC_FILES := android_glue.cc \ sin.cc \ synth_unit.cc +ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) + LOCAL_ARM_NEON := true + LOCAL_CFLAGS := -DHAVE_NEON=1 + LOCAL_SRC_FILES += neon_fm_kernel.s +endif + # for native audio LOCAL_LDLIBS += -lOpenSLES # for logging LOCAL_LDLIBS += -llog -LOCAL_CFLAGS := -O3 +LOCAL_STATIC_LIBRARIES += cpufeatures + +LOCAL_CFLAGS += -O3 include $(BUILD_SHARED_LIBRARY) +$(call import-module,android/cpufeatures) \ No newline at end of file diff --git a/cpp/src/aligned_buf.h b/cpp/src/aligned_buf.h new file mode 100644 index 0000000..70aef2a --- /dev/null +++ b/cpp/src/aligned_buf.h @@ -0,0 +1,34 @@ +/* + * Copyright 2013 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// A convenient wrapper for buffers with alignment constraints + +// Note that if we were on C++11, we'd use aligned_storage or somesuch. + +#ifndef __ALIGNED_BUF_H +#define __ALIGNED_BUF_H + +template<typename T, size_t size, size_t alignment = 16> +class AlignedBuf { + public: + T *get() { + return (T *)((((intptr_t)storage_) + alignment - 1) & -alignment); + } + private: + unsigned char storage_[size * sizeof(T) + alignment]; +}; + +#endif // __ALIGNED_BUF_H diff --git a/cpp/src/fm_core.cc b/cpp/src/fm_core.cc index 7f5ed60..3066a09 100644 --- a/cpp/src/fm_core.cc +++ b/cpp/src/fm_core.cc @@ -18,6 +18,10 @@ #include #endif +#ifdef __ANDROID_API__ +#include <cpu-features.h> +#endif + #include "synth.h" #include "fm_op_kernel.h" #include "fm_core.h" @@ -117,7 +121,7 @@ void FmCore::compute(int32_t *output, FmOpParams *params, int algorithm, FmOpParams &param = params[op]; int inbus = (flags >> 4) & 3; int outbus = flags & 3; - int32_t *outptr = (outbus == 0) ? output : buf_[outbus - 1]; + int32_t *outptr = (outbus == 0) ?
output : buf_[outbus - 1].get(); int32_t gain1 = param.gain[0]; int32_t gain2 = param.gain[1]; if (gain1 != 0 || gain2 != 0) { @@ -138,8 +142,8 @@ void FmCore::compute(int32_t *output, FmOpParams *params, int algorithm, } } else { // cout << op << " normal " << inbus << outbus << " " << param.freq << add << endl; - FmOpKernel::compute(outptr, buf_[inbus - 1], param.phase, param.freq, - gain1, gain2, add); + FmOpKernel::compute(outptr, buf_[inbus - 1].get(), + param.phase, param.freq, gain1, gain2, add); } has_contents[outbus] = true; } diff --git a/cpp/src/fm_core.h b/cpp/src/fm_core.h index 8aa9f09..d1b40c1 100644 --- a/cpp/src/fm_core.h +++ b/cpp/src/fm_core.h @@ -17,6 +17,8 @@ #ifndef __FM_CORE_H #define __FM_CORE_H +#include "aligned_buf.h" + struct FmOpParams { int32_t gain[2]; int32_t freq; @@ -29,7 +31,7 @@ class FmCore { void compute(int32_t *output, FmOpParams *params, int algorithm, int32_t *fb_buf, int32_t feedback_gain); private: - int32_t buf_[2][N]; + AlignedBuf<int32_t, N>buf_[2]; }; #endif // __FM_CORE_H diff --git a/cpp/src/fm_op_kernel.cc b/cpp/src/fm_op_kernel.cc index 867ea26..6f2aa69 100644 --- a/cpp/src/fm_op_kernel.cc +++ b/cpp/src/fm_op_kernel.cc @@ -16,30 +16,59 @@ #include +#ifdef HAVE_NEON +#include <cpu-features.h> +#endif + #include "synth.h" #include "sin.h" #include "fm_op_kernel.h" +#ifdef HAVE_NEON +static bool hasNeon() { + return true; + return (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) != 0; +} + +extern "C" +void neon_fm_kernel(const int *in, const int *busin, int *out, int count, + int32_t phase0, int32_t freq, int32_t gain1, int32_t dgain); + +const int32_t __attribute__ ((aligned(16))) zeros[N] = {0}; + +#else +static bool hasNeon() { + return false; +} +#endif + void FmOpKernel::compute(int32_t *output, const int32_t *input, int32_t phase0, int32_t freq, int32_t gain1, int32_t gain2, bool add) { int32_t dgain = (gain2 - gain1 + (N >> 1)) >> LG_N; int32_t gain = gain1; int32_t phase = phase0; - if (add) { - for (int i = 0; i < N; i++) { -
gain += dgain; - int32_t y = Sin::lookup(phase + input[i]); - output[i] += ((int64_t)y * (int64_t)gain) >> 24; - phase += freq; - } + if (hasNeon()) { +#ifdef HAVE_NEON + neon_fm_kernel(input, add ? output : zeros, output, N, + phase0, freq, gain, dgain); +#endif } else { - for (int i = 0; i < N; i++) { - gain += dgain; - int32_t y = Sin::lookup(phase + input[i]); - output[i] = ((int64_t)y * (int64_t)gain) >> 24; - phase += freq; + if (add) { + for (int i = 0; i < N; i++) { + gain += dgain; + int32_t y = Sin::lookup(phase + input[i]); + output[i] += ((int64_t)y * (int64_t)gain) >> 24; + phase += freq; + } + } else { + for (int i = 0; i < N; i++) { + gain += dgain; + int32_t y = Sin::lookup(phase + input[i]); + output[i] = ((int64_t)y * (int64_t)gain) >> 24; + phase += freq; + } } } } @@ -50,19 +79,26 @@ void FmOpKernel::compute_pure(int32_t *output, int32_t phase0, int32_t freq, int32_t dgain = (gain2 - gain1 + (N >> 1)) >> LG_N; int32_t gain = gain1; int32_t phase = phase0; - if (add) { - for (int i = 0; i < N; i++) { - gain += dgain; - int32_t y = Sin::lookup(phase); - output[i] += ((int64_t)y * (int64_t)gain) >> 24; - phase += freq; - } + if (hasNeon()) { +#ifdef HAVE_NEON + neon_fm_kernel(zeros, add ? 
output : zeros, output, N, + phase0, freq, gain, dgain); +#endif } else { - for (int i = 0; i < N; i++) { - gain += dgain; - int32_t y = Sin::lookup(phase); - output[i] = ((int64_t)y * (int64_t)gain) >> 24; - phase += freq; + if (add) { + for (int i = 0; i < N; i++) { + gain += dgain; + int32_t y = Sin::lookup(phase); + output[i] += ((int64_t)y * (int64_t)gain) >> 24; + phase += freq; + } + } else { + for (int i = 0; i < N; i++) { + gain += dgain; + int32_t y = Sin::lookup(phase); + output[i] = ((int64_t)y * (int64_t)gain) >> 24; + phase += freq; + } } } } diff --git a/cpp/src/neon_fm_kernel.s b/cpp/src/neon_fm_kernel.s new file mode 100644 index 0000000..0dc2a47 --- /dev/null +++ b/cpp/src/neon_fm_kernel.s @@ -0,0 +1,195 @@ + .text + + .align 2 + .global neon_fm_kernel + .type neon_fm_kernel, %function +neon_fm_kernel: +@ On entry: +@ r0 = pointer to input +@ r1 = pointer to mix input +@ r2 = pointer to output +@ r3 = number of samples (n) +@ stack[1] = phase0 +@ stack[2] = freq +@ stack[3] = gain +@ stack[4] = dgain +@ Note: pointers must have 128 bit (16 byte) alignment +@ Note: n mod 12 must be either 0 or 4, and n >= 12 +@ Register layout: +@ q0 phase accum +@ q1 gain +@ q2 0x7fffff +@ q3 poly coefs +@ q4-q6 data block 1 +@ q7-q9 data block 2 +@ q10-q12 data block 3 +@ q13-q15 data block 4 +@ [r4] phase bump +@ [r5] gain bump + push {r4-r5} + vpush {q4-q7} + @ TODO: maybe get rid of pipeline stalls in setup + add r5, sp, #(4 * 18) + vld1.32 {d0[], d1[]}, [r5]! @ phase + vld1.32 {d14[], d15[]}, [r5]! @ freq + adr r4, neon_fm_kernel_const + vld1.32 {q5}, [r4:128]! + vld1.32 {q2}, [r4,:128]! + vld1.32 {q3}, [r4,:128]! + vmla.i32 q0, q7, q5 + sub sp, #48 + vld1.32 {d2[], d3[]}, [r5]! @ gain + vld1.32 {d12[], d13[]}, [r5]!
@ dgain + vmla.i32 q1, q6, q5 + vshl.i32 q7, #2 + vcvt.f32.s32 q1, q1, #24 + + add r4, sp, #15 + and r4, #-16 @ align to 16-byte boundary + vst1.32 {q7}, [r4:128] + vcvt.f32.s32 q6, q6, #22 + add r5, r4, #16 + vst1.32 {q6}, [r5:128] + + @vld1.32 {q7}, [r4:128] + vadd.i32 q8, q0, q7 + sub r3, #4 + + vld1.32 {q4}, [r0:128]! + vld1.32 {q5}, [r0:128]! + vadd.i32 q4, q0 + vadd.i32 q9, q8, q7 + vadd.i32 q5, q8 + vld1.32 {q6}, [r0:128]! + vmov.i32 q15, #0x800000 + vadd.i32 q6, q9 + vtst.32 q13, q4, q15 + vtst.32 q14, q5, q15 + vtst.32 q15, q6, q15 + vadd.i32 q0, q9, q7 + vmov.i32 q7, #0x400000 + b neon_fm_kernel_2 +neon_fm_kernel_1: + vld1.32 {q4}, [r0:128]! + vld1.32 {q5}, [r0:128]! + vadd.i32 q4, q0 + vadd.i32 q9, q8, q7 + vadd.i32 q5, q8 + vld1.32 {q6}, [r0:128]! + vst1.32 {q11}, [r2:128]! + vmov.i32 q15, #0x800000 + vadd.i32 q6, q9 + vtst.32 q13, q4, q15 + vtst.32 q14, q5, q15 + vtst.32 q15, q6, q15 + vadd.i32 q0, q9, q7 + vmov.i32 q7, #0x400000 + vst1.32 {q12}, [r2:128]! +neon_fm_kernel_2: + vand q4, q2 + vand q5, q2 + vand q6, q2 + vsub.i32 q4, q7 + vsub.i32 q5, q7 + vsub.i32 q6, q7 + vcvt.f32.s32 q4, q4, #22 + vcvt.f32.s32 q5, q5, #22 + vcvt.f32.s32 q6, q6, #22 + vmul.f32 q4, q4 + vmul.f32 q5, q5 + vmul.f32 q6, q6 + + vdup.32 q7, d6[1] + vdup.32 q8, d6[1] + vdup.32 q9, d6[1] + vmla.f32 q7, q4, d6[0] + vmla.f32 q8, q5, d6[0] + vmla.f32 q9, q6, d6[0] + vdup.32 q10, d7[0] + vdup.32 q11, d7[0] + vdup.32 q12, d7[0] + vmla.f32 q10, q4, q7 + vmla.f32 q11, q5, q8 + vmla.f32 q12, q6, q9 + vdup.32 q7, d7[1] + vdup.32 q8, d7[1] + vdup.32 q9, d7[1] + vmla.f32 q7, q4, q10 + vmla.f32 q8, q5, q11 + vmla.f32 q9, q6, q12 + vld1.32 {q10}, [r5:128] + vadd.f32 q11, q1, q10 + vmul.f32 q7, q1 + vadd.f32 q12, q11, q10 + vmul.f32 q8, q11 + vmul.f32 q9, q12 + vld1.32 {q4}, [r1:128]! + vadd.f32 q1, q12, q10 + vcvt.s32.f32 q10, q7, #24 + vld1.32 {q5}, [r1:128]! + vcvt.s32.f32 q11, q8, #24 + vcvt.s32.f32 q12, q9, #24 + vld1.32 {q6}, [r1:128]! 
+ vld1.32 {q7}, [r4:128] + veor q10, q13 + veor q11, q14 + veor q12, q15 + vadd.i32 q10, q4 + vadd.i32 q11, q5 + vadd.i32 q12, q6 + vadd.i32 q8, q0, q7 + vst1.32 {q10}, [r2:128]! + subs r3, #12 + bgt neon_fm_kernel_1 + beq neon_fm_kernel_3 + @ does not handle n mod 12 == 8 + + add sp, #48 + vpop {q4-q7} + vst1.32 {q11}, [r2:128]! + pop {r4-r5} + vst1.32 {q12}, [r2:128]! + bx lr +neon_fm_kernel_3: + @ finish last chunk of 4 + vld1.32 {q4}, [r0:128]! + vadd.i32 q4, q0 + vmov.i32 q15, #0x800000 + vst1.32 {q11}, [r2:128]! + vtst.32 q13, q4, q15 + vmov.i32 q7, #0x400000 + vand q4, q2 + vst1.32 {q12}, [r2:128]! + vsub.i32 q4, q7 + vcvt.f32.s32 q4, q4, #22 + vmul.f32 q4, q4 + + vdup.32 q7, d6[1] + vmla.f32 q7, q4, d6[0] + vdup.32 q10, d7[0] + vmla.f32 q10, q4, q7 + vdup.32 q7, d7[1] + vmla.f32 q7, q4, q10 + vmul.f32 q7, q1 + vld1.32 {q4}, [r1:128]! + vcvt.s32.f32 q10, q7, #24 + veor q10, q13 + vadd.i32 q10, q4 + vst1.32 {q10}, [r2:128]! + add sp, #48 + vpop {q4-q7} + pop {r4-r5} + bx lr + + .size neon_fm_kernel, .-neon_fm_kernel + + .balign 16 +neon_fm_kernel_const: + .word 0, 1, 2, 3 + .word 0x7fffff, 0x7fffff, 0x7fffff, 0x7fffff + .float -0.01880853017455781, 0.25215252666796095, -1.2333439964934032, 1.0 + + @ vars for bumping + .float 1.0, 1.0, 1.0, 1.0 + .word 0, 0, 0, 0 + .float 0.0, 0.0, 0.0, 0.0 diff --git a/cpp/src/synth_unit.cc b/cpp/src/synth_unit.cc index 81bfb35..63a59e2 100644 --- a/cpp/src/synth_unit.cc +++ b/cpp/src/synth_unit.cc @@ -22,6 +22,7 @@ #include "synth.h" #include "synth_unit.h" +#include "aligned_buf.h" char epiano[] = { 95, 29, 20, 50, 99, 95, 0, 0, 41, 0, 19, 0, 115, 24, 79, 2, 0, @@ -193,17 +194,17 @@ void SynthUnit::GetSamples(int n_samples, int16_t *buffer) { ConsumeInput(input_offset); for (int i = 0; i < n_samples; i += N) { - int32_t audiobuf[N]; + AlignedBuf<int32_t, N> audiobuf; int32_t audiobuf2[N]; for (int j = 0; j < N; ++j) { - audiobuf[j] = 0; + audiobuf.get()[j] = 0; } for (int note = 0; note < max_active_notes; ++note) { if
(active_note_[note].dx7_note != NULL) { - active_note_[note].dx7_note->compute(audiobuf); + active_note_[note].dx7_note->compute(audiobuf.get()); } } - const int32_t *bufs[] = { audiobuf }; + const int32_t *bufs[] = { audiobuf.get() }; int32_t *bufs2[] = { audiobuf2 }; filter_.process(bufs, filter_control_, filter_control_, bufs2); for (int j = 0; j < N; ++j) {