Add NEON accelerated FM kernel

The FM kernel lends itself well to speedup using NEON assembly. This
patch contains the NEON assembly code, plus C integration code
(including making sure that buffers are aligned to 16 bytes).
master
Raph Levien 12 years ago
parent 10392e8260
commit 7f9c21cd2e
  1. 11
      android/jni/Android.mk
  2. 34
      cpp/src/aligned_buf.h
  3. 10
      cpp/src/fm_core.cc
  4. 4
      cpp/src/fm_core.h
  5. 36
      cpp/src/fm_op_kernel.cc
  6. 195
      cpp/src/neon_fm_kernel.s
  7. 9
      cpp/src/synth_unit.cc

@ -16,12 +16,21 @@ LOCAL_SRC_FILES := android_glue.cc \
sin.cc \
synth_unit.cc
# NEON-specific build settings, applied only when building for the
# armeabi-v7a ABI: turn on LOCAL_ARM_NEON, define HAVE_NEON (tested with
# #ifdef in fm_op_kernel.cc), and add the hand-written NEON FM kernel.
ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
LOCAL_ARM_NEON := true
# NOTE(review): ':=' replaces any LOCAL_CFLAGS value set before this point,
# and a later ':=' would discard this define -- confirm that later flag
# additions use '+='.
LOCAL_CFLAGS := -DHAVE_NEON=1
LOCAL_SRC_FILES += neon_fm_kernel.s
endif
# for native audio
LOCAL_LDLIBS += -lOpenSLES
# for logging
LOCAL_LDLIBS += -llog
LOCAL_CFLAGS := -O3
LOCAL_STATIC_LIBRARIES += cpufeatures
LOCAL_CFLAGS += -O3
include $(BUILD_SHARED_LIBRARY)
$(call import-module,android/cpufeatures)

@ -0,0 +1,34 @@
/*
* Copyright 2013 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// A convenient wrapper for buffers with alignment constraints
// Note that if we were on C++11, we'd use aligned_storage or somesuch.
#ifndef __ALIGNED_BUF_H
#define __ALIGNED_BUF_H
// Fixed-capacity buffer of `size` elements of type T whose usable region
// starts on an `alignment`-byte boundary. `alignment` must be a power of two.
// The object over-allocates by `alignment` bytes so that an aligned start
// address always exists inside its own storage.
template<typename T, size_t size, size_t alignment = 16>
class AlignedBuf {
 public:
  // Returns a pointer to the first aligned element slot inside the storage.
  T *get() {
    // Round the raw address up to the next multiple of `alignment`.
    const size_t low_bits = alignment - 1;
    const intptr_t raw_addr = reinterpret_cast<intptr_t>(bytes_);
    return reinterpret_cast<T *>((raw_addr + low_bits) & ~low_bits);
  }

 private:
  // Payload plus slack for the worst-case alignment adjustment.
  unsigned char bytes_[size * sizeof(T) + alignment];
};
#endif // __ALIGNED_BUF_H

@ -18,6 +18,10 @@
#include <iostream>
#endif
#ifdef __ANDROID_API__
#include <cpu-features.h>
#endif
#include "synth.h"
#include "fm_op_kernel.h"
#include "fm_core.h"
@ -117,7 +121,7 @@ void FmCore::compute(int32_t *output, FmOpParams *params, int algorithm,
FmOpParams &param = params[op];
int inbus = (flags >> 4) & 3;
int outbus = flags & 3;
int32_t *outptr = (outbus == 0) ? output : buf_[outbus - 1];
int32_t *outptr = (outbus == 0) ? output : buf_[outbus - 1].get();
int32_t gain1 = param.gain[0];
int32_t gain2 = param.gain[1];
if (gain1 != 0 || gain2 != 0) {
@ -138,8 +142,8 @@ void FmCore::compute(int32_t *output, FmOpParams *params, int algorithm,
}
} else {
// cout << op << " normal " << inbus << outbus << " " << param.freq << add << endl;
FmOpKernel::compute(outptr, buf_[inbus - 1], param.phase, param.freq,
gain1, gain2, add);
FmOpKernel::compute(outptr, buf_[inbus - 1].get(),
param.phase, param.freq, gain1, gain2, add);
}
has_contents[outbus] = true;
}

@ -17,6 +17,8 @@
#ifndef __FM_CORE_H
#define __FM_CORE_H
#include "aligned_buf.h"
struct FmOpParams {
int32_t gain[2];
int32_t freq;
@ -29,7 +31,7 @@ class FmCore {
void compute(int32_t *output, FmOpParams *params, int algorithm,
int32_t *fb_buf, int32_t feedback_gain);
private:
int32_t buf_[2][N];
AlignedBuf<int32_t, N>buf_[2];
};
#endif // __FM_CORE_H

@ -16,17 +16,45 @@
#include <math.h>
#ifdef HAVE_NEON
#include <cpu-features.h>
#endif
#include "synth.h"
#include "sin.h"
#include "fm_op_kernel.h"
#ifdef HAVE_NEON
// Reports whether the CPU we are running on supports NEON, using the NDK
// cpufeatures library. The build defines HAVE_NEON for every armeabi-v7a
// target, but NEON is optional on that ABI, so the check has to happen at
// run time before the assembly kernel is called.
static bool hasNeon() {
  return (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) != 0;
}

// Hand-written NEON kernel (neon_fm_kernel.s). All three pointers must be
// 16-byte aligned; count must be >= 12 with count mod 12 equal to 0 or 4.
extern "C"
void neon_fm_kernel(const int *in, const int *busin, int *out, int count,
                    int32_t phase0, int32_t freq, int32_t gain1, int32_t dgain);

// 16-byte aligned block of zeros, passed to the kernel as the modulation
// input and/or the mix input when there is none.
const int32_t __attribute__ ((aligned(16))) zeros[N] = {0};
#else
// Built without NEON support: always use the portable C++ code path.
static bool hasNeon() {
  return false;
}
#endif
void FmOpKernel::compute(int32_t *output, const int32_t *input,
int32_t phase0, int32_t freq,
int32_t gain1, int32_t gain2, bool add) {
int32_t dgain = (gain2 - gain1 + (N >> 1)) >> LG_N;
int32_t gain = gain1;
int32_t phase = phase0;
if (hasNeon()) {
#ifdef HAVE_NEON
neon_fm_kernel(input, add ? output : zeros, output, N,
phase0, freq, gain, dgain);
#endif
} else {
if (add) {
for (int i = 0; i < N; i++) {
gain += dgain;
@ -43,6 +71,7 @@ void FmOpKernel::compute(int32_t *output, const int32_t *input,
}
}
}
}
#if 1
void FmOpKernel::compute_pure(int32_t *output, int32_t phase0, int32_t freq,
@ -50,6 +79,12 @@ void FmOpKernel::compute_pure(int32_t *output, int32_t phase0, int32_t freq,
int32_t dgain = (gain2 - gain1 + (N >> 1)) >> LG_N;
int32_t gain = gain1;
int32_t phase = phase0;
if (hasNeon()) {
#ifdef HAVE_NEON
neon_fm_kernel(zeros, add ? output : zeros, output, N,
phase0, freq, gain, dgain);
#endif
} else {
if (add) {
for (int i = 0; i < N; i++) {
gain += dgain;
@ -66,6 +101,7 @@ void FmOpKernel::compute_pure(int32_t *output, int32_t phase0, int32_t freq,
}
}
}
}
#endif
#define noDOUBLE_ACCURACY

@ -0,0 +1,195 @@
@ neon_fm_kernel: ARM NEON implementation of the FM operator kernel.
@ Declared on the C++ side (fm_op_kernel.cc) as:
@   void neon_fm_kernel(const int *in, const int *busin, int *out, int count,
@                       int32_t phase0, int32_t freq, int32_t gain1, int32_t dgain);
@ For each sample i the code forms phase_i + in[i], evaluates a polynomial in
@ the squared, centred low 23 bits of that value, scales it by a linearly
@ ramped gain, conditionally complements it according to bit 23, adds the mix
@ input and stores the result to out[i].
@ Saved/restored: r4-r5 and q4-q7. Used without saving: q0-q3, q8-q15.
.text
.align 2
.global neon_fm_kernel
.type neon_fm_kernel, %function
neon_fm_kernel:
@ On entry:
@ r0 = pointer to input
@ r1 = pointer to mix input
@ r2 = pointer to output
@ r3 = number of samples (n)
@ stack[1] = phase0
@ stack[2] = freq
@ stack[3] = gain
@ stack[4] = dgain
@ Note: pointers must have 128 bit (16 byte) alignment
@ Note: n mod 12 must be either 0 or 4, and n >= 12
@ Register layout:
@ q0 phase accum
@ q1 gain
@ q2 0x7fffff
@ q3 poly coefs
@ q4-q6 data block 1
@ q7-q9 data block 2
@ q10-q12 data block 3
@ q13-q15 data block 4
@ [r4] phase bump
@ [r5] gain bump
@ Prologue: 8 bytes (r4-r5) + 64 bytes (q4-q7) = 72 bytes are pushed, so the
@ first stack argument is at sp + 4*18.
push {r4-r5}
vpush {q4-q7}
@ TODO: maybe get rid of pipeline stalls in setup
add r5, sp, #(4 * 18)
@ Each of the following loads replicates one 32-bit stack argument into all
@ four lanes and advances r5 to the next argument.
vld1.32 {d0[], d1[]}, [r5]! @ phase
vld1.32 {d14[], d15[]}, [r5]! @ freq
@ Load the first three rows of the constant table:
@ q5 = {0,1,2,3}, q2 = 0x7fffff mask, q3 = polynomial coefficients.
adr r4, neon_fm_kernel_const
vld1.32 {q5}, [r4:128]!
vld1.32 {q2}, [r4,:128]!
vld1.32 {q3}, [r4,:128]!
@ q0 = phase0 + freq * {0,1,2,3}: per-lane phase of the first four samples.
vmla.i32 q0, q7, q5
@ Reserve 48 bytes of scratch; two 16-byte aligned slots are carved out below.
sub sp, #48
vld1.32 {d2[], d3[]}, [r5]! @ gain
vld1.32 {d12[], d13[]}, [r5]! @ dgain
@ q1 = gain + dgain * {0,1,2,3}: per-lane gain of the first four samples.
vmla.i32 q1, q6, q5
@ q7 = 4 * freq: phase step from one 4-sample vector to the next.
vshl.i32 q7, #2
@ Gain becomes float, interpreting the integer as having 24 fractional bits.
vcvt.f32.s32 q1, q1, #24
add r4, sp, #15
and r4, #-16 @ align to 16-byte boundary
@ [r4] = phase bump (q7 is reused as a scratch/constant register later).
vst1.32 {q7}, [r4:128]
@ Converting dgain with 22 fractional bits yields 4 * dgain on the same scale
@ as q1, i.e. the gain step from one 4-sample vector to the next.
vcvt.f32.s32 q6, q6, #22
add r5, r4, #16
@ [r5] = gain bump.
vst1.32 {q6}, [r5:128]
@vld1.32 {q7}, [r4:128]
@ q8 = phase of the second vector of the first block.
vadd.i32 q8, q0, q7
@ Bias the count by 4 so that after each "subs r3, #12" below:
@   > 0  : another 12-sample block follows
@   == 0 : exactly 4 samples remain (handled at neon_fm_kernel_3)
@   < 0  : all samples have been processed
sub r3, #4
@ First block: load three input vectors and add the running phase to each.
vld1.32 {q4}, [r0:128]!
vld1.32 {q5}, [r0:128]!
vadd.i32 q4, q0
vadd.i32 q9, q8, q7
vadd.i32 q5, q8
vld1.32 {q6}, [r0:128]!
vmov.i32 q15, #0x800000
vadd.i32 q6, q9
@ q13-q15 = per-lane masks, all ones where bit 23 of (phase + input) is set.
@ The last vtst overwrites the 0x800000 constant in q15 with its own mask.
vtst.32 q13, q4, q15
vtst.32 q14, q5, q15
vtst.32 q15, q6, q15
@ q0 = phase of the first vector of the next block.
vadd.i32 q0, q9, q7
@ q7 now holds the centring offset; the phase bump is reloaded from [r4] later.
vmov.i32 q7, #0x400000
@ Skip the loop head, which also flushes results deferred from a previous block.
b neon_fm_kernel_2
neon_fm_kernel_1:
@ Loop head: same input/phase setup as above for the next 12 samples, with the
@ stores of the previous block's second and third result vectors (q11, q12)
@ interleaved between the other instructions.
vld1.32 {q4}, [r0:128]!
vld1.32 {q5}, [r0:128]!
vadd.i32 q4, q0
vadd.i32 q9, q8, q7
vadd.i32 q5, q8
vld1.32 {q6}, [r0:128]!
vst1.32 {q11}, [r2:128]!
vmov.i32 q15, #0x800000
vadd.i32 q6, q9
vtst.32 q13, q4, q15
vtst.32 q14, q5, q15
vtst.32 q15, q6, q15
vadd.i32 q0, q9, q7
vmov.i32 q7, #0x400000
vst1.32 {q12}, [r2:128]!
neon_fm_kernel_2:
@ Keep the low 23 bits, subtract 0x400000 to centre them around zero, and
@ convert with 22 fractional bits: x lies in [-1, 1).
vand q4, q2
vand q5, q2
vand q6, q2
vsub.i32 q4, q7
vsub.i32 q5, q7
vsub.i32 q6, q7
vcvt.f32.s32 q4, q4, #22
vcvt.f32.s32 q5, q5, #22
vcvt.f32.s32 q6, q6, #22
@ x2 = x * x
vmul.f32 q4, q4
vmul.f32 q5, q5
vmul.f32 q6, q6
@ Horner evaluation of ((c0*x2 + c1)*x2 + c2)*x2 + c3 with
@ c0 = d6[0], c1 = d6[1], c2 = d7[0], c3 = d7[1], for three vectors at once.
@ NOTE(review): the coefficients look like an approximation of
@ cos(pi/2 * x) -- confirm against the scalar implementation.
vdup.32 q7, d6[1]
vdup.32 q8, d6[1]
vdup.32 q9, d6[1]
vmla.f32 q7, q4, d6[0]
vmla.f32 q8, q5, d6[0]
vmla.f32 q9, q6, d6[0]
vdup.32 q10, d7[0]
vdup.32 q11, d7[0]
vdup.32 q12, d7[0]
vmla.f32 q10, q4, q7
vmla.f32 q11, q5, q8
vmla.f32 q12, q6, q9
vdup.32 q7, d7[1]
vdup.32 q8, d7[1]
vdup.32 q9, d7[1]
vmla.f32 q7, q4, q10
vmla.f32 q8, q5, q11
vmla.f32 q9, q6, q12
@ Apply the gain ramp: q1, q1 + bump, q1 + 2*bump for the three vectors,
@ then advance q1 by three bumps for the next block.
vld1.32 {q10}, [r5:128]
vadd.f32 q11, q1, q10
vmul.f32 q7, q1
vadd.f32 q12, q11, q10
vmul.f32 q8, q11
vmul.f32 q9, q12
@ Mix-input loads are interleaved with the conversions back to integers
@ with 24 fractional bits.
vld1.32 {q4}, [r1:128]!
vadd.f32 q1, q12, q10
vcvt.s32.f32 q10, q7, #24
vld1.32 {q5}, [r1:128]!
vcvt.s32.f32 q11, q8, #24
vcvt.s32.f32 q12, q9, #24
vld1.32 {q6}, [r1:128]!
@ Restore the phase bump into q7 for the next block's phase arithmetic.
vld1.32 {q7}, [r4:128]
@ Bitwise-complement the lanes whose bit-23 mask is set.
veor q10, q13
veor q11, q14
veor q12, q15
@ Add the mix input.
vadd.i32 q10, q4
vadd.i32 q11, q5
vadd.i32 q12, q6
@ q8 = phase of the second vector of the next block.
vadd.i32 q8, q0, q7
@ Only the first result vector is stored here; q11 and q12 are stored at the
@ head of the next iteration or in one of the exit paths below.
vst1.32 {q10}, [r2:128]!
subs r3, #12
bgt neon_fm_kernel_1
beq neon_fm_kernel_3
@ does not handle n mod 12 == 8
@ Exit path for n mod 12 == 0: release the scratch area, restore the saved
@ registers, and flush the two deferred result vectors.
add sp, #48
vpop {q4-q7}
vst1.32 {q11}, [r2:128]!
pop {r4-r5}
vst1.32 {q12}, [r2:128]!
bx lr
neon_fm_kernel_3:
@ finish last chunk of 4
@ Same computation as the main loop for a single vector, using the current
@ phase (q0) and gain (q1); the deferred stores of q11 and q12 from the last
@ full block are interleaved.
vld1.32 {q4}, [r0:128]!
vadd.i32 q4, q0
vmov.i32 q15, #0x800000
vst1.32 {q11}, [r2:128]!
vtst.32 q13, q4, q15
vmov.i32 q7, #0x400000
vand q4, q2
vst1.32 {q12}, [r2:128]!
vsub.i32 q4, q7
vcvt.f32.s32 q4, q4, #22
vmul.f32 q4, q4
vdup.32 q7, d6[1]
vmla.f32 q7, q4, d6[0]
vdup.32 q10, d7[0]
vmla.f32 q10, q4, q7
vdup.32 q7, d7[1]
vmla.f32 q7, q4, q10
vmul.f32 q7, q1
vld1.32 {q4}, [r1:128]!
vcvt.s32.f32 q10, q7, #24
veor q10, q13
vadd.i32 q10, q4
vst1.32 {q10}, [r2:128]!
@ Release the scratch area and restore the saved registers.
add sp, #48
vpop {q4-q7}
pop {r4-r5}
bx lr
.size neon_fm_kernel, .-neon_fm_kernel
@ Constant table, 16-byte aligned so it can be read with :128 loads.
.balign 16
neon_fm_kernel_const:
@ Row 0: lane indices, used to spread phase and gain across the four lanes.
.word 0, 1, 2, 3
@ Row 1: mask that keeps the low 23 bits of the phase.
.word 0x7fffff, 0x7fffff, 0x7fffff, 0x7fffff
@ Row 2: polynomial coefficients c0..c3 (highest-order term first).
.float -0.01880853017455781, 0.25215252666796095, -1.2333439964934032, 1.0
@ vars for bumping
@ The three rows below are not loaded by the code in this function.
.float 1.0, 1.0, 1.0, 1.0
.word 0, 0, 0, 0
.float 0.0, 0.0, 0.0, 0.0

@ -22,6 +22,7 @@
#include "synth.h"
#include "synth_unit.h"
#include "aligned_buf.h"
char epiano[] = {
95, 29, 20, 50, 99, 95, 0, 0, 41, 0, 19, 0, 115, 24, 79, 2, 0,
@ -193,17 +194,17 @@ void SynthUnit::GetSamples(int n_samples, int16_t *buffer) {
ConsumeInput(input_offset);
for (int i = 0; i < n_samples; i += N) {
int32_t audiobuf[N];
AlignedBuf<int32_t, N> audiobuf;
int32_t audiobuf2[N];
for (int j = 0; j < N; ++j) {
audiobuf[j] = 0;
audiobuf.get()[j] = 0;
}
for (int note = 0; note < max_active_notes; ++note) {
if (active_note_[note].dx7_note != NULL) {
active_note_[note].dx7_note->compute(audiobuf);
active_note_[note].dx7_note->compute(audiobuf.get());
}
}
const int32_t *bufs[] = { audiobuf };
const int32_t *bufs[] = { audiobuf.get() };
int32_t *bufs2[] = { audiobuf2 };
filter_.process(bufs, filter_control_, filter_control_, bufs2);
for (int j = 0; j < N; ++j) {

Loading…
Cancel
Save