FIR filter implementation

This is a FIR filter with NEON speedup. The NEON version has been tested
for accuracy against the scalar one, and there is simple benchmarking
code in place as well.
master
Raph Levien 11 years ago
parent 2d433d1056
commit 287427513c
  1. 2
      android/AndroidManifest.xml
  2. 24
      android/jni/Android.mk
  3. 72
      cpp/src/fir.cc
  4. 48
      cpp/src/fir.h
  5. 97
      cpp/src/neon_fir.s
  6. 125
      cpp/src/test_fir.cc

@ -2,7 +2,7 @@
<manifest
xmlns:android="http://schemas.android.com/apk/res/android"
package="com.levien.synthesizer"
android:versionCode="4"
android:versionCode="5"
android:versionName="0.93">
<uses-sdk android:minSdkVersion="9"
android:targetSdkVersion="17"

@ -7,6 +7,7 @@ LOCAL_SRC_FILES := android_glue.cc \
dx7note.cc \
env.cc \
exp2.cc \
fir.cc \
fm_core.cc \
fm_op_kernel.cc \
freqlut.cc \
@ -23,7 +24,8 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
LOCAL_ARM_NEON := true
LOCAL_CFLAGS := -DHAVE_NEON=1
LOCAL_SRC_FILES += neon_fm_kernel.s \
neon_ladder.s
neon_ladder.s \
neon_fir.s
endif
# for native audio
@ -45,7 +47,8 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
LOCAL_ARM_NEON := true
LOCAL_CFLAGS := -DHAVE_NEON=1
LOCAL_SRC_FILES += neon_fm_kernel.s \
neon_ladder.s
neon_ladder.s \
neon_fir.s
endif
LOCAL_CFLAGS += -O3
@ -56,4 +59,21 @@ LOCAL_MODULE := test_neon
include $(BUILD_EXECUTABLE)
include $(CLEAR_VARS)
LOCAL_SRC_FILES := test_fir.cc \
fir.cc
ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
LOCAL_ARM_NEON := true
LOCAL_CFLAGS := -DHAVE_NEON=1
LOCAL_SRC_FILES += neon_fir.s
endif
LOCAL_CFLAGS += -O3
LOCAL_STATIC_LIBRARIES += cpufeatures
LOCAL_MODULE := test_fir
include $(BUILD_EXECUTABLE)
$(call import-module,android/cpufeatures)

@ -0,0 +1,72 @@
/*
* Copyright 2013 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Implementation of FIR filtering (convolution)
#include <stdio.h> // for debugging, remove
#include <stdlib.h>
#include <malloc.h>
#include "fir.h"
// Should probably ifdef this to make it more portable
void *malloc_aligned(size_t alignment, size_t nbytes) {
return memalign(alignment, nbytes);
}
SimpleFirFilter::SimpleFirFilter(const float *kernel, size_t nk) : nk(nk) {
k = (float *)malloc(nk * sizeof(k[0]));
for (size_t i = 0; i < nk; i++) {
k[i] = kernel[nk - i - 1];
}
}
SimpleFirFilter::~SimpleFirFilter() {
free(k);
}
void SimpleFirFilter::process(const float *in, float *out, size_t n) {
for (size_t i = 0; i < n; i++) {
float y = 0;
for (size_t j = 0; j < nk; j++) {
y += k[j] * in[i + j];
}
out[i] = y;
}
}
NeonFirFilter::NeonFirFilter(const float *kernel, size_t nk) : nk(nk) {
// TODO: handle odd size nk (must be multiple of 4)
k = (float *)malloc_aligned(16, nk * sizeof(k[0]));
for (size_t i = 0; i < nk; i += 4) {
for (size_t j = 0; j < 4; j++) {
k[i + j] = kernel[nk - i - 4 + j];
}
}
}
NeonFirFilter::~NeonFirFilter() {
free(k);
}
#ifdef HAVE_NEON
extern "C"
void neon_fir_direct(const float *in, const float *k, float *out, size_t n, size_t nk);
#endif
void NeonFirFilter::process(const float *in, float *out, size_t n) {
neon_fir_direct(in - 1, k, out, n, nk);
}

@ -0,0 +1,48 @@
/*
* Copyright 2013 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// move to generic utility file?
void *malloc_aligned(size_t alignment, size_t nbytes);
// Abstract class
class FirFilter {
public:
// preconditions:
// in + (nk - 1) is aligned to 128 bits
// out is aligned to 128 bits
virtual void process(const float *in, float *out, size_t n) = 0;
};
class SimpleFirFilter : public FirFilter {
public:
SimpleFirFilter(const float *kernel, size_t nk);
~SimpleFirFilter();
void process(const float *in, float *out, size_t n);
private:
size_t nk;
float *k;
};
// should make conditional compilation?
class NeonFirFilter : public FirFilter {
public:
NeonFirFilter(const float *kernel, size_t nk);
~NeonFirFilter();
void process(const float *in, float *out, size_t n);
private:
size_t nk;
float *k;
};

@ -0,0 +1,97 @@
@ Copyright 2013 Google Inc.
@
@ Licensed under the Apache License, Version 2.0 (the "License");
@ you may not use this file except in compliance with the License.
@ You may obtain a copy of the License at
@
@ http://www.apache.org/licenses/LICENSE-2.0
@
@ Unless required by applicable law or agreed to in writing, software
@ distributed under the License is distributed on an "AS IS" BASIS,
@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ See the License for the specific language governing permissions and
@ limitations under the License.
@ NEON assembly implementation of FIR filter core
.text
.align 2
.global neon_fir_direct
.type neon_fir_direct, %function
neon_fir_direct:
@ r0 = pointer to input (aligned)
@ r1 = pointer to kernel (aligned)
@ r2 = pointer to output (aligned)
@ r3 = size of input in floats (multiple of 4)
@ stack[0] = size of kernel in floats (multiple of 4)
push {r4-r7}
ldr r4, [sp, #(4 * 4)]
lsl r6, r4, #2
sub r6, #16
@ compute initial overlap
mov r7, r4
vmov.i32 q9, #0
vmov.i32 q10, #0
vmov.i32 q11, #0
neon_fir_direct1:
vld1.i32 {q0}, [r0:128]! @ load 4 samples from input
vld1.i32 {q1}, [r1:128]! @ load 4 samples from kernel
vmla.f32 q9, q1, d0[1]
vmla.f32 q10, q1, d1[0]
vmla.f32 q11, q1, d1[1]
subs r7, #4
bne neon_fir_direct1
vmov.i32 q12, #0
vext.32 q13, q9, q12, #3
vext.32 q1, q10, q12, #2
vadd.f32 q13, q1
vext.32 q1, q11, q12, #1
vadd.f32 q8, q13, q1
sub r0, r6
sub r1, r4, lsl #2
neon_fir_direct2:
mov r7, r4
vmov.i32 q9, #0
vmov.i32 q10, #0
vmov.i32 q11, #0
@ inner loop
neon_fir_direct3:
vld1.32 {q0}, [r0:128]! @ load 4 samples from input
vld1.i32 {q1}, [r1:128]! @ load 4 samples from kernel
vmla.f32 q9, q1, d0[1]
vmla.f32 q8, q1, d0[0]
vmla.f32 q10, q1, d1[0]
vmla.f32 q11, q1, d1[1]
subs r7, #4
bne neon_fir_direct3
@ process overlaps
vext.32 q0, q12, q9, #3
vext.32 q13, q9, q12, #3
vadd.f32 q8, q0
vext.32 q0, q12, q10, #2
vext.32 q1, q10, q12, #2
vadd.f32 q8, q0
vadd.f32 q13, q1
vext.32 q0, q12, q11, #1
vext.32 q1, q11, q12, #1
vadd.f32 q0, q8
vadd.f32 q8, q13, q1
sub r0, r6
sub r1, r4, lsl #2
vst1.32 {q0}, [r2:128]!
subs r3, #4
bne neon_fir_direct2
pop {r4-r7}
bx lr
.size neon_fir_direct, .-neon_fir_direct

@ -0,0 +1,125 @@
/*
* Copyright 2013 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Little test app for measuring FIR speed
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>
#include "fir.h"
// clock_gettime would be a little better, but whatever
double now() {
struct timeval tp;
gettimeofday(&tp, NULL);
return tp.tv_sec + 1e-6 * tp.tv_usec;
}
void condition_governor() {
// sleep for a bit to avoid thermal throttling
static uint32_t v = 0;
struct timespec ts;
ts.tv_sec = 0;
ts.tv_nsec = 900000000 + (v & 1); // 900ms
nanosleep(&ts, NULL);
// consume cpu a bit to try to coax max cpufreq
uint32_t x = v;
for (int i = 0; i < 10000000; i++) {
x += 42;
x += (x << 10);
x ^= (x >> 6);
}
// storing it in a static guarantees not optimizing out
v = x;
}
float *mkrandom(size_t size) {
float *result = (float *)malloc_aligned(16, size * sizeof(result[0]));
for (int i = 0; i < size; i++) {
result[i] = random() * (2.0 / RAND_MAX) - 1;
}
return result;
}
double test_accuracy(FirFilter *f1, FirFilter *f2, const float *inp, int nblock) {
float *out1 = (float *)malloc_aligned(16, nblock * sizeof(out1[0]));
float *out2 = (float *)malloc_aligned(16, nblock * sizeof(out2[0]));
f1->process(inp + 1, out1, nblock);
f2->process(inp + 1, out2, nblock);
double err = 0;
for (int i = 0; i < nblock; i++) {
//printf("%d: %f %f\n", i, out1[i], out2[i]);
err += fabs(out1[i] - out2[i]);
}
free(out1);
free(out2);
return err;
}
void benchfir(int size, int experiment) {
condition_governor();
const int nblock = 64;
float *kernel = mkrandom(size);
float *inp = mkrandom(size + nblock);
float *out = (float *)malloc_aligned(16, nblock * sizeof(out[0]));
FirFilter *f;
switch(experiment) {
case 0:
f = new SimpleFirFilter(kernel, size);
break;
case 1:
f = new NeonFirFilter(kernel, size);
break;
}
double start = now();
for (int j = 0; j < 15625; j++) {
f->process(inp + 1, out, 64);
}
double elapsed = now() - start;
printf("%i %f\n", size, 1e3 * elapsed);
FirFilter *fbase = new SimpleFirFilter(kernel, size);
double accuracy = test_accuracy(fbase, f, inp, nblock);
//printf("accuracy = %g\n", accuracy);
delete f;
delete fbase;
free(kernel);
free(inp);
free(out);
}
int main(int argc, char **argv) {
printf("set style data linespoints\n"
"set xlabel 'FIR kernel size'\n"
"set ylabel 'ns per sample'\n"
"plot '-' title 'scalar', '-' title '4x4 block'\n");
for (int experiment = 0; experiment < 2; experiment++) {
for (int i = 16; i <= 128; i += 16) {
benchfir(i, experiment);
}
printf("e\n");
}
return 0;
}
Loading…
Cancel
Save