This is a FIR filter with NEON speedup. The NEON version has been tested for accuracy against the scalar one, and there is simple benchmarking code in place as well.master
parent
2d433d1056
commit
287427513c
@ -0,0 +1,72 @@ |
||||
/*
|
||||
* Copyright 2013 Google Inc. |
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); |
||||
* you may not use this file except in compliance with the License. |
||||
* You may obtain a copy of the License at |
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
// Implementation of FIR filtering (convolution)
|
||||
|
||||
#include <stdio.h> // for debugging, remove |
||||
#include <stdlib.h> |
||||
#include <malloc.h> |
||||
|
||||
#include "fir.h" |
||||
|
||||
// Should probably ifdef this to make it more portable
|
||||
void *malloc_aligned(size_t alignment, size_t nbytes) { |
||||
return memalign(alignment, nbytes); |
||||
} |
||||
|
||||
SimpleFirFilter::SimpleFirFilter(const float *kernel, size_t nk) : nk(nk) { |
||||
k = (float *)malloc(nk * sizeof(k[0])); |
||||
for (size_t i = 0; i < nk; i++) { |
||||
k[i] = kernel[nk - i - 1]; |
||||
} |
||||
} |
||||
|
||||
SimpleFirFilter::~SimpleFirFilter() { |
||||
free(k); |
||||
} |
||||
|
||||
void SimpleFirFilter::process(const float *in, float *out, size_t n) { |
||||
for (size_t i = 0; i < n; i++) { |
||||
float y = 0; |
||||
for (size_t j = 0; j < nk; j++) { |
||||
y += k[j] * in[i + j]; |
||||
} |
||||
out[i] = y; |
||||
} |
||||
} |
||||
|
||||
NeonFirFilter::NeonFirFilter(const float *kernel, size_t nk) : nk(nk) { |
||||
// TODO: handle odd size nk (must be multiple of 4)
|
||||
k = (float *)malloc_aligned(16, nk * sizeof(k[0])); |
||||
for (size_t i = 0; i < nk; i += 4) { |
||||
for (size_t j = 0; j < 4; j++) { |
||||
k[i + j] = kernel[nk - i - 4 + j]; |
||||
} |
||||
} |
||||
} |
||||
|
||||
NeonFirFilter::~NeonFirFilter() { |
||||
free(k); |
||||
} |
||||
|
||||
#ifdef HAVE_NEON |
||||
extern "C" |
||||
void neon_fir_direct(const float *in, const float *k, float *out, size_t n, size_t nk); |
||||
#endif |
||||
|
||||
void NeonFirFilter::process(const float *in, float *out, size_t n) { |
||||
neon_fir_direct(in - 1, k, out, n, nk); |
||||
} |
@ -0,0 +1,48 @@ |
||||
/*
|
||||
* Copyright 2013 Google Inc. |
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); |
||||
* you may not use this file except in compliance with the License. |
||||
* You may obtain a copy of the License at |
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
// move to generic utility file?
|
||||
void *malloc_aligned(size_t alignment, size_t nbytes); |
||||
|
||||
// Abstract class
|
||||
class FirFilter { |
||||
public: |
||||
// preconditions:
|
||||
// in + (nk - 1) is aligned to 128 bits
|
||||
// out is aligned to 128 bits
|
||||
virtual void process(const float *in, float *out, size_t n) = 0; |
||||
}; |
||||
|
||||
class SimpleFirFilter : public FirFilter { |
||||
public: |
||||
SimpleFirFilter(const float *kernel, size_t nk); |
||||
~SimpleFirFilter(); |
||||
void process(const float *in, float *out, size_t n); |
||||
private: |
||||
size_t nk; |
||||
float *k; |
||||
}; |
||||
|
||||
// should make conditional compilation?
|
||||
class NeonFirFilter : public FirFilter { |
||||
public: |
||||
NeonFirFilter(const float *kernel, size_t nk); |
||||
~NeonFirFilter(); |
||||
void process(const float *in, float *out, size_t n); |
||||
private: |
||||
size_t nk; |
||||
float *k; |
||||
}; |
@ -0,0 +1,97 @@ |
||||
@ Copyright 2013 Google Inc.
|
||||
@
|
||||
@ Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ you may not use this file except in compliance with the License.
|
||||
@ You may obtain a copy of the License at
|
||||
@
|
||||
@ http://www.apache.org/licenses/LICENSE-2.0
|
||||
@
|
||||
@ Unless required by applicable law or agreed to in writing, software
|
||||
@ distributed under the License is distributed on an "AS IS" BASIS,
|
||||
@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
@ See the License for the specific language governing permissions and
|
||||
@ limitations under the License.
|
||||
|
||||
@ NEON assembly implementation of FIR filter core
|
||||
|
||||
.text |
||||
|
||||
.align 2
|
||||
.global neon_fir_direct
|
||||
.type neon_fir_direct, %function |
||||
neon_fir_direct: |
||||
@ r0 = pointer to input (aligned)
|
||||
@ r1 = pointer to kernel (aligned)
|
||||
@ r2 = pointer to output (aligned)
|
||||
@ r3 = size of input in floats (multiple of 4)
|
||||
@ stack[0] = size of kernel in floats (multiple of 4)
|
||||
|
||||
push {r4-r7} |
||||
ldr r4, [sp, #(4 * 4)] |
||||
|
||||
lsl r6, r4, #2 |
||||
sub r6, #16 |
||||
|
||||
@ compute initial overlap
|
||||
mov r7, r4 |
||||
vmov.i32 q9, #0 |
||||
vmov.i32 q10, #0 |
||||
vmov.i32 q11, #0 |
||||
neon_fir_direct1: |
||||
vld1.i32 {q0}, [r0:128]! @ load 4 samples from input
|
||||
vld1.i32 {q1}, [r1:128]! @ load 4 samples from kernel
|
||||
vmla.f32 q9, q1, d0[1] |
||||
vmla.f32 q10, q1, d1[0] |
||||
vmla.f32 q11, q1, d1[1] |
||||
subs r7, #4 |
||||
bne neon_fir_direct1 |
||||
vmov.i32 q12, #0 |
||||
vext.32 q13, q9, q12, #3 |
||||
vext.32 q1, q10, q12, #2 |
||||
vadd.f32 q13, q1 |
||||
vext.32 q1, q11, q12, #1 |
||||
vadd.f32 q8, q13, q1 |
||||
sub r0, r6 |
||||
sub r1, r4, lsl #2 |
||||
|
||||
neon_fir_direct2: |
||||
|
||||
mov r7, r4 |
||||
vmov.i32 q9, #0 |
||||
vmov.i32 q10, #0 |
||||
vmov.i32 q11, #0 |
||||
@ inner loop
|
||||
neon_fir_direct3: |
||||
vld1.32 {q0}, [r0:128]! @ load 4 samples from input
|
||||
vld1.i32 {q1}, [r1:128]! @ load 4 samples from kernel
|
||||
vmla.f32 q9, q1, d0[1] |
||||
vmla.f32 q8, q1, d0[0] |
||||
vmla.f32 q10, q1, d1[0] |
||||
vmla.f32 q11, q1, d1[1] |
||||
subs r7, #4 |
||||
bne neon_fir_direct3 |
||||
|
||||
@ process overlaps
|
||||
vext.32 q0, q12, q9, #3 |
||||
vext.32 q13, q9, q12, #3 |
||||
vadd.f32 q8, q0 |
||||
vext.32 q0, q12, q10, #2 |
||||
vext.32 q1, q10, q12, #2 |
||||
vadd.f32 q8, q0 |
||||
vadd.f32 q13, q1 |
||||
vext.32 q0, q12, q11, #1 |
||||
vext.32 q1, q11, q12, #1 |
||||
vadd.f32 q0, q8 |
||||
vadd.f32 q8, q13, q1 |
||||
|
||||
sub r0, r6 |
||||
sub r1, r4, lsl #2 |
||||
vst1.32 {q0}, [r2:128]! |
||||
|
||||
subs r3, #4 |
||||
bne neon_fir_direct2 |
||||
|
||||
pop {r4-r7} |
||||
bx lr |
||||
|
||||
.size neon_fir_direct, .-neon_fir_direct |
@ -0,0 +1,125 @@ |
||||
/*
|
||||
* Copyright 2013 Google Inc. |
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); |
||||
* you may not use this file except in compliance with the License. |
||||
* You may obtain a copy of the License at |
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
// Little test app for measuring FIR speed
|
||||
|
||||
#include <stdlib.h> |
||||
#include <stdio.h> |
||||
#include <time.h> |
||||
#include <sys/time.h> |
||||
#include <math.h> |
||||
|
||||
#include "fir.h" |
||||
|
||||
// clock_gettime would be a little better, but whatever
|
||||
double now() { |
||||
struct timeval tp; |
||||
gettimeofday(&tp, NULL); |
||||
return tp.tv_sec + 1e-6 * tp.tv_usec; |
||||
} |
||||
|
||||
void condition_governor() { |
||||
// sleep for a bit to avoid thermal throttling
|
||||
static uint32_t v = 0; |
||||
struct timespec ts; |
||||
ts.tv_sec = 0; |
||||
ts.tv_nsec = 900000000 + (v & 1); // 900ms
|
||||
nanosleep(&ts, NULL); |
||||
|
||||
// consume cpu a bit to try to coax max cpufreq
|
||||
uint32_t x = v; |
||||
for (int i = 0; i < 10000000; i++) { |
||||
x += 42; |
||||
x += (x << 10); |
||||
x ^= (x >> 6); |
||||
} |
||||
// storing it in a static guarantees not optimizing out
|
||||
v = x; |
||||
} |
||||
|
||||
float *mkrandom(size_t size) { |
||||
float *result = (float *)malloc_aligned(16, size * sizeof(result[0])); |
||||
for (int i = 0; i < size; i++) { |
||||
result[i] = random() * (2.0 / RAND_MAX) - 1; |
||||
} |
||||
return result; |
||||
} |
||||
|
||||
double test_accuracy(FirFilter *f1, FirFilter *f2, const float *inp, int nblock) { |
||||
float *out1 = (float *)malloc_aligned(16, nblock * sizeof(out1[0])); |
||||
float *out2 = (float *)malloc_aligned(16, nblock * sizeof(out2[0])); |
||||
f1->process(inp + 1, out1, nblock); |
||||
f2->process(inp + 1, out2, nblock); |
||||
double err = 0; |
||||
for (int i = 0; i < nblock; i++) { |
||||
//printf("%d: %f %f\n", i, out1[i], out2[i]);
|
||||
err += fabs(out1[i] - out2[i]); |
||||
} |
||||
free(out1); |
||||
free(out2); |
||||
return err; |
||||
} |
||||
|
||||
void benchfir(int size, int experiment) { |
||||
condition_governor(); |
||||
|
||||
const int nblock = 64; |
||||
float *kernel = mkrandom(size); |
||||
float *inp = mkrandom(size + nblock); |
||||
float *out = (float *)malloc_aligned(16, nblock * sizeof(out[0])); |
||||
FirFilter *f; |
||||
|
||||
switch(experiment) { |
||||
case 0: |
||||
f = new SimpleFirFilter(kernel, size); |
||||
break; |
||||
case 1: |
||||
f = new NeonFirFilter(kernel, size); |
||||
break; |
||||
} |
||||
|
||||
|
||||
double start = now(); |
||||
for (int j = 0; j < 15625; j++) { |
||||
f->process(inp + 1, out, 64); |
||||
} |
||||
double elapsed = now() - start; |
||||
printf("%i %f\n", size, 1e3 * elapsed); |
||||
|
||||
FirFilter *fbase = new SimpleFirFilter(kernel, size); |
||||
double accuracy = test_accuracy(fbase, f, inp, nblock); |
||||
//printf("accuracy = %g\n", accuracy);
|
||||
|
||||
delete f; |
||||
delete fbase; |
||||
free(kernel); |
||||
free(inp); |
||||
free(out); |
||||
} |
||||
|
||||
int main(int argc, char **argv) { |
||||
printf("set style data linespoints\n" |
||||
"set xlabel 'FIR kernel size'\n" |
||||
"set ylabel 'ns per sample'\n" |
||||
"plot '-' title 'scalar', '-' title '4x4 block'\n"); |
||||
for (int experiment = 0; experiment < 2; experiment++) { |
||||
for (int i = 16; i <= 128; i += 16) { |
||||
benchfir(i, experiment); |
||||
} |
||||
printf("e\n"); |
||||
} |
||||
return 0; |
||||
} |
Loading…
Reference in new issue