Half-rate FIR implementation

This commit is a test implementation of a half-rate FIR structure
(basically a Toom-Cook). It's not bad in the scalar case, but the
benefit is marginal at best in NEON.
master
Raph Levien 11 years ago
parent 287427513c
commit 5a20308469
  1. 114
      cpp/src/fir.cc
  2. 40
      cpp/src/fir.h
  3. 272
      cpp/src/neon_fir.s
  4. 31
      cpp/src/test_fir.cc

@ -20,6 +20,7 @@
#include <stdlib.h> #include <stdlib.h>
#include <malloc.h> #include <malloc.h>
#include "aligned_buf.h"
#include "fir.h" #include "fir.h"
// Should probably ifdef this to make it more portable // Should probably ifdef this to make it more portable
@ -48,6 +49,89 @@ void SimpleFirFilter::process(const float *in, float *out, size_t n) {
} }
} }
HalfRateFirFilter::HalfRateFirFilter(const float *kernel, size_t nk, size_t n) : nk(nk) {
float k0[kMaxNk / 2];
float k1[kMaxNk / 2];
size_t n2 = n >> 1;
size_t nk2 = nk >> 1;
// probably better to do fewer allocations and just set up pointers...
y0 = (float *)malloc_aligned(16, n2 * sizeof(y0[0]));
y1 = (float *)malloc_aligned(16, n2 * sizeof(y1[0]));
y2 = (float *)malloc_aligned(16, n2 * sizeof(y2[0]));
i0 = (float *)malloc_aligned(16, (n2 + nk2) * sizeof(i0[0]));
i1 = (float *)malloc_aligned(16, (n2 + nk2) * sizeof(i1[0]));
i2 = (float *)malloc_aligned(16, (n2 + nk2) * sizeof(i2[0]));
k2 = (float *)malloc_aligned(16, nk2 * sizeof(k2[0]));
for (size_t i = 0; i < nk2; i++) {
float b0 = kernel[i * 2];
float b2 = kernel[i * 2 + 1];
k0[i] = b0;
k1[i] = b0 + b2;
k2[i] = b2;
}
f0 = new SimpleFirFilter(k0, nk2);
f1 = new SimpleFirFilter(k1, nk2);
f2 = new SimpleFirFilter(k2, nk2);
}
HalfRateFirFilter::~HalfRateFirFilter() {
free(k2);
delete i0;
delete i1;
delete i2;
delete y0;
delete y1;
delete y2;
delete f0;
delete f1;
delete f2;
}
extern "C"
void neon_halfrate_split(const float *in, float *buf0, float *buf1, float *buf2, size_t n);
extern "C"
void neon_halfrate_combine(const float *out, float *buf0, float *buf1, float *buf2, size_t n);
void HalfRateFirFilter::process(const float *in, float *out, size_t n) {
size_t n2 = n >> 1;
size_t nk2 = nk >> 1;
size_t n2in = n2 + nk2 - 1;
#ifdef HAVE_NEON
neon_halfrate_split(in - 1, i0, i1, i2, n2in + 1);
#else
i2[0] = in[0];
for (size_t i = 0; i < n2in; i++) {
float a0 = in[i * 2 + 1];
float a2 = in[i * 2 + 2];
i0[1 + i] = a0;
i1[1 + i] = a0 + a2;
i2[1 + i] = a2;
}
#endif
f0->process(i0 + 1, y0, n2);
f1->process(i1 + 1, y1, n2);
f2->process(i2 + 1, y2, n2);
#ifdef HAVE_NEON
neon_halfrate_combine(out, y0, y1, y2, n2);
#else
float z2m2 = 0;
for (size_t i = 0; i < nk2; i++) {
z2m2 += k2[nk2 - 1 - i] * i2[i];
}
for (size_t i = 0; i < n2; i++) {
float m0 = y0[i];
float m1 = y1[i];
float m2 = y2[i];
out[i * 2] = m0 + z2m2;
out[i * 2 + 1] = m1 - m0 - m2;
//out[i*2] = i1.get()[i];
z2m2 = m2;
}
#endif
}
#ifdef HAVE_NEON
NeonFirFilter::NeonFirFilter(const float *kernel, size_t nk) : nk(nk) { NeonFirFilter::NeonFirFilter(const float *kernel, size_t nk) : nk(nk) {
// TODO: handle odd size nk (must be multiple of 4) // TODO: handle odd size nk (must be multiple of 4)
k = (float *)malloc_aligned(16, nk * sizeof(k[0])); k = (float *)malloc_aligned(16, nk * sizeof(k[0]));
@ -62,11 +146,37 @@ NeonFirFilter::~NeonFirFilter() {
free(k); free(k);
} }
#ifdef HAVE_NEON
extern "C" extern "C"
void neon_fir_direct(const float *in, const float *k, float *out, size_t n, size_t nk); void neon_fir_direct(const float *in, const float *k, float *out, size_t n, size_t nk);
#endif
void NeonFirFilter::process(const float *in, float *out, size_t n) { void NeonFirFilter::process(const float *in, float *out, size_t n) {
neon_fir_direct(in - 1, k, out, n, nk); neon_fir_direct(in - 1, k, out, n, nk);
} }
Neon16FirFilter::Neon16FirFilter(const float *kernel, size_t nk, bool mirror)
: nk(nk), mirror(mirror) {
// TODO: handle odd size nk (must be multiple of 4)
k = (int16_t *)malloc_aligned(16, nk * sizeof(k[0]));
for (size_t i = 0; i < nk; i++) {
k[i] = 32768 * kernel[nk - i - 1];
}
}
Neon16FirFilter::~Neon16FirFilter() {
free(k);
}
extern "C"
void neon_fir_fixed16(const float *in, const int16_t *k, float *out, size_t n, size_t nk);
extern "C"
void neon_fir_fixed16m(const float *in, const int16_t *k, float *out, size_t n, size_t nk);
void Neon16FirFilter::process(const float *in, float *out, size_t n) {
if (mirror)
neon_fir_fixed16m(in - 1, k, out, n, nk);
else
neon_fir_fixed16(in - 1, k, out, n, nk);
}
#endif

@ -18,15 +18,15 @@
void *malloc_aligned(size_t alignment, size_t nbytes); void *malloc_aligned(size_t alignment, size_t nbytes);
// Abstract class // Abstract class
class FirFilter { template <typename T, typename U> class FirFilter {
public: public:
// preconditions: // preconditions:
// in + (nk - 1) is aligned to 128 bits // in + (nk - 1) is aligned to 128 bits
// out is aligned to 128 bits // out is aligned to 128 bits
virtual void process(const float *in, float *out, size_t n) = 0; virtual void process(const T *in, U *out, size_t n) = 0;
}; };
class SimpleFirFilter : public FirFilter { class SimpleFirFilter : public FirFilter<float, float> {
public: public:
SimpleFirFilter(const float *kernel, size_t nk); SimpleFirFilter(const float *kernel, size_t nk);
~SimpleFirFilter(); ~SimpleFirFilter();
@ -36,8 +36,25 @@ class SimpleFirFilter : public FirFilter {
float *k; float *k;
}; };
// should make conditional compilation? class HalfRateFirFilter : public FirFilter<float, float> {
class NeonFirFilter : public FirFilter { public:
HalfRateFirFilter(const float *kernel, size_t nk, size_t n);
~HalfRateFirFilter();
void process(const float *in, float *out, size_t n);
private:
static const int kMaxNk = 256;
size_t nk;
float *i0, *i1, *i2;
float *y0, *y1, *y2;
float *k2;
FirFilter *f0;
FirFilter *f1;
FirFilter *f2;
};
#ifdef HAVE_NEON
class NeonFirFilter : public FirFilter<float, float> {
public: public:
NeonFirFilter(const float *kernel, size_t nk); NeonFirFilter(const float *kernel, size_t nk);
~NeonFirFilter(); ~NeonFirFilter();
@ -46,3 +63,16 @@ class NeonFirFilter : public FirFilter {
size_t nk; size_t nk;
float *k; float *k;
}; };
class Neon16FirFilter : public FirFilter<float, float> {
public:
Neon16FirFilter(const float *kernel, size_t nk, bool mirror);
~Neon16FirFilter();
void process(const float *in, float *out, size_t n);
private:
size_t nk;
int16_t *k;
bool mirror;
};
#endif // HAVE_NEON

@ -94,4 +94,274 @@ neon_fir_direct3:
pop {r4-r7} pop {r4-r7}
bx lr bx lr
.size neon_fir_direct, .-neon_fir_direct @ a sketch to see if this is worthwhile; looks like it isn't
.if 0
.size neon_fir_short, .-neon_fir_short
.align 2
.global neon_fir_short
.type neon_fir_short, %function
neon_fir_short:
@ r0 = pointer to input (aligned)
@ r1 = pointer to kernel (aligned)
@ r2 = pointer to output (aligned)
@ r3 = size of input in floats (multiple of 4)
@ stack[0] = size of kernel in floats (multiple of 4)
push {r4-r7}
vpush {q4-q7}
ldr r4, [sp, #(4 * 4 + 4 * 16)]
sub sp, #(4 * 16) @ room for z2's
lsl r6, r4, #2
sub r6, #16
@ compute initial overlap (todo redo)
mov r7, r4
vmov.i32 q9, #0
vmov.i32 q10, #0
vmov.i32 q11, #0
neon_fir_short1:
vld1.i32 {q0}, [r0:128]! @ load 4 samples from input
vld1.i32 {q1}, [r1:128]! @ load 4 samples from kernel
vmla.f32 q9, q1, d0[1]
vmla.f32 q10, q1, d1[0]
vmla.f32 q11, q1, d1[1]
subs r7, #4
bne neon_fir_short1
vmov.i32 q12, #0
vext.32 q13, q9, q12, #3
vext.32 q1, q10, q12, #2
vadd.f32 q13, q1
vext.32 q1, q11, q12, #1
vadd.f32 q8, q13, q1
sub r0, r6
sub r1, r4, lsl #2
neon_fir_short2:
mov r7, r4
vmov.i32 q4, #0
vmov.i32 q5, #0
vmov.i32 q6, #0
vmov.i32 q7, #0
vmov.i32 q8, #0
vmov.i32 q9, #0
vmov.i32 q10, #0
vmov.i32 q11, #0
vmov.i32 q12, #0
vmov.i32 q13, #0
vmov.i32 q14, #0
vmov.i32 q15, #0
@ inner loop
neon_fir_short3:
vld1.32 {q0}, [r0:128]! @ load 4 samples from input (a0)
vld1.i32 {q1}, [r1:128]! @ load 4 samples from kernel (b0)
vmla.f32 q4, q1, d0[0]
vmla.f32 q5, q1, d0[1]
vmla.f32 q6, q1, d1[0]
vmla.f32 q7, q1, d1[1]
vld1.32 {q2}, [r0:128]! @ load 4 samples from input (a2)
vadd.f32 q0, q2 @ a1
vld1.i32 {q3}, [r1:128]! @ load 4 samples from kernel (b2)
vadd.f32 q1, q3 @ b1
vmla.f32 q12, q3, d4[0]
vmla.f32 q13, q3, d4[1]
vmla.f32 q14, q3, d5[0]
vmla.f32 q15, q3, d5[1]
vmla.f32 q8, q1, d0[0]
vmla.f32 q9, q1, d0[1]
vmla.f32 q10, q1, d1[0]
vmla.f32 q11, q1, d1[1]
subs r7, #8
bne neon_fir_short3
@ now for the fun part
mov r5, sp
vld1.32 {q0, q1}, [r5]!
vld1.32 {q2, q3}, [r5]!
vadd.f32 q0, q4
vadd.f32 q1, q5
vadd.f32 q2, q6
vadd.f32 q3, q7
vsub.f32 q8, q4
vsub.f32 q9, q5
vsub.f32 q10, q6
vsub.f32 q1, q7
vsub.f32 q8, q12
vsub.f32 q9, q13
vsub.f32 q10, q14
vsub.f32 q11, q15
mov r5, sp
vst1.32 {q12, q13}, [r5]!
vst1.32 {q14, q15}, [r5]!
@ process overlaps
vext.32 q0, q12, q9, #3
vext.32 q13, q9, q12, #3
vadd.f32 q8, q0
vext.32 q0, q12, q10, #2
vext.32 q1, q10, q12, #2
vadd.f32 q8, q0
vadd.f32 q13, q1
vext.32 q0, q12, q11, #1
vext.32 q1, q11, q12, #1
vadd.f32 q0, q8
vadd.f32 q8, q13, q1
sub r0, r6
sub r1, r4, lsl #2
vst1.32 {q0}, [r2:128]!
vst1.32 {q1}, [r2:128]!
subs r3, #8
bne neon_fir_short2
add sp, #64
vpop {q4-q7}
pop {r4-r7}
bx lr
.size neon_fir_short, .-neon_fir_short
.endif
.align 2
.global neon_fir_fixed16
.type neon_fir_fixed16, %function
@ fixed 16 bit dot product
@ based on Andy Hung code, but not doing mirror trick
neon_fir_fixed16:
push {r4-r7}
ldr r4, [sp, #(4 * 4)]
lsl r6, r4, #1
sub r6, #2
neon_fir_fixed16_1:
vmov.i16 q0, #0
mov r7, r4
neon_fir_fixed16_2:
vld1.16 {q2, q3}, [r0]! @ load 16 samples from input
vld1.16 {q8, q9}, [r1:128]! @ load 16 samples from kernel
vmlal.s16 q0, d4, d16
vmlal.s16 q0, d5, d17
vmlal.s16 q0, d6, d18
vmlal.s16 q0, d7, d19
subs r7, #16
bne neon_fir_fixed16_2
vpadd.s32 d0, d0, d1
vpadd.s32 d0, d0, d0
sub r0, r6
sub r1, r4, lsl #1
vst1.32 {d0[0]}, [r2]!
subs r3, #1
bne neon_fir_fixed16_1
pop {r4-r7}
bx lr
.size neon_fir_fixed16, .-neon_fir_fixed16
.align 2
.global neon_fir_fixed16m
.type neon_fir_fixed16m, %function
@ fixed 16 bit dot product
@ based on Andy Hung code, modeling mirror trick
neon_fir_fixed16m:
push {r4-r7}
ldr r4, [sp, #(4 * 4)]
lsl r6, r4, #1
sub r6, #2
neon_fir_fixed16m_1:
vmov.i16 q0, #0
mov r7, r4
add r5, r0, r6
neon_fir_fixed16m_2:
vld1.16 {q2}, [r5] @ load 8 samples from input
vld1.16 {q3}, [r0]! @ load 8 samples from input
vld1.16 {q8}, [r1:128]! @ load 8 samples from kernel
vld1.16 {q9}, [r1:128]! @ load 8 samples from kernel
vmlal.s16 q0, d4, d16
vmlal.s16 q0, d5, d17
vmlal.s16 q0, d6, d18
vmlal.s16 q0, d7, d19
subs r7, #16
sub r5, #16
bne neon_fir_fixed16m_2
vpadd.s32 d0, d0, d1
vpadd.s32 d0, d0, d0
sub r0, r6
sub r1, r4, lsl #1
vst1.32 {d0[0]}, [r2]!
subs r3, #1
bne neon_fir_fixed16m_1
pop {r4-r7}
bx lr
.size neon_fir_fixed16m, .-neon_fir_fixed16m
.align 2
.global neon_halfrate_split
neon_halfrate_split:
@ r0 = pointer to input
@ r1 = pointer to buf0
@ r2 = pointer to buf1
@ r3 = pointer to buf2
@ stack[0] = number of output buffers to fill
push {r4}
ldr r4, [sp, #4]
neon_halfrate_split1:
vld2.32 {q0, q1}, [r0:128]!
vadd.f32 q2, q0, q1
vst1.32 {q0}, [r1:128]!
vst1.32 {q1}, [r3:128]!
vst1.32 {q2}, [r2:128]!
subs r4, #4
bne neon_halfrate_split1
pop {r4}
bx lr
.size neon_halfrate_split, .-.neon_halfrate_split
.align 2
.global neon_halfrate_combine
neon_halfrate_combine:
@ r0 = pointer to output
@ r1 = pointer to buf0
@ r2 = pointer to buf1
@ r3 = pointer to buf2
@ stack[0] = number of input buffers to combine
@ todo: deal with z2m2
push {r4}
ldr r4, [sp, #4]
vmov.i32 q3, #0
neon_halfrate_combine1:
vld1.32 {q2}, [r3:128]!
vld1.32 {q0}, [r1:128]!
vext.32 q8, q3, q2, #3
vld1.32 {q1}, [r2:128]!
vsub.f32 q1, q0
vadd.f32 q0, q8
vsub.f32 q1, q2
vst2.32 {q0, q1}, [r0:128]!
vmov q3, q2
subs r4, #4
bne neon_halfrate_combine1
pop {r4}
bx lr
.size neon_halfrate_combine, .-.neon_halfrate_combine

@ -37,7 +37,7 @@ void condition_governor() {
struct timespec ts; struct timespec ts;
ts.tv_sec = 0; ts.tv_sec = 0;
ts.tv_nsec = 900000000 + (v & 1); // 900ms ts.tv_nsec = 900000000 + (v & 1); // 900ms
nanosleep(&ts, NULL); //nanosleep(&ts, NULL);
// consume cpu a bit to try to coax max cpufreq // consume cpu a bit to try to coax max cpufreq
uint32_t x = v; uint32_t x = v;
@ -58,14 +58,14 @@ float *mkrandom(size_t size) {
return result; return result;
} }
double test_accuracy(FirFilter *f1, FirFilter *f2, const float *inp, int nblock) { double test_accuracy(FirFilter<float, float> *f1, FirFilter<float, float> *f2, const float *inp, int nblock) {
float *out1 = (float *)malloc_aligned(16, nblock * sizeof(out1[0])); float *out1 = (float *)malloc_aligned(16, nblock * sizeof(out1[0]));
float *out2 = (float *)malloc_aligned(16, nblock * sizeof(out2[0])); float *out2 = (float *)malloc_aligned(16, nblock * sizeof(out2[0]));
f1->process(inp + 1, out1, nblock); f1->process(inp + 1, out1, nblock);
f2->process(inp + 1, out2, nblock); f2->process(inp + 1, out2, nblock);
double err = 0; double err = 0;
for (int i = 0; i < nblock; i++) { for (int i = 0; i < nblock; i++) {
//printf("%d: %f %f\n", i, out1[i], out2[i]); printf("#%d: %f %f\n", i, out1[i], out2[i]);
err += fabs(out1[i] - out2[i]); err += fabs(out1[i] - out2[i]);
} }
free(out1); free(out1);
@ -80,28 +80,39 @@ void benchfir(int size, int experiment) {
float *kernel = mkrandom(size); float *kernel = mkrandom(size);
float *inp = mkrandom(size + nblock); float *inp = mkrandom(size + nblock);
float *out = (float *)malloc_aligned(16, nblock * sizeof(out[0])); float *out = (float *)malloc_aligned(16, nblock * sizeof(out[0]));
FirFilter *f; FirFilter<float, float> *f;
switch(experiment) { switch(experiment) {
case 0: case 0:
f = new SimpleFirFilter(kernel, size); f = new SimpleFirFilter(kernel, size);
break; break;
#ifdef HAVE_NEON
// this will crash on non-NEON devices, but we're only interested
// in testing NEON for now
case 1: case 1:
f = new NeonFirFilter(kernel, size); f = new NeonFirFilter(kernel, size);
break; break;
case 2:
case 3:
f = new Neon16FirFilter(kernel, size, experiment == 3);
break;
#endif
case 4:
f = new HalfRateFirFilter(kernel, size, nblock);
break;
} }
double start = now(); double start = now();
for (int j = 0; j < 15625; j++) { for (int j = 0; j < 15625; j++) {
f->process(inp + 1, out, 64); f->process(inp + 1, out, nblock);
} }
double elapsed = now() - start; double elapsed = now() - start;
printf("%i %f\n", size, 1e3 * elapsed); printf("%i %f\n", size, 1e3 * elapsed);
FirFilter *fbase = new SimpleFirFilter(kernel, size); FirFilter<float, float> *fbase = new SimpleFirFilter(kernel, size);
double accuracy = test_accuracy(fbase, f, inp, nblock); double accuracy = test_accuracy(fbase, f, inp, nblock);
//printf("accuracy = %g\n", accuracy); printf("#accuracy = %g\n", accuracy);
delete f; delete f;
delete fbase; delete fbase;
@ -114,9 +125,9 @@ int main(int argc, char **argv) {
printf("set style data linespoints\n" printf("set style data linespoints\n"
"set xlabel 'FIR kernel size'\n" "set xlabel 'FIR kernel size'\n"
"set ylabel 'ns per sample'\n" "set ylabel 'ns per sample'\n"
"plot '-' title 'scalar', '-' title '4x4 block'\n"); "plot '-' title 'scalar', '-' title '4x4 block', '-' title 'fixed16', '-' title 'fixed16 mirror', '-' title 'half rate'\n");
for (int experiment = 0; experiment < 2; experiment++) { for (int experiment = 0; experiment < 5; experiment++) {
for (int i = 16; i <= 128; i += 16) { for (int i = 16; i <= 256; i += 16) {
benchfir(i, experiment); benchfir(i, experiment);
} }
printf("e\n"); printf("e\n");

Loading…
Cancel
Save