From 5a203084696ba03e3ef8ea13031b335d17922f7d Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Mon, 16 Dec 2013 12:54:29 -0800 Subject: [PATCH] Half-rate FIR implementation This commit is a test implementation of a half-rate FIR structure (basically a Toom-Cook). It's not bad in the scalar case, but the benefit is marginal at best in NEON. --- cpp/src/fir.cc | 114 ++++++++++++++++++- cpp/src/fir.h | 40 ++++++- cpp/src/neon_fir.s | 272 +++++++++++++++++++++++++++++++++++++++++++- cpp/src/test_fir.cc | 31 +++-- 4 files changed, 439 insertions(+), 18 deletions(-) diff --git a/cpp/src/fir.cc b/cpp/src/fir.cc index 861e5f5..a03da68 100644 --- a/cpp/src/fir.cc +++ b/cpp/src/fir.cc @@ -20,6 +20,7 @@ #include #include +#include "aligned_buf.h" #include "fir.h" // Should probably ifdef this to make it more portable @@ -48,6 +49,89 @@ void SimpleFirFilter::process(const float *in, float *out, size_t n) { } } +HalfRateFirFilter::HalfRateFirFilter(const float *kernel, size_t nk, size_t n) : nk(nk) { + float k0[kMaxNk / 2]; + float k1[kMaxNk / 2]; + size_t n2 = n >> 1; + size_t nk2 = nk >> 1; + // probably better to do fewer allocations and just set up pointers... + y0 = (float *)malloc_aligned(16, n2 * sizeof(y0[0])); + y1 = (float *)malloc_aligned(16, n2 * sizeof(y1[0])); + y2 = (float *)malloc_aligned(16, n2 * sizeof(y2[0])); + i0 = (float *)malloc_aligned(16, (n2 + nk2) * sizeof(i0[0])); + i1 = (float *)malloc_aligned(16, (n2 + nk2) * sizeof(i1[0])); + i2 = (float *)malloc_aligned(16, (n2 + nk2) * sizeof(i2[0])); + k2 = (float *)malloc_aligned(16, nk2 * sizeof(k2[0])); + for (size_t i = 0; i < nk2; i++) { + float b0 = kernel[i * 2]; + float b2 = kernel[i * 2 + 1]; + k0[i] = b0; + k1[i] = b0 + b2; + k2[i] = b2; + } + f0 = new SimpleFirFilter(k0, nk2); + f1 = new SimpleFirFilter(k1, nk2); + f2 = new SimpleFirFilter(k2, nk2); +} + +HalfRateFirFilter::~HalfRateFirFilter() { + free(k2); + delete i0; + delete i1; + delete i2; + delete y0; + delete y1; + delete y2; + delete f0; + delete f1; + delete f2; +} + +extern "C" +void neon_halfrate_split(const float *in, float *buf0, float *buf1, float *buf2, size_t n); + +extern "C" +void neon_halfrate_combine(const float *out, float *buf0, float *buf1, float *buf2, size_t n); + +void HalfRateFirFilter::process(const float *in, float *out, size_t n) { + size_t n2 = n >> 1; + size_t nk2 = nk >> 1; + size_t n2in = n2 + nk2 - 1; +#ifdef HAVE_NEON + neon_halfrate_split(in - 1, i0, i1, i2, n2in + 1); +#else + i2[0] = in[0]; + for (size_t i = 0; i < n2in; i++) { + float a0 = in[i * 2 + 1]; + float a2 = in[i * 2 + 2]; + i0[1 + i] = a0; + i1[1 + i] = a0 + a2; + i2[1 + i] = a2; + } +#endif + f0->process(i0 + 1, y0, n2); + f1->process(i1 + 1, y1, n2); + f2->process(i2 + 1, y2, n2); +#ifdef HAVE_NEON + neon_halfrate_combine(out, y0, y1, y2, n2); +#else + float z2m2 = 0; + for (size_t i = 0; i < nk2; i++) { + z2m2 += k2[nk2 - 1 - i] * i2[i]; + } + for (size_t i = 0; i < n2; i++) { + float m0 = y0[i]; + float m1 = y1[i]; + float m2 = y2[i]; + out[i * 2] = m0 + z2m2; + out[i * 2 + 1] = m1 - m0 - m2; + //out[i*2] = i1.get()[i]; + z2m2 = m2; + } +#endif +} + +#ifdef HAVE_NEON NeonFirFilter::NeonFirFilter(const float *kernel, size_t nk) : nk(nk) { // TODO: handle odd size nk (must be multiple of 4) k = (float *)malloc_aligned(16, nk * sizeof(k[0])); @@ -62,11 +146,37 @@ NeonFirFilter::~NeonFirFilter() { free(k); } -#ifdef HAVE_NEON extern "C" void neon_fir_direct(const float *in, const float *k, float *out, size_t n, size_t nk); -#endif void NeonFirFilter::process(const float *in, float *out, size_t n) { neon_fir_direct(in - 1, k, out, n, nk); } + +Neon16FirFilter::Neon16FirFilter(const float *kernel, size_t nk, bool mirror) + : nk(nk), mirror(mirror) { + // TODO: handle odd size nk (must be multiple of 4) + k = (int16_t *)malloc_aligned(16, nk * sizeof(k[0])); + for (size_t i = 0; i < nk; i++) { + k[i] = 32768 * kernel[nk - i - 1]; + } +} + +Neon16FirFilter::~Neon16FirFilter() { + free(k); +} + +extern "C" +void neon_fir_fixed16(const float *in, const int16_t *k, float *out, size_t n, size_t nk); + +extern "C" +void neon_fir_fixed16m(const float *in, const int16_t *k, float *out, size_t n, size_t nk); + +void Neon16FirFilter::process(const float *in, float *out, size_t n) { + if (mirror) + neon_fir_fixed16m(in - 1, k, out, n, nk); + else + neon_fir_fixed16(in - 1, k, out, n, nk); +} + +#endif diff --git a/cpp/src/fir.h b/cpp/src/fir.h index 2dc3d7e..3ff245e 100644 --- a/cpp/src/fir.h +++ b/cpp/src/fir.h @@ -18,15 +18,15 @@ void *malloc_aligned(size_t alignment, size_t nbytes); // Abstract class -class FirFilter { +template class FirFilter { public: // preconditions: // in + (nk - 1) is aligned to 128 bits // out is aligned to 128 bits - virtual void process(const float *in, float *out, size_t n) = 0; + virtual void process(const T *in, U *out, size_t n) = 0; }; -class SimpleFirFilter : public FirFilter { +class SimpleFirFilter : public FirFilter { public: SimpleFirFilter(const float *kernel, size_t nk); ~SimpleFirFilter(); @@ -36,8 +36,25 @@ class SimpleFirFilter : public FirFilter { float *k; }; -// should make conditional compilation? -class NeonFirFilter : public FirFilter { +class HalfRateFirFilter : public FirFilter { + public: + HalfRateFirFilter(const float *kernel, size_t nk, size_t n); + ~HalfRateFirFilter(); + void process(const float *in, float *out, size_t n); + private: + static const int kMaxNk = 256; + size_t nk; + float *i0, *i1, *i2; + float *y0, *y1, *y2; + float *k2; + FirFilter *f0; + FirFilter *f1; + FirFilter *f2; +}; + +#ifdef HAVE_NEON + +class NeonFirFilter : public FirFilter { public: NeonFirFilter(const float *kernel, size_t nk); ~NeonFirFilter(); @@ -46,3 +63,16 @@ class NeonFirFilter : public FirFilter { size_t nk; float *k; }; + +class Neon16FirFilter : public FirFilter { + public: + Neon16FirFilter(const float *kernel, size_t nk, bool mirror); + ~Neon16FirFilter(); + void process(const float *in, float *out, size_t n); + private: + size_t nk; + int16_t *k; + bool mirror; +}; + +#endif // HAVE_NEON diff --git a/cpp/src/neon_fir.s b/cpp/src/neon_fir.s index b7d3131..c0bcb20 100644 --- a/cpp/src/neon_fir.s +++ b/cpp/src/neon_fir.s @@ -94,4 +94,274 @@ neon_fir_direct3: pop {r4-r7} bx lr - .size neon_fir_direct, .-neon_fir_direct +@ a sketch to see if this is worthwhile; looks like it isn't + + + .if 0 + .size neon_fir_short, .-neon_fir_short + + .align 2 + .global neon_fir_short + .type neon_fir_short, %function +neon_fir_short: +@ r0 = pointer to input (aligned) +@ r1 = pointer to kernel (aligned) +@ r2 = pointer to output (aligned) +@ r3 = size of input in floats (multiple of 4) +@ stack[0] = size of kernel in floats (multiple of 4) + + push {r4-r7} + vpush {q4-q7} + ldr r4, [sp, #(4 * 4 + 4 * 16)] + + sub sp, #(4 * 16) @ room for z2's + lsl r6, r4, #2 + sub r6, #16 + + @ compute initial overlap (todo redo) + mov r7, r4 + vmov.i32 q9, #0 + vmov.i32 q10, #0 + vmov.i32 q11, #0 +neon_fir_short1: + vld1.i32 {q0}, [r0:128]! @ load 4 samples from input + vld1.i32 {q1}, [r1:128]! @ load 4 samples from kernel + vmla.f32 q9, q1, d0[1] + vmla.f32 q10, q1, d1[0] + vmla.f32 q11, q1, d1[1] + subs r7, #4 + bne neon_fir_short1 + vmov.i32 q12, #0 + vext.32 q13, q9, q12, #3 + vext.32 q1, q10, q12, #2 + vadd.f32 q13, q1 + vext.32 q1, q11, q12, #1 + vadd.f32 q8, q13, q1 + sub r0, r6 + sub r1, r4, lsl #2 + +neon_fir_short2: + + mov r7, r4 + vmov.i32 q4, #0 + vmov.i32 q5, #0 + vmov.i32 q6, #0 + vmov.i32 q7, #0 + vmov.i32 q8, #0 + vmov.i32 q9, #0 + vmov.i32 q10, #0 + vmov.i32 q11, #0 + vmov.i32 q12, #0 + vmov.i32 q13, #0 + vmov.i32 q14, #0 + vmov.i32 q15, #0 + @ inner loop +neon_fir_short3: + vld1.32 {q0}, [r0:128]! @ load 4 samples from input (a0) + vld1.i32 {q1}, [r1:128]! @ load 4 samples from kernel (b0) + vmla.f32 q4, q1, d0[0] + vmla.f32 q5, q1, d0[1] + vmla.f32 q6, q1, d1[0] + vmla.f32 q7, q1, d1[1] + vld1.32 {q2}, [r0:128]! @ load 4 samples from input (a2) + vadd.f32 q0, q2 @ a1 + vld1.i32 {q3}, [r1:128]! @ load 4 samples from kernel (b2) + vadd.f32 q1, q3 @ b1 + vmla.f32 q12, q3, d4[0] + vmla.f32 q13, q3, d4[1] + vmla.f32 q14, q3, d5[0] + vmla.f32 q15, q3, d5[1] + vmla.f32 q8, q1, d0[0] + vmla.f32 q9, q1, d0[1] + vmla.f32 q10, q1, d1[0] + vmla.f32 q11, q1, d1[1] + subs r7, #8 + bne neon_fir_short3 + + @ now for the fun part + mov r5, sp + vld1.32 {q0, q1}, [r5]! + vld1.32 {q2, q3}, [r5]! + vadd.f32 q0, q4 + vadd.f32 q1, q5 + vadd.f32 q2, q6 + vadd.f32 q3, q7 + vsub.f32 q8, q4 + vsub.f32 q9, q5 + vsub.f32 q10, q6 + vsub.f32 q1, q7 + vsub.f32 q8, q12 + vsub.f32 q9, q13 + vsub.f32 q10, q14 + vsub.f32 q11, q15 + mov r5, sp + vst1.32 {q12, q13}, [r5]! + vst1.32 {q14, q15}, [r5]! + + @ process overlaps + vext.32 q0, q12, q9, #3 + vext.32 q13, q9, q12, #3 + vadd.f32 q8, q0 + vext.32 q0, q12, q10, #2 + vext.32 q1, q10, q12, #2 + vadd.f32 q8, q0 + vadd.f32 q13, q1 + vext.32 q0, q12, q11, #1 + vext.32 q1, q11, q12, #1 + vadd.f32 q0, q8 + vadd.f32 q8, q13, q1 + + sub r0, r6 + sub r1, r4, lsl #2 + vst1.32 {q0}, [r2:128]! + vst1.32 {q1}, [r2:128]! + + subs r3, #8 + bne neon_fir_short2 + + add sp, #64 + vpop {q4-q7} + pop {r4-r7} + bx lr + + .size neon_fir_short, .-neon_fir_short + .endif + + .align 2 + .global neon_fir_fixed16 + .type neon_fir_fixed16, %function + +@ fixed 16 bit dot product +@ based on Andy Hung code, but not doing mirror trick +neon_fir_fixed16: + + push {r4-r7} + ldr r4, [sp, #(4 * 4)] + + lsl r6, r4, #1 + sub r6, #2 + +neon_fir_fixed16_1: + vmov.i16 q0, #0 + mov r7, r4 + +neon_fir_fixed16_2: + vld1.16 {q2, q3}, [r0]! @ load 16 samples from input + vld1.16 {q8, q9}, [r1:128]! @ load 16 samples from kernel + vmlal.s16 q0, d4, d16 + vmlal.s16 q0, d5, d17 + vmlal.s16 q0, d6, d18 + vmlal.s16 q0, d7, d19 + subs r7, #16 + bne neon_fir_fixed16_2 + vpadd.s32 d0, d0, d1 + vpadd.s32 d0, d0, d0 + + sub r0, r6 + sub r1, r4, lsl #1 + vst1.32 {d0[0]}, [r2]! + + subs r3, #1 + bne neon_fir_fixed16_1 + + pop {r4-r7} + bx lr + .size neon_fir_fixed16, .-neon_fir_fixed16 + + .align 2 + .global neon_fir_fixed16m + .type neon_fir_fixed16m, %function + +@ fixed 16 bit dot product +@ based on Andy Hung code, modeling mirror trick +neon_fir_fixed16m: + + push {r4-r7} + ldr r4, [sp, #(4 * 4)] + + lsl r6, r4, #1 + sub r6, #2 + +neon_fir_fixed16m_1: + vmov.i16 q0, #0 + mov r7, r4 + add r5, r0, r6 + +neon_fir_fixed16m_2: + vld1.16 {q2}, [r5] @ load 8 samples from input + vld1.16 {q3}, [r0]! @ load 8 samples from input + vld1.16 {q8}, [r1:128]! @ load 8 samples from kernel + vld1.16 {q9}, [r1:128]! @ load 8 samples from kernel + vmlal.s16 q0, d4, d16 + vmlal.s16 q0, d5, d17 + vmlal.s16 q0, d6, d18 + vmlal.s16 q0, d7, d19 + subs r7, #16 + sub r5, #16 + bne neon_fir_fixed16m_2 + vpadd.s32 d0, d0, d1 + vpadd.s32 d0, d0, d0 + + sub r0, r6 + sub r1, r4, lsl #1 + vst1.32 {d0[0]}, [r2]! + + subs r3, #1 + bne neon_fir_fixed16m_1 + + pop {r4-r7} + bx lr + .size neon_fir_fixed16m, .-neon_fir_fixed16m + + .align 2 + .global neon_halfrate_split +neon_halfrate_split: +@ r0 = pointer to input +@ r1 = pointer to buf0 +@ r2 = pointer to buf1 +@ r3 = pointer to buf2 +@ stack[0] = number of output buffers to fill + push {r4} + ldr r4, [sp, #4] +neon_halfrate_split1: + vld2.32 {q0, q1}, [r0:128]! + vadd.f32 q2, q0, q1 + vst1.32 {q0}, [r1:128]! + vst1.32 {q1}, [r3:128]! + vst1.32 {q2}, [r2:128]! + subs r4, #4 + bne neon_halfrate_split1 + + pop {r4} + bx lr + .size neon_halfrate_split, .-.neon_halfrate_split + + .align 2 + .global neon_halfrate_combine +neon_halfrate_combine: +@ r0 = pointer to output +@ r1 = pointer to buf0 +@ r2 = pointer to buf1 +@ r3 = pointer to buf2 +@ stack[0] = number of input buffers to combine +@ todo: deal with z2m2 + push {r4} + ldr r4, [sp, #4] + vmov.i32 q3, #0 +neon_halfrate_combine1: + vld1.32 {q2}, [r3:128]! + vld1.32 {q0}, [r1:128]! + vext.32 q8, q3, q2, #3 + vld1.32 {q1}, [r2:128]! + vsub.f32 q1, q0 + vadd.f32 q0, q8 + vsub.f32 q1, q2 + vst2.32 {q0, q1}, [r0:128]! + vmov q3, q2 + + subs r4, #4 + bne neon_halfrate_combine1 + + pop {r4} + bx lr + .size neon_halfrate_combine, .-.neon_halfrate_combine diff --git a/cpp/src/test_fir.cc b/cpp/src/test_fir.cc index d26638a..6c055c6 100644 --- a/cpp/src/test_fir.cc +++ b/cpp/src/test_fir.cc @@ -37,7 +37,7 @@ void condition_governor() { struct timespec ts; ts.tv_sec = 0; ts.tv_nsec = 900000000 + (v & 1); // 900ms - nanosleep(&ts, NULL); + //nanosleep(&ts, NULL); // consume cpu a bit to try to coax max cpufreq uint32_t x = v; @@ -58,14 +58,14 @@ float *mkrandom(size_t size) { return result; } -double test_accuracy(FirFilter *f1, FirFilter *f2, const float *inp, int nblock) { +double test_accuracy(FirFilter *f1, FirFilter *f2, const float *inp, int nblock) { float *out1 = (float *)malloc_aligned(16, nblock * sizeof(out1[0])); float *out2 = (float *)malloc_aligned(16, nblock * sizeof(out2[0])); f1->process(inp + 1, out1, nblock); f2->process(inp + 1, out2, nblock); double err = 0; for (int i = 0; i < nblock; i++) { - //printf("%d: %f %f\n", i, out1[i], out2[i]); + printf("#%d: %f %f\n", i, out1[i], out2[i]); err += fabs(out1[i] - out2[i]); } free(out1); @@ -80,28 +80,39 @@ void benchfir(int size, int experiment) { float *kernel = mkrandom(size); float *inp = mkrandom(size + nblock); float *out = (float *)malloc_aligned(16, nblock * sizeof(out[0])); - FirFilter *f; + FirFilter *f; switch(experiment) { case 0: f = new SimpleFirFilter(kernel, size); break; +#ifdef HAVE_NEON + // this will crash on non-NEON devices, but we're only interested + // in testing NEON for now case 1: f = new NeonFirFilter(kernel, size); break; + case 2: + case 3: + f = new Neon16FirFilter(kernel, size, experiment == 3); + break; +#endif + case 4: + f = new HalfRateFirFilter(kernel, size, nblock); + break; } double start = now(); for (int j = 0; j < 15625; j++) { - f->process(inp + 1, out, 64); + f->process(inp + 1, out, nblock); } double elapsed = now() - start; printf("%i %f\n", size, 1e3 * elapsed); - FirFilter *fbase = new SimpleFirFilter(kernel, size); + FirFilter *fbase = new SimpleFirFilter(kernel, size); double accuracy = test_accuracy(fbase, f, inp, nblock); - //printf("accuracy = %g\n", accuracy); + printf("#accuracy = %g\n", accuracy); delete f; delete fbase; @@ -114,9 +125,9 @@ int main(int argc, char **argv) { printf("set style data linespoints\n" "set xlabel 'FIR kernel size'\n" "set ylabel 'ns per sample'\n" - "plot '-' title 'scalar', '-' title '4x4 block'\n"); - for (int experiment = 0; experiment < 2; experiment++) { - for (int i = 16; i <= 128; i += 16) { + "plot '-' title 'scalar', '-' title '4x4 block', '-' title 'fixed16', '-' title 'fixed16 mirror', '-' title 'half rate'\n"); + for (int experiment = 0; experiment < 5; experiment++) { + for (int i = 16; i <= 256; i += 16) { benchfir(i, experiment); } printf("e\n");