You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 
music-synthesizer-for-android/cpp/src/fir.cc

273 lines
8.0 KiB

/*
* Copyright 2013 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Implementation of FIR filtering (convolution)
#include <stdio.h> // for debugging, remove
#include <stdlib.h>
#include "aligned_buf.h"
#include "fir.h"
#ifdef __ANDROID_API__
void *malloc_aligned(size_t alignment, size_t nbytes) {
return memalign(alignment, nbytes);
}
#else
void *malloc_aligned(size_t alignment, size_t nbytes) {
void *result;
int status = posix_memalign(&result, alignment, nbytes);
return status == 0 ? result : 0;
}
#endif
SimpleFirFilter::SimpleFirFilter(const float *kernel, size_t nk) : nk(nk) {
k = (float *)malloc(nk * sizeof(k[0]));
for (size_t i = 0; i < nk; i++) {
k[i] = kernel[nk - i - 1];
}
}
SimpleFirFilter::~SimpleFirFilter() {
free(k);
}
void SimpleFirFilter::process(const float *in, float *out, size_t n) {
for (size_t i = 0; i < n; i++) {
float y = 0;
for (size_t j = 0; j < nk; j++) {
y += k[j] * in[i + j];
}
out[i] = y;
}
}
HalfRateFirFilter::HalfRateFirFilter(const float *kernel, size_t nk, size_t n) : nk(nk) {
float k0[kMaxNk / 2];
float k1[kMaxNk / 2];
size_t n2 = n >> 1;
size_t nk2 = nk >> 1;
// probably better to do fewer allocations and just set up pointers...
y0 = (float *)malloc_aligned(16, n2 * sizeof(y0[0]));
y1 = (float *)malloc_aligned(16, n2 * sizeof(y1[0]));
y2 = (float *)malloc_aligned(16, n2 * sizeof(y2[0]));
i0 = (float *)malloc_aligned(16, (n2 + nk2) * sizeof(i0[0]));
i1 = (float *)malloc_aligned(16, (n2 + nk2) * sizeof(i1[0]));
i2 = (float *)malloc_aligned(16, (n2 + nk2) * sizeof(i2[0]));
k2 = (float *)malloc_aligned(16, nk2 * sizeof(k2[0]));
for (size_t i = 0; i < nk2; i++) {
float b0 = kernel[i * 2];
float b2 = kernel[i * 2 + 1];
k0[i] = b0;
k1[i] = b0 + b2;
k2[i] = b2;
}
f0 = new SimpleFirFilter(k0, nk2);
f1 = new SimpleFirFilter(k1, nk2);
f2 = new SimpleFirFilter(k2, nk2);
}
HalfRateFirFilter::~HalfRateFirFilter() {
free(k2);
delete i0;
delete i1;
delete i2;
delete y0;
delete y1;
delete y2;
delete f0;
delete f1;
delete f2;
}
extern "C"
void neon_halfrate_split(const float *in, float *buf0, float *buf1, float *buf2, size_t n);
extern "C"
void neon_halfrate_combine(const float *out, float *buf0, float *buf1, float *buf2, size_t n);
void HalfRateFirFilter::process(const float *in, float *out, size_t n) {
size_t n2 = n >> 1;
size_t nk2 = nk >> 1;
size_t n2in = n2 + nk2 - 1;
#ifdef HAVE_NEON
neon_halfrate_split(in - 1, i0, i1, i2, n2in + 1);
#else
i2[0] = in[0];
for (size_t i = 0; i < n2in; i++) {
float a0 = in[i * 2 + 1];
float a2 = in[i * 2 + 2];
i0[1 + i] = a0;
i1[1 + i] = a0 + a2;
i2[1 + i] = a2;
}
#endif
f0->process(i0 + 1, y0, n2);
f1->process(i1 + 1, y1, n2);
f2->process(i2 + 1, y2, n2);
#ifdef HAVE_NEON
neon_halfrate_combine(out, y0, y1, y2, n2);
#else
float z2m2 = 0;
for (size_t i = 0; i < nk2; i++) {
z2m2 += k2[nk2 - 1 - i] * i2[i];
}
for (size_t i = 0; i < n2; i++) {
float m0 = y0[i];
float m1 = y1[i];
float m2 = y2[i];
out[i * 2] = m0 + z2m2;
out[i * 2 + 1] = m1 - m0 - m2;
//out[i*2] = i1.get()[i];
z2m2 = m2;
}
#endif
}
#ifdef HAVE_NEON
NeonFirFilter::NeonFirFilter(const float *kernel, size_t nk) : nk(nk) {
// TODO: handle odd size nk (must be multiple of 4)
k = (float *)malloc_aligned(16, nk * sizeof(k[0]));
for (size_t i = 0; i < nk; i += 4) {
for (size_t j = 0; j < 4; j++) {
k[i + j] = kernel[nk - i - 4 + j];
}
}
}
NeonFirFilter::~NeonFirFilter() {
free(k);
}
extern "C"
void neon_fir_direct(const float *in, const float *k, float *out, size_t n, size_t nk);
void NeonFirFilter::process(const float *in, float *out, size_t n) {
neon_fir_direct(in - 1, k, out, n, nk);
}
Neon16FirFilter::Neon16FirFilter(const float *kernel, size_t nk, bool mirror)
: nk(nk), mirror(mirror) {
// TODO: handle odd size nk (must be multiple of 4)
k = (int16_t *)malloc_aligned(16, nk * sizeof(k[0]));
for (size_t i = 0; i < nk; i++) {
k[i] = 32768 * kernel[nk - i - 1];
}
}
Neon16FirFilter::~Neon16FirFilter() {
free(k);
}
extern "C"
void neon_fir_fixed16(const float *in, const int16_t *k, float *out, size_t n, size_t nk);
extern "C"
void neon_fir_fixed16m(const float *in, const int16_t *k, float *out, size_t n, size_t nk);
void Neon16FirFilter::process(const float *in, float *out, size_t n) {
if (mirror)
neon_fir_fixed16m(in - 1, k, out, n, nk);
else
neon_fir_fixed16(in - 1, k, out, n, nk);
}
#endif
#ifdef __SSE2__
#include <emmintrin.h>
SseFirFilter::SseFirFilter(const float *kernel, size_t nk) : nk(nk) {
// TODO: handle odd size nk (must be multiple of 4)
k = (float *)malloc_aligned(16, nk * sizeof(k[0]));
for (size_t i = 0; i < nk; i += 4) {
for (size_t j = 0; j < 4; j++) {
k[i + j] = kernel[nk - i - 4 + j];
}
}
}
SseFirFilter::~SseFirFilter() {
free(k);
}
void printvec(__m128 v) {
float *f = (float *)&v;
printf("[%f %f %f %f]\n", f[0], f[1], f[2], f[3]);
}
void SseFirFilter::process(const float *in1, float *out, size_t n) {
const float *in = in1 - 1;
__m128 q9 = _mm_set_ps1(0.0);
__m128 q10 = _mm_set_ps1(0.0);
__m128 q11 = _mm_set_ps1(0.0);
__m128i mask = _mm_set_epi32(-1, -1, -1, 0);
for (int i = 0; i < nk; i += 4) {
__m128 q0 = _mm_load_ps(&in[i]);
__m128 q1 = _mm_load_ps(&k[i]);
__m128 s = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(1, 1, 1, 1));
q9 = _mm_add_ps(_mm_mul_ps(q1, s), q9);
s = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(2, 2, 2, 2));
q10 = _mm_add_ps(_mm_mul_ps(q1, s), q10);
s = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(3, 3, 3, 3));
q11 = _mm_add_ps(_mm_mul_ps(q1, s), q11);
}
// Note: AVX has _mm_permute_ps, which would be a bit more direct
q9 = (__m128)_mm_and_si128((__m128i)q9, mask);
__m128 q8 = _mm_shuffle_ps(q9, q9, _MM_SHUFFLE(0, 0, 0, 3));
q10 = _mm_shuffle_ps(q10, (__m128)mask, _MM_SHUFFLE(0, 0, 3, 2));
q8 = _mm_add_ps(q8, q10);
q11 = (__m128)_mm_and_si128((__m128i)q11, mask);
q11 = _mm_shuffle_ps(q11, q11, _MM_SHUFFLE(0, 3, 2, 1));
q8 = _mm_add_ps(q8, q11);
for (int i = 0; i < n; i += 4) {
q9 = _mm_set_ps1(0.0);
q10 = _mm_set_ps1(0.0);
q11 = _mm_set_ps1(0.0);
const float *inptr = &in[i + 4];
// inner loop
for (int j = 0; j < nk; j += 4) {
__m128 q0 = _mm_load_ps(&inptr[j]);
__m128 q1 = _mm_load_ps(&k[j]);
__m128 s = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(0, 0, 0, 0));
q8 = _mm_add_ps(_mm_mul_ps(q1, s), q8);
s = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(1, 1, 1, 1));
q9 = _mm_add_ps(_mm_mul_ps(q1, s), q9);
s = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(2, 2, 2, 2));
q10 = _mm_add_ps(_mm_mul_ps(q1, s), q10);
s = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(3, 3, 3, 3));
q11 = _mm_add_ps(_mm_mul_ps(q1, s), q11);
}
// process overlaps
__m128 q0a = _mm_shuffle_ps(q9, q9, _MM_SHUFFLE(2, 1, 0, 3));
__m128 q0 = _mm_add_ps(q8, (__m128)_mm_and_si128(mask, (__m128i)q0a));
q8 = (__m128)_mm_andnot_si128(mask, (__m128i)q0a);
q0a = _mm_shuffle_ps((__m128)mask, q10, _MM_SHUFFLE(1, 0, 0, 0));
q0 = _mm_add_ps(q0, q0a);
q0a = _mm_shuffle_ps(q10, (__m128)mask, _MM_SHUFFLE(0, 0, 3, 2));
q8 = _mm_add_ps(q8, q0a);
q0a = (__m128)_mm_andnot_si128(mask, (__m128i)q11);
q0a = _mm_shuffle_ps(q0a, q0a, _MM_SHUFFLE(0, 3, 2, 1));
q0 = _mm_add_ps(q0, q0a);
q0a = (__m128)_mm_and_si128(mask, (__m128i)q11);
q0a = _mm_shuffle_ps(q0a, q0a, _MM_SHUFFLE(0, 3, 2, 1));
q8 = _mm_add_ps(q8, q0a);
_mm_store_ps(&out[i], q0);
}
}
#endif