From 065689a892fbce6b7cdd02d587332e501581f22b Mon Sep 17 00:00:00 2001 From: Graham Whaley Date: Thu, 12 May 2022 10:23:28 +0100 Subject: [PATCH] spectral: Add Spectral noise reduction Add Spectral Noise Reduction module, based on the code from the UHSDR / Convolution-SDR code bases. Signed-off-by: Graham Whaley --- AudioSpectralDenoise_F32.cpp | 342 +++++++++++++++++++++++++++++++++++ AudioSpectralDenoise_F32.h | 198 ++++++++++++++++++++ OpenAudio_ArduinoLibrary.h | 1 + 3 files changed, 541 insertions(+) create mode 100644 AudioSpectralDenoise_F32.cpp create mode 100644 AudioSpectralDenoise_F32.h diff --git a/AudioSpectralDenoise_F32.cpp b/AudioSpectralDenoise_F32.cpp new file mode 100644 index 0000000..b032864 --- /dev/null +++ b/AudioSpectralDenoise_F32.cpp @@ -0,0 +1,342 @@ +/* AudioSpectralDenoise_F2.h + * Spectral noise reduction + * + * Extracted and based on the work found in the: + * - Convolution SDR: https://github.com/DD4WH/Teensy-ConvolutionSDR + * - UHSDR: https://github.com/df8oe/UHSDR/blob/active-devel/mchf-eclipse/drivers/audio/audio_nr.c + * + * License: GNU GPLv3 + * Both the Convolution SDR and UHSDR are licensed under GPLv3. + */ + +#include "AudioSpectralDenoise_F32.h" + +#include + +// No serial debug by default +static const bool serial_debug = false; + +int AudioSpectralDenoise_F32::setup(const AudioSettings_F32 & settings, + const int _N_FFT) +{ + enable(false); //Disable us, just incase we are already active... + sample_rate_Hz = settings.sample_rate_Hz; + + if (N_FFT == -1) { + //setup the FFT and IFFT. If they return a negative FFT, it wasn't an allowed FFT size. + N_FFT = myFFT.setup(settings, _N_FFT); //hopefully, we got the same N_FFT that we asked for + if (N_FFT < 1) + return N_FFT; + N_FFT = myIFFT.setup(settings, _N_FFT); //hopefully, we got the same N_FFT that we asked for + if (N_FFT < 1) + return N_FFT; + + //As we do a complex fft on a real signal, we only use half the returned FFT bins due + // to conjugate symmetry. Store the number of bins to make it obvious and handy. + N_bins = N_FFT / 2; + + //Spectral uses sqrtHann filtering + (myFFT.getFFTObject())->useHanningWindow(); //applied prior to FFT + + //allocate memory to hold frequency domain data - complex r+i, so double the size of the + // fft size. + complex_2N_buffer = new (std::nothrow) float32_t[2 * N_FFT]; + if (complex_2N_buffer == NULL) return -1; + + NR_X = new (std::nothrow) float32_t[N_bins]; + if (NR_X == NULL) return -1; + ph1y = new (std::nothrow) float32_t[N_bins]; + if (ph1y == NULL) return -1; + pslp = new (std::nothrow) float32_t[N_bins]; + if (pslp == NULL) return -1; + xt = new (std::nothrow) float32_t[N_bins]; + if (xt == NULL) return -1; + NR_SNR_post = new (std::nothrow) float32_t[N_bins]; + if (NR_SNR_post == NULL) return -1; + NR_SNR_prio = new (std::nothrow) float32_t[N_bins]; + if (NR_SNR_prio == NULL) return -1; + NR_Hk_old = new (std::nothrow) float32_t[N_bins]; + if (NR_Hk_old == NULL) return -1; + NR_G = new (std::nothrow) float32_t[N_bins]; + if (NR_G == NULL) return -1; + NR_Nest = new (std::nothrow) float32_t[N_bins]; + if (NR_Nest == NULL) return -1; + } + + //Clear out and initialise + for (int bindx = 0; bindx < N_bins; bindx++) { + NR_Hk_old[bindx] = 0.1; // old gain + NR_Nest[bindx] = 0.01; + NR_X[bindx] = 0.0; + NR_SNR_post[bindx] = 2.0; + NR_SNR_prio[bindx] = 1.0; + NR_G[bindx] = 0.0; + } + + //Work out the 'bin' range for our chosen voice frequencies + // divide 2 to account for nyquist + VAD_low = VAD_low_freq / ((sample_rate_Hz / 2.0) / (float32_t) (N_bins)); + VAD_high = VAD_high_freq / ((sample_rate_Hz / 2.0) / (float32_t) N_bins); + + xih1 = powf(10, asnr / 10.0); + pfac = (1.0 / pspri - 1.0) * (1.0 + xih1); + xih1r = 1.0 / (1.0 + xih1) - 1.0; + + //Configure the other things that might rely on the fft size of bitrate + tinc = 1.0 / (sample_rate_Hz / AUDIO_BLOCK_SAMPLES); //Frame time + tax = -tinc / log(tax_factor); //noise output smoothing constant in seconds = -tinc/ln(0.8) + tap = -tinc / log(tap_factor); //speech prob smoothing constant in seconds = -tinc/ln(0.9) + ap = expf(-tinc / tap); //noise output smoothing factor + ax = expf(-tinc / tax); //noise output smoothing factor + + if (serial_debug) { + Serial.println(" Spectral setup with fft:" + String(N_FFT)); + Serial.println(" FFT nblocks:" + String(myFFT.getNBuffBlocks())); + Serial.println(" iFFT nblocks:" + String(myIFFT.getNBuffBlocks())); + Serial.println(" Sample rate:" + String(sample_rate_Hz)); + Serial.println(" bins:" + String(N_bins)); + Serial.println(" VAD low:" + String(VAD_low)); + Serial.println(" VAD low freq:" + String(getVADLowFreq())); + Serial.println(" VAD high:" + String(VAD_high)); + Serial.println(" VAD high freq:" + String(getVADHighFreq())); + Serial.println(" tinc:" + String(tinc, 5)); + Serial.println(" tax_factor:" + String(tax_factor, 5)); + Serial.println(" tap_factor:" + String(tap_factor, 5)); + Serial.println(" tax:" + String(tax, 5)); + Serial.println(" tap:" + String(tap, 5)); + Serial.println(" ax:" + String(ax, 5)); + Serial.println(" ap:" + String(ap, 5)); + Serial.println(" xih1:" + String(xih1, 5)); + Serial.println(" xih1r:" + String(xih1r, 5)); + Serial.println(" pfac:" + String(pfac, 5)); + Serial.println(" snr_prio_min:" + String(getSNRPrioMin(), 5)); + Serial.println(" power_threshold:" + String(getPowerThreshold(), 5)); + Serial.println(" asnr:" + String(getAsnr(), 5)); + Serial.println(" NR_alpha:" + String(getNRAlpha(), 5)); + Serial.println(" NR_width:" + String(getNRWidth(), 5)); + + Serial.flush(); + } + + enable(true); + return is_enabled; +} + +void AudioSpectralDenoise_F32::update(void) +{ + //get a pointer to the latest data + audio_block_f32_t *in_audio_block = AudioStream_F32::receiveReadOnly_f32(); + if (!in_audio_block) + return; + + //simply return the audio if this class hasn't been enabled + if (!is_enabled) { + AudioStream_F32::transmit(in_audio_block); + AudioStream_F32::release(in_audio_block); + return; + } + //****************************************************************************** + //convert to frequency domain + //FFT is in complex_2N_buffer, interleaved real, imaginary, real, imaginary, etc + myFFT.execute(in_audio_block, complex_2N_buffer); + + // Preserve the block id, so we can pass it out with our final result + unsigned long incoming_id = in_audio_block->id; + + // We just passed ownership of in_audio_block to myFFT, so we can + // release it here as we won't use it here again. + AudioStream_F32::release(in_audio_block); + + if (init_phase == 1) { + if (serial_debug) { + Serial.println("One time init"); + Serial.flush(); + } + init_phase++; + + for (int bindx = 0; bindx < N_bins; bindx++) { + NR_G[bindx] = 1.0; + NR_Hk_old[bindx] = 1.0; // old gain or xu in development mode + NR_Nest[bindx] = 0.0; + pslp[bindx] = 0.5; + } + } + //****************************************************************************** + //***** Calculate magnitude, used later for noise estimates and calculations + // AIUI, as we are only passing real values into a complex FFT, the resulting + // data contains duplicated mirrored data, thus we only need to evaluate the + // magnitude of the first half of the bins, as it will be identical to that + // of the second half of the bins. When we finally apply the NR results to the + // FFT data we apply it to both the first half and the conjugate, mirror style. + // Fundamentally, this saves us half the processing on some parts. + for (int bindx = 0; bindx < N_bins; bindx++) { + NR_X[bindx] = + (complex_2N_buffer[bindx * 2] * complex_2N_buffer[bindx * 2] + + complex_2N_buffer[bindx * 2 + 1] * complex_2N_buffer[bindx * 2 + 1]); + } + + //Second stage initialisation + if (init_phase == 2) { + static int NR_init_counter = 0; + if (serial_debug) { + Serial.println("Two time init (" + String(NR_init_counter) + ")"); + Serial.flush(); + } + for (int bindx = 0; bindx < N_bins; bindx++) { + // we do it 20 times to average over 20 frames for app. 100ms only on + // NR_on/bandswitch/modeswitch,... + NR_Nest[bindx] = NR_Nest[bindx] + 0.05 * NR_X[bindx]; + xt[bindx] = psini * NR_Nest[bindx]; + } + NR_init_counter++; + if (NR_init_counter > 19) //average over 20 frames for app. 100ms + { + if (serial_debug) { + Serial.println("Two time init done"); + Serial.flush(); + } + NR_init_counter = 0; + init_phase++; + } + if (serial_debug) + Serial.println(" Two time loop done"); + } + + //Now we are fully initialised, we can actually do the NR processing + //****************************************************************************** + //MMSE (Minimum Mean Square Error) based noise estimate + // code/algo inspired by the matlab based voicebox library: + // http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html + // Noise estimate code can be found at: + // https://github.com/YouriT/matlab-speech/blob/master/MATLAB_CODE_SOURCE/voicebox/estnoiseg.m + for (int bindx = 0; bindx < N_bins; bindx++) { + float32_t xtr; + + // a-posteriori speech presence probability + ph1y[bindx] = 1.0 / (1.0 + pfac * expf(xih1r * NR_X[bindx] / xt[bindx])); + // smoothed speech presence probability + pslp[bindx] = ap * pslp[bindx] + (1.0 - ap) * ph1y[bindx]; + + // limit ph1y + if (pslp[bindx] > psthr) { + ph1y[bindx] = 1.0 - pnsaf; + } else { + ph1y[bindx] = fmin(ph1y[bindx], 1.0); + } + // estimated raw noise spectrum + xtr = (1.0 - ph1y[bindx]) * NR_X[bindx] + ph1y[bindx] * xt[bindx]; + // smooth the noise estimate + xt[bindx] = ax * xt[bindx] + (1.0 - ax) * xtr; + } + + // Limit the ratios + // I don't have a lot of info on how this works, but SNRpost and SNRprio are related + // to both Ephraim&Malah(84) and Romanin(2009) papers + for (int bindx = 0; bindx < N_bins; bindx++) { + // limited to +30 /-15 dB, might be still too much of reduction, let's try it? + NR_SNR_post[bindx] = fmax(fmin(NR_X[bindx] / xt[bindx], 1000.0), snr_prio_min); + + NR_SNR_prio[bindx] = + fmax(NR_alpha * NR_Hk_old[bindx] + + (1.0 - NR_alpha) * fmax(NR_SNR_post[bindx] - 1.0, 0.0), 0.0); + } + + //****************************************************************************** + // VAD + // maybe we should limit this to the signal containing bins (filtering!!) + for (int bindx = VAD_low; bindx < VAD_high; bindx++) { + float32_t v = + NR_SNR_prio[bindx] * NR_SNR_post[bindx] / (1.0 + NR_SNR_prio[bindx]); + NR_G[bindx] = 1.0 / NR_SNR_post[bindx] * sqrtf((0.7212 * v + v * v)); + NR_Hk_old[bindx] = NR_SNR_post[bindx] * NR_G[bindx] * NR_G[bindx]; + } + + //****************************************************************************** + // Do the musical noise reduction + // musical noise "artefact" reduction by dynamic averaging - depending on SNR ratio + pre_power = 0.0; + post_power = 0.0; + for (int bindx = VAD_low; bindx < VAD_high; bindx++) { + pre_power += NR_X[bindx]; + post_power += NR_G[bindx] * NR_G[bindx] * NR_X[bindx]; + } + + power_ratio = post_power / pre_power; + if (power_ratio > power_threshold) { + power_ratio = 1.0; + NN = 1; + } else { + NN = 1 + 2 * (int)(0.5 + + NR_width * (1.0 - power_ratio / power_threshold)); + } + + for (int bindx = VAD_low + NN / 2; bindx < VAD_high - NN / 2; bindx++) { + NR_Nest[bindx] = 0.0; + for (int m = bindx - NN / 2; m <= bindx + NN / 2; m++) { + NR_Nest[bindx] += NR_G[m]; + } + NR_Nest[bindx] /= (float32_t) NN; + } + + // and now the edges - only going NN steps forward and taking the average + // lower edge + for (int bindx = VAD_low; bindx < VAD_low + NN / 2; bindx++) { + NR_Nest[bindx] = 0.0; + for (int m = bindx; m < (bindx + NN); m++) { + NR_Nest[bindx] += NR_G[m]; + } + NR_Nest[bindx] /= (float32_t) NN; + } + + // upper edge - only going NN steps backward and taking the average + for (int bindx = VAD_high - NN; bindx < VAD_high; bindx++) { + NR_Nest[bindx] = 0.0; + for (int m = bindx; m > (bindx - NN); m--) { + NR_Nest[bindx] += NR_G[m]; + } + NR_Nest[bindx] /= (float32_t) NN; + } + // end of edge treatment + + for (int bindx = VAD_low + NN / 2; bindx < VAD_high - NN / 2; bindx++) { + NR_G[bindx] = NR_Nest[bindx]; + } + // end of musical noise reduction + + //****************************************************************************** + // And finally actually apply the weightings to the signals... + // FINAL SPECTRAL WEIGHTING: Multiply current FFT results with complex_2N_buffer for + // bins with the bin-specific gain factors G + for (int bindx = 0; bindx < N_bins; bindx++) { + // real part + complex_2N_buffer[bindx * 2] = complex_2N_buffer[bindx * 2] * NR_G[bindx]; + + // imag part + complex_2N_buffer[bindx * 2 + 1] = + complex_2N_buffer[bindx * 2 + 1] * NR_G[bindx]; + + // real part conjugate symmetric + //N_bins * 4 == N_FFT * 2 == N_FFT[real, imag] + complex_2N_buffer[N_bins * 4 - bindx * 2 - 2] = + complex_2N_buffer[N_bins * 4 - bindx * 2 - 2] * NR_G[bindx]; + + // imag part conjugate symmetric + complex_2N_buffer[N_bins * 4 - bindx * 2 - 1] = + complex_2N_buffer[N_bins * 4 - bindx * 2 - 1] * NR_G[bindx]; + } + + //****************************************************************************** + //And finally call the IFFT, back to the time domain, and pass the processed block on + + //out_block is pre-allocated in here. + audio_block_f32_t *out_audio_block = myIFFT.execute(complex_2N_buffer); + + //update the block number to match the incoming one + out_audio_block->id = incoming_id; + + //send the returned audio block. Don't issue the release command here because myIFFT will re-use it + //don't release this buffer because myIFFT re-uses it within its own code + AudioStream_F32::transmit(out_audio_block); //don't release this buffer because myIFFT re-uses it within its own code + + return; +} diff --git a/AudioSpectralDenoise_F32.h b/AudioSpectralDenoise_F32.h new file mode 100644 index 0000000..ca352e3 --- /dev/null +++ b/AudioSpectralDenoise_F32.h @@ -0,0 +1,198 @@ +/* + * AudioSpectralDenoise_F32 + * + * Created: Graham Whaley, 2022 + * Purpose: Spectral noise reduction + * + * This processes a single stream of audio data (i.e., it is mono) + * + * License: GNU GPLv3 License + * As the code it is derived from is GPLv3 + * + * Based off the work from the UHSDR project, as also used in the mcHF and Convolution-SDR + * projects. + * Reference documentation can be found at https://github.com/df8oe/UHSDR/wiki/Noise-reduction + * Code extracted into isolated files can be found at + * https://github.com/grahamwhaley/DSPham/blob/master/spectral.cpp + */ + +#ifndef _AudioSpectralDenoise_F32_h +#define _AudioSpectralDenoise_F32_h + +#include "AudioStream_F32.h" +#include +#include "FFT_Overlapped_OA_F32.h" +#include + +class AudioSpectralDenoise_F32:public AudioStream_F32 { +//GUI: inputs:1, outputs:1 //this line used for automatic generation of GUI node +//GUI: shortName:spectral +public: + AudioSpectralDenoise_F32(void):AudioStream_F32(1, inputQueueArray_f32) { + }; + AudioSpectralDenoise_F32(const AudioSettings_F32 & + settings):AudioStream_F32(1, inputQueueArray_f32) { + } + AudioSpectralDenoise_F32(const AudioSettings_F32 & settings, + const int _N_FFT):AudioStream_F32(1, + inputQueueArray_f32) + { + setup(settings, _N_FFT); + } + + //destructor...release all of the memory that has been allocated + ~AudioSpectralDenoise_F32(void) { + if (complex_2N_buffer) delete complex_2N_buffer; + if (NR_X) delete NR_X; + if (ph1y) delete ph1y; + if (pslp) delete pslp; + if (xt) delete xt; + if (NR_SNR_post) delete NR_SNR_post; + if (NR_SNR_prio) delete NR_SNR_prio; + if (NR_Hk_old) delete NR_Hk_old; + if (NR_G) delete NR_G; + if (NR_Nest) delete NR_Nest; + } + + //Our default FFT size is 256. That is time and space efficient, but + // if you are running at a 'high' sample rate, the NR 'buckets' might + // be quite small. You may want to use a 1024 FFT if running at 44.1KHz + // for instance, if you can afford the time and space overheads. + int setup(const AudioSettings_F32 & settings, const int _N_FFT = 256); + + virtual void update(void); + bool enable(bool state = true) { + is_enabled = state; + return is_enabled; + } + bool enabled(void) { + return is_enabled; + } + + //Getters and Setters + float32_t getAsnr(void) { + return asnr; + } + void setAsnr(float32_t v) { + asnr = v; + } + float32_t getVADHighFreq(void) { + return VAD_high_freq; + } + void setVADHighFreq(float32_t f) { + VAD_high_freq = f; + } + float32_t getVADLowFreq(void) { + return VAD_low_freq; + } + void setVADLowFreq(float32_t f) { + VAD_low_freq = f; + } + float32_t getNRAlpha(void) { + return NR_alpha; + } + void setNRAlpha(float32_t v) { + NR_alpha = v; + if (NR_alpha < 0.9) + NR_alpha = 0.9; + if (NR_alpha > 0.9999) + NR_alpha = 0.9999; + } + float32_t getSNRPrioMin(void) { + return snr_prio_min; + } + void setSNRPrioMin(float32_t v) { + snr_prio_min = v; + } + int16_t getNRWidth(void) { + return NR_width; + } + void setNRWidth(int16_t v) { + NR_width = v; + } + float32_t getPowerThreshold(void) { + return power_threshold; + } + void setPowerThreshold(float32_t v) { + power_threshold = v; + } + float32_t getTaxFactor(void) { + return tax_factor; + } + void setTaxFactor(float32_t v) { + tax_factor = v; + } + float32_t getTapFactor(void) { + return tap_factor; + } + void setTapFactor(float32_t v) { + tap_factor = v; + } + +private: + static const int max_fft = 2048; //The largest FFT FFT_OA handles. Fixed so we can fix the + //array sizes - FIXME - a hack, but easier than doing the dynamic allocations for now. + + uint8_t init_phase = 1; //Track our phases of initialisation + int is_enabled = 0; + float32_t *complex_2N_buffer; //Store our FFT real/imag data + audio_block_f32_t *inputQueueArray_f32[1]; //memory pointer for the input to this module + FFT_Overlapped_OA_F32 myFFT; + IFFT_Overlapped_OA_F32 myIFFT; + int N_FFT = -1; //How big an FFT are we using? + int N_bins = -1; //How many actual data bins are we processing on + float sample_rate_Hz = AUDIO_SAMPLE_RATE; + + //*********** NR vars + //Magnitudes (fabs) of power for the last four (three?) audio blocks + float32_t *NR_X = NULL; + + float32_t *ph1y = NULL; + float32_t *pslp = NULL; + float32_t *xt = NULL; + + const float32_t psini = 0.5; //initial speech probability + const float32_t pspri = 0.5; //prior speech probability + float32_t asnr = 25; //active SNR in dB - seems to make less different than I expected. + float32_t xih1; + float32_t pfac; + float32_t xih1r; + + const float32_t psthr = 0.99; //threshold for smoothed speech probability + const float32_t pnsaf = 0.01; //noise probability safety value + float32_t tinc; //Frame time in seconds + float32_t tax_factor = 0.8; //Noise output smoothing factor + float32_t tax; //noise output smoothing constant in seconds = -tinc/ln(0.8) + float32_t tap_factor = 0.9; //Speech probability smoothing factor + float32_t tap; //speech prob smoothing constant in seconds = -tinc/ln(0.9) + float32_t ap; //noise output smoothing factor + float32_t ax; //noise output smoothing factor + float32_t snr_prio_min = powf(10, -(float32_t) 20 / 20.0); //Lower limit of SNR ratio calculation + // Time smoothing of gain weights. Makes quite a difference to the NR performance. + float32_t NR_alpha = 0.99; //range 0.98-0.9999. 0.95 acts much too hard: reverb effects. + + float32_t *NR_SNR_post = NULL; + float32_t *NR_SNR_prio = NULL; + float32_t *NR_Hk_old = NULL; + + // preliminary gain factors (before time smoothing) and after that contains the frequency + // smoothed gain factors + float32_t *NR_G = NULL; + + //Our Noise estimate array - 'one dimentional' is a hangover from the old version of the + // original code that used multiple entries for averaging, which seems to have then been + // dropped, but the arrays still left in place. + float32_t *NR_Nest = NULL; + + float32_t VAD_low_freq = 100.0; + float32_t VAD_high_freq = 3600.0; + //if we grow the FFT to 1024, these might need to be bigger than a uint8? + uint8_t VAD_low, VAD_high; //lower/upper bounds for 'voice spectrum' slot processing + int16_t NN; //used as part of VAD calculations, n-bin averaging?. Also, why an int16 ? + int16_t NR_width = 4; + float32_t pre_power, post_power; //Used in VAD calculations + float32_t power_ratio; + float32_t power_threshold = 0.4; +}; + +#endif diff --git a/OpenAudio_ArduinoLibrary.h b/OpenAudio_ArduinoLibrary.h index 29eceed..0388333 100644 --- a/OpenAudio_ArduinoLibrary.h +++ b/OpenAudio_ArduinoLibrary.h @@ -21,6 +21,7 @@ #include "AudioMixer_F32.h" #include "AudioMultiply_F32.h" #include "AudioSettings_F32.h" +#include "AudioSpectralDenoise_F32.h" #include "input_i2s_f32.h" #include "output_i2s_f32.h" #include "play_queue_f32.h"