spectral: Add Spectral noise reduction

Add Spectral Noise Reduction module, based on the code from the UHSDR / Convolution-SDR code bases. Signed-off-by: Graham Whaley <graham.whaley@gmail.com>
3 years ago · 065689a892
parent a609fa8602
commit 065689a892
3 changed files with 541 additions and 0 deletions
--- a/AudioSpectralDenoise_F32.cpp
+++ b/AudioSpectralDenoise_F32.cpp
@ -0,0 +1,342 @@
 /* AudioSpectralDenoise_F2.h
 * Spectral noise reduction
 *
 * Extracted and based on the work found in the:
 *  - Convolution SDR:  https://github.com/DD4WH/Teensy-ConvolutionSDR
 *  - UHSDR: https://github.com/df8oe/UHSDR/blob/active-devel/mchf-eclipse/drivers/audio/audio_nr.c
 *
 * License: GNU GPLv3
 *  Both the Convolution SDR and UHSDR are licensed under GPLv3.
 */
 #include "AudioSpectralDenoise_F32.h"
 #include <new>
 // No serial debug by default
 static const bool serial_debug = false;
 int AudioSpectralDenoise_F32::setup(const AudioSettings_F32 & settings,
                                    const int _N_FFT)
 {
  enable(false);                //Disable us, just incase we are already active...
  sample_rate_Hz = settings.sample_rate_Hz;
  if (N_FFT == -1) {
    //setup the FFT and IFFT.  If they return a negative FFT, it wasn't an allowed FFT size.
    N_FFT = myFFT.setup(settings, _N_FFT);  //hopefully, we got the same N_FFT that we asked for
    if (N_FFT < 1)
      return N_FFT;
    N_FFT = myIFFT.setup(settings, _N_FFT); //hopefully, we got the same N_FFT that we asked for
    if (N_FFT < 1)
      return N_FFT;
    //As we do a complex fft on a real signal, we only use half the returned FFT bins due
    // to conjugate symmetry. Store the number of bins to make it obvious and handy.
    N_bins = N_FFT / 2;
    //Spectral uses sqrtHann filtering
    (myFFT.getFFTObject())->useHanningWindow(); //applied prior to FFT
    //allocate memory to hold frequency domain data - complex r+i, so double the size of the
    // fft size.
    complex_2N_buffer = new (std::nothrow) float32_t[2 * N_FFT];
    if (complex_2N_buffer == NULL) return -1; 
    NR_X = new (std::nothrow) float32_t[N_bins];
    if (NR_X == NULL) return -1;
    ph1y = new (std::nothrow) float32_t[N_bins];
    if (ph1y == NULL) return -1;
    pslp = new (std::nothrow) float32_t[N_bins];
    if (pslp == NULL) return -1;
    xt = new (std::nothrow) float32_t[N_bins];
    if (xt == NULL) return -1;
    NR_SNR_post = new (std::nothrow) float32_t[N_bins];
    if (NR_SNR_post == NULL) return -1;
    NR_SNR_prio = new (std::nothrow) float32_t[N_bins];
    if (NR_SNR_prio == NULL) return -1;
    NR_Hk_old = new (std::nothrow) float32_t[N_bins];
    if (NR_Hk_old == NULL) return -1;
    NR_G = new (std::nothrow) float32_t[N_bins];
    if (NR_G == NULL) return -1;
    NR_Nest = new (std::nothrow) float32_t[N_bins];
    if (NR_Nest == NULL) return -1;
  }
  //Clear out and initialise
  for (int bindx = 0; bindx < N_bins; bindx++) {
    NR_Hk_old[bindx] = 0.1;     // old gain
    NR_Nest[bindx] = 0.01;
    NR_X[bindx] = 0.0;
    NR_SNR_post[bindx] = 2.0;
    NR_SNR_prio[bindx] = 1.0;
    NR_G[bindx] = 0.0;
  }
  //Work out the 'bin' range for our chosen voice frequencies
  // divide 2 to account for nyquist
  VAD_low = VAD_low_freq / ((sample_rate_Hz / 2.0) / (float32_t) (N_bins));
  VAD_high = VAD_high_freq / ((sample_rate_Hz / 2.0) / (float32_t) N_bins);
  xih1 = powf(10, asnr / 10.0);
  pfac = (1.0 / pspri - 1.0) * (1.0 + xih1);
  xih1r = 1.0 / (1.0 + xih1) - 1.0;
  //Configure the other things that might rely on the fft size of bitrate
  tinc = 1.0 / (sample_rate_Hz / AUDIO_BLOCK_SAMPLES);  //Frame time 
  tax = -tinc / log(tax_factor);       //noise output smoothing constant in seconds = -tinc/ln(0.8)
  tap = -tinc / log(tap_factor);       //speech prob smoothing constant in seconds = -tinc/ln(0.9)
  ap = expf(-tinc / tap);       //noise output smoothing factor
  ax = expf(-tinc / tax);       //noise output smoothing factor
  if (serial_debug) {
    Serial.println(" Spectral setup with fft:" + String(N_FFT));
    Serial.println("  FFT nblocks:" + String(myFFT.getNBuffBlocks()));
    Serial.println("  iFFT nblocks:" + String(myIFFT.getNBuffBlocks()));
    Serial.println("  Sample rate:" + String(sample_rate_Hz));
    Serial.println("  bins:" + String(N_bins));
    Serial.println("  VAD low:" + String(VAD_low));
    Serial.println("  VAD low freq:" + String(getVADLowFreq()));
    Serial.println("  VAD high:" + String(VAD_high));
    Serial.println("  VAD high freq:" + String(getVADHighFreq()));
    Serial.println("  tinc:" + String(tinc, 5));
    Serial.println("  tax_factor:" + String(tax_factor, 5));
    Serial.println("  tap_factor:" + String(tap_factor, 5));
    Serial.println("  tax:" + String(tax, 5));
    Serial.println("  tap:" + String(tap, 5));
    Serial.println("  ax:" + String(ax, 5));
    Serial.println("  ap:" + String(ap, 5));
    Serial.println("  xih1:" + String(xih1, 5));
    Serial.println("  xih1r:" + String(xih1r, 5));
    Serial.println("  pfac:" + String(pfac, 5));
    Serial.println("  snr_prio_min:" + String(getSNRPrioMin(), 5));
    Serial.println("  power_threshold:" + String(getPowerThreshold(), 5));
    Serial.println("  asnr:" + String(getAsnr(), 5));
    Serial.println("  NR_alpha:" + String(getNRAlpha(), 5));
    Serial.println("  NR_width:" + String(getNRWidth(), 5));
    Serial.flush();
  }
  enable(true);
  return is_enabled;
 }
 void AudioSpectralDenoise_F32::update(void)
 {
  //get a pointer to the latest data
  audio_block_f32_t *in_audio_block = AudioStream_F32::receiveReadOnly_f32();
  if (!in_audio_block)
    return;
  //simply return the audio if this class hasn't been enabled
  if (!is_enabled) {
    AudioStream_F32::transmit(in_audio_block);
    AudioStream_F32::release(in_audio_block);
    return;
  }
  //******************************************************************************
  //convert to frequency domain
  //FFT is in complex_2N_buffer, interleaved real, imaginary, real, imaginary, etc
  myFFT.execute(in_audio_block, complex_2N_buffer);
  // Preserve the block id, so we can pass it out with our final result
  unsigned long incoming_id = in_audio_block->id;
  // We just passed ownership of in_audio_block to myFFT, so we can
  // release it here as we won't use it here again.
  AudioStream_F32::release(in_audio_block);
  if (init_phase == 1) {
    if (serial_debug) {
      Serial.println("One time init");
      Serial.flush();
    }
    init_phase++;
    for (int bindx = 0; bindx < N_bins; bindx++) {
      NR_G[bindx] = 1.0;
      NR_Hk_old[bindx] = 1.0;   // old gain or xu in development mode
      NR_Nest[bindx] = 0.0;
      pslp[bindx] = 0.5;
    }
  }
  //******************************************************************************
  //***** Calculate magnitude, used later for noise estimates and calculations
  // AIUI, as we are only passing real values into a complex FFT, the resulting
  // data contains duplicated mirrored data, thus we only need to evaluate the
  // magnitude of the first half of the bins, as it will be identical to that
  // of the second half of the bins. When we finally apply the NR results to the
  // FFT data we apply it to both the first half and the conjugate, mirror style.
  // Fundamentally, this saves us half the processing on some parts.
  for (int bindx = 0; bindx < N_bins; bindx++) {
    NR_X[bindx] =
        (complex_2N_buffer[bindx * 2] * complex_2N_buffer[bindx * 2] +
         complex_2N_buffer[bindx * 2 + 1] * complex_2N_buffer[bindx * 2 + 1]);
  }
  //Second stage initialisation
  if (init_phase == 2) {
    static int NR_init_counter = 0;
    if (serial_debug) {
      Serial.println("Two time init (" + String(NR_init_counter) + ")");
      Serial.flush();
    }
    for (int bindx = 0; bindx < N_bins; bindx++) {
      // we do it 20 times to average over 20 frames for app. 100ms only on
      // NR_on/bandswitch/modeswitch,...
      NR_Nest[bindx] = NR_Nest[bindx] + 0.05 * NR_X[bindx];
      xt[bindx] = psini * NR_Nest[bindx];
    }
    NR_init_counter++;
    if (NR_init_counter > 19)   //average over 20 frames for app. 100ms
    {
      if (serial_debug) {
        Serial.println("Two time init done");
        Serial.flush();
      }
      NR_init_counter = 0;
      init_phase++;
    }
    if (serial_debug)
      Serial.println(" Two time loop done");
  }
  //Now we are fully initialised, we can actually do the NR processing
  //******************************************************************************
  //MMSE (Minimum Mean Square Error) based noise estimate
  // code/algo inspired by the matlab based voicebox library:
  // http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
  // Noise estimate code can be found at:
  // https://github.com/YouriT/matlab-speech/blob/master/MATLAB_CODE_SOURCE/voicebox/estnoiseg.m
  for (int bindx = 0; bindx < N_bins; bindx++) {
    float32_t xtr;
    // a-posteriori speech presence probability
    ph1y[bindx] = 1.0 / (1.0 + pfac * expf(xih1r * NR_X[bindx] / xt[bindx]));
    // smoothed speech presence probability
    pslp[bindx] = ap * pslp[bindx] + (1.0 - ap) * ph1y[bindx];
    // limit ph1y
    if (pslp[bindx] > psthr) {
      ph1y[bindx] = 1.0 - pnsaf;
    } else {
      ph1y[bindx] = fmin(ph1y[bindx], 1.0);
    }
    // estimated raw noise spectrum
    xtr = (1.0 - ph1y[bindx]) * NR_X[bindx] + ph1y[bindx] * xt[bindx];
    // smooth the noise estimate
    xt[bindx] = ax * xt[bindx] + (1.0 - ax) * xtr;
  }
  // Limit the ratios
  // I don't have a lot of info on how this works, but SNRpost and SNRprio are related
  // to both Ephraim&Malah(84) and Romanin(2009) papers
  for (int bindx = 0; bindx < N_bins; bindx++) {
    // limited to +30 /-15 dB, might be still too much of reduction, let's try it?
    NR_SNR_post[bindx] = fmax(fmin(NR_X[bindx] / xt[bindx], 1000.0), snr_prio_min);
    NR_SNR_prio[bindx] =
      fmax(NR_alpha * NR_Hk_old[bindx] +
      (1.0 - NR_alpha) * fmax(NR_SNR_post[bindx] - 1.0, 0.0), 0.0);
  }
    //******************************************************************************
    // VAD
    // maybe we should limit this to the signal containing bins (filtering!!)
    for (int bindx = VAD_low; bindx < VAD_high; bindx++) {
      float32_t v =
          NR_SNR_prio[bindx] * NR_SNR_post[bindx] / (1.0 + NR_SNR_prio[bindx]);
      NR_G[bindx] = 1.0 / NR_SNR_post[bindx] * sqrtf((0.7212 * v + v * v));
      NR_Hk_old[bindx] = NR_SNR_post[bindx] * NR_G[bindx] * NR_G[bindx];
    }
    //******************************************************************************
    // Do the musical noise reduction
    // musical noise "artefact" reduction by dynamic averaging - depending on SNR ratio
    pre_power = 0.0;
    post_power = 0.0;
    for (int bindx = VAD_low; bindx < VAD_high; bindx++) {
      pre_power += NR_X[bindx];
      post_power += NR_G[bindx] * NR_G[bindx] * NR_X[bindx];
    }
    power_ratio = post_power / pre_power;
    if (power_ratio > power_threshold) {
      power_ratio = 1.0;
      NN = 1;
    } else {
      NN = 1 + 2 * (int)(0.5 +
                         NR_width * (1.0 - power_ratio / power_threshold));
    }
    for (int bindx = VAD_low + NN / 2; bindx < VAD_high - NN / 2; bindx++) {
      NR_Nest[bindx] = 0.0;
      for (int m = bindx - NN / 2; m <= bindx + NN / 2; m++) {
        NR_Nest[bindx] += NR_G[m];
      }
      NR_Nest[bindx] /= (float32_t) NN;
    }
    // and now the edges - only going NN steps forward and taking the average
    // lower edge
    for (int bindx = VAD_low; bindx < VAD_low + NN / 2; bindx++) {
      NR_Nest[bindx] = 0.0;
      for (int m = bindx; m < (bindx + NN); m++) {
        NR_Nest[bindx] += NR_G[m];
      }
      NR_Nest[bindx] /= (float32_t) NN;
    }
    // upper edge - only going NN steps backward and taking the average
    for (int bindx = VAD_high - NN; bindx < VAD_high; bindx++) {
      NR_Nest[bindx] = 0.0;
      for (int m = bindx; m > (bindx - NN); m--) {
        NR_Nest[bindx] += NR_G[m];
      }
      NR_Nest[bindx] /= (float32_t) NN;
    }
    // end of edge treatment
    for (int bindx = VAD_low + NN / 2; bindx < VAD_high - NN / 2; bindx++) {
      NR_G[bindx] = NR_Nest[bindx];
    }
    // end of musical noise reduction
  //******************************************************************************
  // And finally actually apply the weightings to the signals...
  // FINAL SPECTRAL WEIGHTING: Multiply current FFT results with complex_2N_buffer for
  // bins with the bin-specific gain factors G
  for (int bindx = 0; bindx < N_bins; bindx++) {
    // real part
    complex_2N_buffer[bindx * 2] = complex_2N_buffer[bindx * 2] * NR_G[bindx];
    // imag part
    complex_2N_buffer[bindx * 2 + 1] =
        complex_2N_buffer[bindx * 2 + 1] * NR_G[bindx];
    // real part conjugate symmetric
    //N_bins * 4 == N_FFT * 2 == N_FFT[real, imag]
    complex_2N_buffer[N_bins * 4 - bindx * 2 - 2] =
        complex_2N_buffer[N_bins * 4 - bindx * 2 - 2] * NR_G[bindx];
    // imag part conjugate symmetric
    complex_2N_buffer[N_bins * 4 - bindx * 2 - 1] =
        complex_2N_buffer[N_bins * 4 - bindx * 2 - 1] * NR_G[bindx];
  }
  //******************************************************************************
  //And finally call the IFFT, back to the time domain, and pass the processed block on
  //out_block is pre-allocated in here.
  audio_block_f32_t *out_audio_block = myIFFT.execute(complex_2N_buffer);
  //update the block number to match the incoming one
  out_audio_block->id = incoming_id;
  //send the returned audio block.  Don't issue the release command here because myIFFT will re-use it
  //don't release this buffer because myIFFT re-uses it within its own code
  AudioStream_F32::transmit(out_audio_block); //don't release this buffer because myIFFT re-uses it within its own code
  return;
 }
--- a/AudioSpectralDenoise_F32.h
+++ b/AudioSpectralDenoise_F32.h
@ -0,0 +1,198 @@
 /*
 * AudioSpectralDenoise_F32
 * 
 * Created: Graham Whaley, 2022
 * Purpose: Spectral noise reduction
 *          
 * This processes a single stream of audio data (i.e., it is mono)     
 *          
 * License: GNU GPLv3 License
 *  As the code it is derived from is GPLv3
 *
 * Based off the work from the UHSDR project, as also used in the mcHF and Convolution-SDR
 * projects.
 * Reference documentation can be found at https://github.com/df8oe/UHSDR/wiki/Noise-reduction
 * Code extracted into isolated files can be found at
 *   https://github.com/grahamwhaley/DSPham/blob/master/spectral.cpp
 */
 #ifndef _AudioSpectralDenoise_F32_h
 #define _AudioSpectralDenoise_F32_h
 #include "AudioStream_F32.h"
 #include <arm_math.h>
 #include "FFT_Overlapped_OA_F32.h"
 #include <Arduino.h>
 class AudioSpectralDenoise_F32:public AudioStream_F32 {
 //GUI: inputs:1, outputs:1  //this line used for automatic generation of GUI node
 //GUI: shortName:spectral
 public:
  AudioSpectralDenoise_F32(void):AudioStream_F32(1, inputQueueArray_f32) {
  };
  AudioSpectralDenoise_F32(const AudioSettings_F32 &
                           settings):AudioStream_F32(1, inputQueueArray_f32) {
  }
  AudioSpectralDenoise_F32(const AudioSettings_F32 & settings,
                           const int _N_FFT):AudioStream_F32(1,
                                                             inputQueueArray_f32)
  {
    setup(settings, _N_FFT);
  }
  //destructor...release all of the memory that has been allocated
  ~AudioSpectralDenoise_F32(void) {
    if (complex_2N_buffer) delete complex_2N_buffer;
    if (NR_X) delete NR_X;
    if (ph1y) delete ph1y;
    if (pslp) delete pslp;
    if (xt) delete xt;
    if (NR_SNR_post) delete NR_SNR_post;
    if (NR_SNR_prio) delete NR_SNR_prio;
    if (NR_Hk_old) delete NR_Hk_old;
    if (NR_G) delete NR_G;
    if (NR_Nest) delete NR_Nest;
  }
  //Our default FFT size is 256. That is time and space efficient, but
  // if you are running at a 'high' sample rate, the NR 'buckets' might
  // be quite small. You may want to use a 1024 FFT if running at 44.1KHz
  // for instance, if you can afford the time and space overheads.
  int setup(const AudioSettings_F32 & settings, const int _N_FFT = 256);
  virtual void update(void);
  bool enable(bool state = true) {
    is_enabled = state;
    return is_enabled;
  }
  bool enabled(void) {
    return is_enabled;
  }
  //Getters and Setters
  float32_t getAsnr(void) {
    return asnr;
  }
  void setAsnr(float32_t v) {
    asnr = v;
  }
  float32_t getVADHighFreq(void) {
    return VAD_high_freq;
  }
  void setVADHighFreq(float32_t f) {
    VAD_high_freq = f;
  }
  float32_t getVADLowFreq(void) {
    return VAD_low_freq;
  }
  void setVADLowFreq(float32_t f) {
    VAD_low_freq = f;
  }
  float32_t getNRAlpha(void) {
    return NR_alpha;
  }
  void setNRAlpha(float32_t v) {
    NR_alpha = v;
    if (NR_alpha < 0.9)
      NR_alpha = 0.9;
    if (NR_alpha > 0.9999)
      NR_alpha = 0.9999;
  }
  float32_t getSNRPrioMin(void) {
    return snr_prio_min;
  }
  void setSNRPrioMin(float32_t v) {
    snr_prio_min = v;
  }
  int16_t getNRWidth(void) {
    return NR_width;
  }
  void setNRWidth(int16_t v) {
    NR_width = v;
  }
  float32_t getPowerThreshold(void) {
    return power_threshold;
  }
  void setPowerThreshold(float32_t v) {
    power_threshold = v;
  }
  float32_t getTaxFactor(void) {
    return tax_factor;
  }
  void setTaxFactor(float32_t v) {
    tax_factor = v;
  }
  float32_t getTapFactor(void) {
    return tap_factor;
  }
  void setTapFactor(float32_t v) {
    tap_factor = v;
  }
 private:
  static const int max_fft = 2048;  //The largest FFT FFT_OA handles. Fixed so we can fix the
  //array sizes - FIXME - a hack, but easier than doing the dynamic allocations for now.
  uint8_t init_phase = 1;       //Track our phases of initialisation
  int is_enabled = 0;
  float32_t *complex_2N_buffer; //Store our FFT real/imag data
  audio_block_f32_t *inputQueueArray_f32[1];  //memory pointer for the input to this module
  FFT_Overlapped_OA_F32 myFFT;
  IFFT_Overlapped_OA_F32 myIFFT;
  int N_FFT = -1;               //How big an FFT are we using?
  int N_bins = -1;              //How many actual data bins are we processing on
  float sample_rate_Hz = AUDIO_SAMPLE_RATE;
  //*********** NR vars
  //Magnitudes (fabs) of power for the last four (three?) audio blocks
  float32_t *NR_X = NULL;
  float32_t *ph1y = NULL;
  float32_t *pslp = NULL;
  float32_t *xt = NULL;
  const float32_t psini = 0.5;  //initial speech probability
  const float32_t pspri = 0.5;  //prior speech probability
  float32_t asnr = 25;          //active SNR in dB - seems to make less different than I expected.
  float32_t xih1;
  float32_t pfac;
  float32_t xih1r;
  const float32_t psthr = 0.99; //threshold for smoothed speech probability
  const float32_t pnsaf = 0.01; //noise probability safety value
  float32_t tinc;               //Frame time in seconds
  float32_t tax_factor = 0.8;	//Noise output smoothing factor
  float32_t tax;                //noise output smoothing constant in seconds = -tinc/ln(0.8)
  float32_t tap_factor = 0.9;	//Speech probability smoothing factor
  float32_t tap;                //speech prob smoothing constant in seconds = -tinc/ln(0.9)
  float32_t ap;                 //noise output smoothing factor
  float32_t ax;                 //noise output smoothing factor
  float32_t snr_prio_min = powf(10, -(float32_t) 20 / 20.0);  //Lower limit of SNR ratio calculation
  // Time smoothing of gain weights. Makes quite a difference to the NR performance.
  float32_t NR_alpha = 0.99;    //range 0.98-0.9999. 0.95 acts much too hard: reverb effects.
  float32_t *NR_SNR_post = NULL;
  float32_t *NR_SNR_prio = NULL;
  float32_t *NR_Hk_old = NULL;
  // preliminary gain factors (before time smoothing) and after that contains the frequency
  // smoothed gain factors
  float32_t *NR_G = NULL;
  //Our Noise estimate array - 'one dimentional' is a hangover from the old version of the
  // original code that used multiple entries for averaging, which seems to have then been
  // dropped, but the arrays still left in place.
  float32_t *NR_Nest = NULL;
  float32_t VAD_low_freq = 100.0;
  float32_t VAD_high_freq = 3600.0;
  //if we grow the FFT to 1024, these might need to be bigger than a uint8?
  uint8_t VAD_low, VAD_high;    //lower/upper bounds for 'voice spectrum' slot processing
  int16_t NN;                   //used as part of VAD calculations, n-bin averaging?. Also, why an int16 ?
  int16_t NR_width = 4;
  float32_t pre_power, post_power;  //Used in VAD calculations
  float32_t power_ratio;
  float32_t power_threshold = 0.4;
 };
 #endif
--- a/OpenAudio_ArduinoLibrary.h
+++ b/OpenAudio_ArduinoLibrary.h
@ -21,6 +21,7 @@
 #include "AudioMixer_F32.h"
 #include "AudioMultiply_F32.h"
 #include "AudioSettings_F32.h"
 #include "AudioSpectralDenoise_F32.h"
 #include "input_i2s_f32.h"
 #include "output_i2s_f32.h"
 #include "play_queue_f32.h"