diff --git a/analyze_fft1024_F32.cpp b/analyze_fft1024_F32.cpp
index 9be3237..d7b9076 100644
--- a/analyze_fft1024_F32.cpp
+++ b/analyze_fft1024_F32.cpp
@@ -31,15 +31,15 @@
 
 #include <Arduino.h>
 #include "analyze_fft1024_F32.h"
-// #include "utility/dspinst.h"
 
 // Move audio data from an audio_block_f32_t to the FFT instance buffer.
+// This is for 128 numbers per block
 static void copy_to_fft_buffer(void *destination, const void *source)
 {
     const float *src = (const float *)source;
     float *dst = (float *)destination;
 
-    for (int i=0; i < AUDIO_BLOCK_SAMPLES; i++) {
+    for (int i=0; i < 128; i++) {
         *dst++ = *src++;     // real sample
         *dst++ = 0.0f;       // 0 for Imag
     }
@@ -51,17 +51,18 @@ static void apply_window_to_fft_buffer(void *buffer, const void *window)
     const float *win = (float *)window;
 
     for (int i=0; i < 1024; i++)
-        buf[2*i] *= *win++;
+        buf[2*i] *= *win++; 
 }
 
-void AudioAnalyzeFFT1024_F32::update(void)
-{
+void AudioAnalyzeFFT1024_F32::update(void)  {
     audio_block_f32_t *block;
-    block = receiveReadOnly_f32();
+    uint32_t tt;
+
+    block = AudioStream_F32::receiveReadOnly_f32();
     if (!block) return;
 
-// What all does 7EM cover??
-#if defined(__ARM_ARCH_7EM__)
+    // tt=micros();
+
     switch (state) {
     case 0:
         blocklist[0] = block;
@@ -93,44 +94,69 @@ void AudioAnalyzeFFT1024_F32::update(void)
         break;
     case 7:
         blocklist[7] = block;
+        // We have 4 previous blocks pointed to by blocklist[]:
         copy_to_fft_buffer(fft_buffer+0x000, blocklist[0]->data);
         copy_to_fft_buffer(fft_buffer+0x100, blocklist[1]->data);
         copy_to_fft_buffer(fft_buffer+0x200, blocklist[2]->data);
         copy_to_fft_buffer(fft_buffer+0x300, blocklist[3]->data);
+        // and 4 new blocks, just gathered:
         copy_to_fft_buffer(fft_buffer+0x400, blocklist[4]->data);
         copy_to_fft_buffer(fft_buffer+0x500, blocklist[5]->data);
         copy_to_fft_buffer(fft_buffer+0x600, blocklist[6]->data);
         copy_to_fft_buffer(fft_buffer+0x700, blocklist[7]->data);
-
+ 
         if (pWin)
            apply_window_to_fft_buffer(fft_buffer, window);
 
+#if defined(__IMXRT1062__)
+        // Teensyduino core for T4.x supports arm_cfft_f32
+        // arm_cfft_f32 (const arm_cfft_instance_f32 *S, float32_t *p1, uint8_t ifftFlag, uint8_t bitReverseFlag)
+        arm_cfft_f32 (&Sfft, fft_buffer, 0, 1);
+#else
+        // For T3.x go back to old (deprecated) style
         arm_cfft_radix4_f32(&fft_inst, fft_buffer);
+#endif
 
+        count++;      // Next do non-coherent averaging
         for (int i=0; i < 512; i++) {
             float magsq = fft_buffer[2*i]*fft_buffer[2*i] + fft_buffer[2*i+1]*fft_buffer[2*i+1];
-            if(outputType==FFT_RMS)
-               output[i] = sqrtf(magsq);
-            else if(outputType==FFT_POWER)
-               output[i] = magsq;
-            else if(outputType==FFT_DBFS)
-               output[i] = 10.0f*log10f(magsq)-54.1854f;  // Scaled to FS sine wave
-            else
-               output[i] = 0.0f;
-        }
-        outputflag = true;
-        release(blocklist[0]);
-        release(blocklist[1]);
-        release(blocklist[2]);
-        release(blocklist[3]);
+            if(count==1)               // Starting new average
+               sumsq[i] = magsq;
+            else if (count<=nAverage)  // Adding on to average
+               sumsq[i] += magsq;
+            }
+        if (count >= nAverage) {       // Average is finished
+            count = 0;
+            float inAf = 1.0f/(float)nAverage;
+            for(int i=0; i<512; i++)  {
+               if(outputType==FFT_RMS)
+                  output[i] = sqrtf(inAf*sumsq[i]);
+               else if(outputType==FFT_POWER)
+                  output[i] = inAf*sumsq[i];
+               else if(outputType==FFT_DBFS)  {
+                  if(sumsq[i]>0.0f)
+                     output[i] = 10.0f*log10f(inAf*sumsq[i])-54.1854f;  // Scaled to FS sine wave
+                  else
+                     output[i] = -193.0f;   // lsb for 23 bit mantissa
+                  }
+               else
+                  output[i] = 0.0f;
+               }    // End, set output[i] over all 512
+
+            outputflag = true;
+            }    // End of average is finished
+
+        AudioStream_F32::release(blocklist[0]);
+        AudioStream_F32::release(blocklist[1]);
+        AudioStream_F32::release(blocklist[2]);
+        AudioStream_F32::release(blocklist[3]);
+
         blocklist[0] = blocklist[4];
         blocklist[1] = blocklist[5];
         blocklist[2] = blocklist[6];
         blocklist[3] = blocklist[7];
         state = 4;
         break;
-    }
-#else
-    release(block);
-#endif
-}
+     }     // End switch(state)
+     // Serial.print("uSec: "); Serial.println(micros()-tt);
+  }        // End update()
diff --git a/analyze_fft1024_F32.h b/analyze_fft1024_F32.h
index 1d0ae20..728e80b 100644
--- a/analyze_fft1024_F32.h
+++ b/analyze_fft1024_F32.h
@@ -32,7 +32,7 @@
  * Power or dBFS (full scale).  Output can be bin by bin or a pointer to
  * the output array is available.  Several window functions are provided by
  * in-class design, or a custom window can be provided from the INO.
- * 
+ *
  * Functions (See comments below and #defines above:
  *   bool available()
  *   float read(unsigned int binNumber)
@@ -43,14 +43,15 @@
  *   float* getWindow(void)
  *   void putWindow(float *pwin)
  *   void setOutputType(int _type)
- * 
+ *   void setNAverage(int nAverage)
+ *
  * Timing, max is longest update() time:
- *   T3.6 Windowed, RMS out,  1016 uSec max
- *   T3.6 Windowed, Power Out, 975 uSec max
- *   T3.6 Windowed, dBFS out, 1591 uSec max
+ *   T3.6 Windowed, Power Out, 975 uSec
+ *   T3.6 Windowed, RMS out,  1016 uSec
+ *   T3.6 Windowed, dBFS out, 1591 uSec
  *   No Window saves 60 uSec on T3.6 for any output.
- *   T4.0 Windowed, RMS Out,   149 uSec
- * 
+ *   T4.0 Windowed, Ave=1, Power Out, 156 uSec
+ *   T4.0 Windowed, Ave=1, dBFS Out,  302 uSec
  * Scaling:
  *   Full scale for floating point DSP is a nebulous concept.  Normally the
  *   full scale is -1.0 to +1.0.  This is an unscaled FFT and for a sine
@@ -62,7 +63,7 @@
  *   no matter how it is scaled, but this factor needs to be considered
  *   when building the INO.
  */
-  // Fixed float/int problem in read(first, last).  RSL 3 Mar 21
+// Fixed float/int problem in read(first, last).  RSL 3 Mar 21
 
 #ifndef analyze_fft1024_F32_h_
 #define analyze_fft1024_F32_h_
@@ -71,6 +72,9 @@
 #include "AudioStream_F32.h"
 #include "arm_math.h"
 #include "mathDSP_F32.h"
+#if defined(__IMXRT1062__)
+#include "arm_const_structs.h"
+#endif
 
 #define FFT_RMS 0
 #define FFT_POWER 1
@@ -84,25 +88,37 @@
 
 class AudioAnalyzeFFT1024_F32 : public AudioStream_F32  {
 //GUI: inputs:1, outputs:0  //this line used for automatic generation of GUI node
-//GUI: shortName:AnalyzeFFT1024
+//GUI: shortName:FFT1024
 public:
     AudioAnalyzeFFT1024_F32() : AudioStream_F32(1, inputQueueArray) {
-        arm_cfft_radix4_init_f32(&fft_inst, 1024, 0, 1);
-        useHanningWindow();  // Revisit this for more flexibility  <<<<<
+        // __MK20DX128__ T_LC;  __MKL26Z64__ T3.0;  __MK20DX256__T3.1 and T3.2
+        // __MK64FX512__) T3.5; __MK66FX1M0__ T3.6; __IMXRT1062__ T4.0 and T4.1
+#if defined(__IMXRT1062__)
+        // Teensy4 core library has the right files for new FFT
+        // arm CMSIS library has predefined structures of type arm_cfft_instance_f32
+        Sfft = arm_cfft_sR_f32_len1024;   // This is one of the structures
+#else
+        arm_cfft_radix4_init_f32(&fft_inst, 1024, 0, 1); // for T3.x
+#endif
+        // This is always 128 block size.  Any sample rate.  No use of "setings"
+        // Bob:  Revisit this to use CMSIS fast fft in place of cfft.  faster?
+        useHanningWindow();
     }
 
+    // Inform that the output is available for read()
     bool available() {
         if (outputflag == true) {
             outputflag = false;
             return true;
-        }
+            }
         return false;
-    }
+        }
 
+    // Output data from a single bin is transferred
     float read(unsigned int binNumber) {
         if (binNumber>511 || binNumber<0) return 0.0;
         return output[binNumber];
-    }
+        }
 
     // Return sum of several bins. Normally use with power output.
     // This produces the equivalent of bigger bins.
@@ -147,8 +163,8 @@ public:
      return 0;
      }
 
-    // Fast pointer transfer.  Be aware that the data will go away 
-    // after the next 512 data points occur.  
+    // Fast pointer transfer.  Be aware that the data will go away
+    // after the next 512 data points occur.
     float* getData(void)  {
        return output;
        }
@@ -164,16 +180,22 @@ public:
        for(int i=0; i<1024; i++)
           *p++ = *pwin++;
        }
-  
+
     // Output RMS (default) Power or dBFS
     void setOutputType(int _type)  {
        outputType = _type;
        }
 
+    // Output power (non-coherent) averaging
+    void setNAverage(int _nAverage)  {
+       nAverage = _nAverage;
+       }
+
     virtual void update(void);
- 
-private:    
+
+private:
     float output[512];
+    float sumsq[512];
     float window[1024];
     float *pWin = window;
     audio_block_f32_t *blocklist[8];
@@ -181,8 +203,16 @@ private:
     uint8_t state = 0;
     bool outputflag = false;
     audio_block_f32_t *inputQueueArray[1];
-    arm_cfft_radix4_instance_f32 fft_inst;
+#if defined(__IMXRT1062__)
+  // For T4.x
+  // const static arm_cfft_instance_f32   arm_cfft_sR_f32_len1024;
+  arm_cfft_instance_f32 Sfft;
+#else
+  arm_cfft_radix4_instance_f32 fft_inst;
+#endif
     int outputType = FFT_RMS;  //Same type as I16 version init
+    int nAverage = 1;
+    int count = 0;     // used to average for nAverage of powers
 
     // The Hann window is a good all-around window
     void useHanningWindow(void) {
@@ -199,8 +229,8 @@ private:
            float kx = 0.006141921;  // 2*PI/1023
            int ix = (float) i;
            window[i] = 0.35875 -
-                       0.48829*cosf(     kx*ix) + 
-                       0.14128*cosf(2.0f*kx*ix) - 
+                       0.48829*cosf(     kx*ix) +
+                       0.14128*cosf(2.0f*kx*ix) -
                        0.01168*cosf(3.0f*kx*ix);
         }
     }
@@ -228,8 +258,7 @@ private:
           xn2 = 0.00000382215877f*xn2*xn2;
           window[511 - n]=kbes*(mathEqualizer.i0f(beta*sqrtf(1.0-xn2)));
           window[512 + n] = window[511 - n];
+          }
        }
-    }
-
-};
+    };
 #endif
diff --git a/examples/TestFFT1024/TestFFT1024.ino b/examples/TestFFT1024/TestFFT1024.ino
index ded5ab9..52e52be 100644
--- a/examples/TestFFT1024/TestFFT1024.ino
+++ b/examples/TestFFT1024/TestFFT1024.ino
@@ -5,10 +5,11 @@
 // on audio connected to the Left Line-In pin.  By changing code,
 // a synthetic sine wave can be input instead.
 //
-// The first 40 (of 512) frequency analysis bins are printed to
-// the Arduino Serial Monitor.
+// The power output from 512 frequency analysis bins are printed to
+// the Arduino Serial Monitor.  The format is selectable.
+// Output power averaging is an option
 //
-// T4.0: Uses 6.1% processor and 9 F32 memory blocks, both max.
+// T4.0: Uses 11.5% processor and 9 F32 memory blocks, both max.
 //
 // This example code is in the public domain.
 
@@ -25,13 +26,12 @@ const int myInput = AUDIO_INPUT_LINEIN;
 AudioSynthSineCosine_F32   sinewave;
 AudioAnalyzeFFT1024_F32    myFFT;
 AudioOutputI2S_F32         audioOutput; // audio shield: headphones & line-out NU
-
 // Connect either the live input or synthesized sine wave
 // AudioConnection_F32        patchCord1(audioInput, 0, myFFT, 0);
 AudioConnection_F32        patchCord1(sinewave, 0, myFFT, 0);
+AudioControlSGTL5000       audioShield;
 
-AudioControlSGTL5000 audioShield;
-
+uint32_t ct = 0;
 uint32_t count = 0;
 
 void setup() {
@@ -45,50 +45,52 @@ void setup() {
   audioShield.inputSelect(myInput);
   audioShield.volume(0.5);
 
+  // Create a synthetic sine wave, for testing
+  // To use this, edit the connections above
+  // sinewave.frequency(1033.99f);   // Bin 24  T3.x
+  // sinewave.frequency(1033.59375f);   // Bin 24  T4.x at 44100
+  // sinewave.frequency(1055.0f);  // Bin 24.5, demonstrates windowing
+  sinewave.frequency(1076.0f);
+
+  sinewave.amplitude(1.0f);
+
   // Set windowing function
-  // myFFT.windowFunction(NULL);
-  // myFFT.windowFunction(AudioWindowNone);  // Same as NULL
+  // myFFT.windowFunction(AudioWindowNone);
   // myFFT.windowFunction(AudioWindowHanning1024);  // default
   // The next Kaiser window needs a dB peak sidelobe number
-  myFFT.windowFunction(AudioWindowKaiser1024, 70.0f);
-  // myFFT.windowFunction(AudioWindowBlackmanHarris1024);
+  // myFFT.windowFunction(AudioWindowKaiser1024, 70.0f);
+  myFFT.windowFunction(AudioWindowBlackmanHarris1024);
 
   // To print the window function:
   // float* pw=myFFT.getWindow();
   // for(int jj=0; jj<1024; jj++)
   //    Serial.println(*pw++, 6);
 
-  // Create a synthetic sine wave, for testing
-  // To use this, edit the connections above
-  sinewave.frequency(1034.0);   // Bin 24
-  // sinewave.frequency(1055.0f);  // Bin 24.5, demonstrates windowing
+  myFFT.setNAverage(1);
 
-  myFFT.setOutputType(FFT_DBFS);  // FFT_RMS or FFT_POWER or FFT_DBFS
+  myFFT.setOutputType(FFT_DBFS);   // FFT_RMS or FFT_POWER or FFT_DBFS
 }
 
 void loop() {
-  if (myFFT.available()) {
+  if (myFFT.available() /*&& ++ct == 4*/ ) {
     // each time new FFT data is available
     // print it all to the Arduino Serial Monitor
-    Serial.print("FFT: ");
-    for (int i=0; i<40; i++) {
-        Serial.print(myFFT.read(i), 2);
+    Serial.println("FFT Output: ");
+    for (int i=0; i<512; i++) {
+        Serial.print(i);
         Serial.print(",");
-    }
+        Serial.println(myFFT.read(i), 3);
+        }
     Serial.println();
-  }
-  
-/* if(count++ == 3000) {
-     Serial.print("CPU: Percent Usage, Max: ");
-     Serial.print(AudioProcessorUsage());
-     Serial.print(", ");
+    }
+
+  /*
+  if(count++<200) {
+     Serial.print("CPU: Max Percent Usage: ");
      Serial.println(AudioProcessorUsageMax());
-     Serial.print("   Float 32 Memory: ");
-     Serial.print(AudioMemoryUsage_F32());
-     Serial.print(", ");
+     Serial.print("   Max Float 32 Memory: ");
      Serial.println(AudioMemoryUsageMax_F32());
      }
-  delay(2);
- */
-}
-
+   */
+  delay(500);
+  }