diff --git a/analyze_fft256_iq_F32.cpp b/analyze_fft256_iq_F32.cpp
index bac0ff3..526fdbb 100644
--- a/analyze_fft256_iq_F32.cpp
+++ b/analyze_fft256_iq_F32.cpp
@@ -39,15 +39,13 @@
 #include "analyze_fft256_iq_F32.h"
 
 // Move audio data from audio_block_f32_t to the interleaved FFT instance buffer.
-static void copy_to_fft_buffer1(void *destination, const void *sourceI, const void *sourceQ)  {
+static void copy_to_fft_buffer0(void *destination, const void *sourceI, const void *sourceQ)  {
     const float *srcI = (const float *)sourceI;
     const float *srcQ = (const float *)sourceQ;
     float *dst = (float *)destination;
     for (int i=0; i < AUDIO_BLOCK_SAMPLES; i++) {
        *dst++ = *srcI++;     // real sample, interleave
-       //*dst++ = 0.0f;
        *dst++ = *srcQ++;     // imag
-       //*dst++ = 0.0f; 
        }
     }
 
@@ -55,13 +53,14 @@ static void apply_window_to_fft_buffer1(void *fft_buffer, const void *window) {
     float *buf = (float *)fft_buffer;      // 0th entry is real (do window) 1th is imag
     const float *win = (float *)window;
     for (int i=0; i < 256; i++)  {
-       buf[2*i] *= *win++;      // real
+       buf[2*i] *= *win;      // real
        buf[2*i + 1] *= *win++;  // imag
        }
     }
 
 void AudioAnalyzeFFT256_IQ_F32::update(void)  {
   audio_block_f32_t *block_i,*block_q;
+  int ii;
 
   block_i = receiveReadOnly_f32(0);
   if (!block_i) return;
@@ -78,40 +77,67 @@ void AudioAnalyzeFFT256_IQ_F32::update(void)  {
      prevblock_q = block_q;
      return;  // Nothing to release
      }
+
   // FFT is 256 and blocks are 128, so we need 2 blocks.  We still do
   // this every 128 samples to get 50% overlap on FFT data to roughly
   // compensate for windowing.
   //                (   dest,            i-source,            q-source   )
-  copy_to_fft_buffer1(fft_buffer,     prevblock_i->data, prevblock_q->data);
-  copy_to_fft_buffer1(fft_buffer+256, block_i->data,     block_q->data);
+  copy_to_fft_buffer0(fft_buffer,     prevblock_i->data, prevblock_q->data);
+  copy_to_fft_buffer0(fft_buffer+256, block_i->data,     block_q->data);
   if (pWin)
     apply_window_to_fft_buffer1(fft_buffer, window);
-  arm_cfft_radix4_f32(&fft_inst, fft_buffer);  // Finally the FFT
+
+#if defined(__IMXRT1062__)
+  // Teensyduino core for T4.x supports arm_cfft_f32
+  // arm_cfft_f32 (const arm_cfft_instance_f32 *S, float32_t *p1, uint8_t ifftFlag, uint8_t bitReverseFlag)
+  arm_cfft_f32(&Sfft, fft_buffer, 0, 1);
+#else
+  // For T3.x go back to old (deprecated) style
+  arm_cfft_radix4_f32(&fft_inst, fft_buffer);
+#endif
 
   count++;
-  for (int i=0; i < 256; i++) {
-     float ss = fft_buffer[2*i]*fft_buffer[2*i] + fft_buffer[2*i+1]*fft_buffer[2*i+1];
-     if(count==1)        // Starting new average
-        sumsq[i] = ss;
-     else if (count <= nAverage)  // Adding on to average
-        sumsq[i] += ss;
+  for (int i = 0; i < 128; i++)   {
+     // From complex FFT the "negative frequencies" are mirrors of the frequencies above fs/2.  So, we get
+     // frequencies from 0 to fs by re-arranging the coefficients. These are powers (not Volts)
+     // See DD4WH SDR  (Note - here and at "if(xAxis & xxxx)" below, we may have redundancy in index changing.
+     // Leave as is for now.)
+     float ss0 = fft_buffer[2 * i] *     fft_buffer[2 * i] +
+                 fft_buffer[2 * i + 1] * fft_buffer[2 * i + 1];
+     float ss1 = fft_buffer[2 * (i + 128)] *     fft_buffer[2 * (i + 128)] +
+                 fft_buffer[2 * (i + 128) + 1] * fft_buffer[2 * (i + 128) + 1];
+
+     if(count==1) {       // Starting new average
+        sumsq[i+128] = ss0;
+        sumsq[i] = ss1;
+        }
+    else if (count <= nAverage) { // Adding on to average
+        sumsq[i+128] += ss0;
+        sumsq[i] += ss1;
+        }
      }
 
   if (count >= nAverage) {    // Average is finished
-    count = 0;
-    float inAf = 1.0f/(float)nAverage;
-    for (int i=0; i < 256; i++) {
-       int ii = 255 - (i ^ 128);
-       if(outputType==FFT_RMS)
-          output[ii] = sqrtf(inAf*sumsq[ii]);
-       else if(outputType==FFT_POWER)
-          output[ii] = inAf*sumsq[ii];
-       else if(outputType==FFT_DBFS)
-          output[ii] = 10.0f*log10f(inAf*sumsq[ii])-42.1442f;  // Scaled to FS sine wave
-       else
-          output[ii] = 0.0f;
-       }
+     count = 0;
+     float inAf = 1.0f/(float)nAverage;
+     for (int i=0; i < 256; i++) {
+         // xAxis, bit 0 left/right;  bit 1 low to high
+         if(xAxis & 0X02)
+            ii = i;
+         else
+            ii = i^128;
+         if(xAxis & 0X01)
+            ii = (255 - ii);
+    if(outputType==FFT_RMS)
+       output[i] = sqrtf(inAf*sumsq[ii]);
+    else if(outputType==FFT_POWER)
+       output[i] = inAf*sumsq[ii];
+    else if(outputType==FFT_DBFS)
+       output[i] = 10.0f*log10f(inAf*sumsq[ii])-42.1442f;  // Scaled to FS sine wave
+    else
+       output[i] = 0.0f;
     }
+  }
   outputflag = true;
   release(prevblock_i);    // Release the 2 blocks that were block_i
   release(prevblock_q);    // and block_q on last time through update()
diff --git a/analyze_fft256_iq_F32.h b/analyze_fft256_iq_F32.h
index 102dd86..47986e4 100644
--- a/analyze_fft256_iq_F32.h
+++ b/analyze_fft256_iq_F32.h
@@ -1,15 +1,27 @@
-/*    analyze_fft256_iq_F32.h
+/*    analyze_fft256_iq_F32.h  Assembled by Bob Larkin   6 Mar 2021
  *
- * Converted to F32 floating point input and also extended
- * for complex I and Q inputs
- *   * Adapted all I/O to be F32 floating point for OpenAudio_ArduinoLibrary
- *   * Future: Add outputs for I & Q FFT x2 for overlapped FFT
+ * Rev 6 Mar 2021 - Added setXAxis()
+ * Rev 7 Mar 2021 - Corrected bug in applying windowing 
+ * 
+ * Does Fast Fourier Transform of a 256 point complex (I-Q) input.
+ * Output is one of three measures of the power in each of the 256
+ * output bins, Power, RMS level or dB relative to a full scale
+ * sine wave.  Windowing of the input data is provided for to reduce
+ * spreading of the power in the output bins.  All inputs are Teensy
+ * floating point extension (_F32) and all outputs are floating point.
+ *
+ * Features include:
+ *   * I and Q inputs are OpenAudio_Arduino Library F32 compatible.
+ *   * FFT output for every 128 inputs to overlapped FFTs to
+ *     compensate for windowing.
  *   * Windowing None, Hann, Kaiser and Blackman-Harris.
+ *   * Multiple bin-sum output to simulate wider bins.
+ *   * Power averaging of multiple FFT
+ *   * Soon: F32 audio outputs for I & Q
  *
  * Conversion Copyright (c) 2021 Bob Larkin
  * Same MIT license as PJRC:
  *
- *
  *  Audio Library for Teensy 3.X
  * Copyright (c) 2014, Paul Stoffregen, paul@pjrc.com
  *
@@ -36,8 +48,8 @@
  * THE SOFTWARE.
  */
 
-/* Does complex input FFT of 1024 points. Output is not audio, and is magnitude
- * only.  Multiple output formats of RMS (same as I16 version, and default),
+/* Does complex input FFT of 256 points.  Multiple non-audio (via functions)
+ * output formats of RMS (same as I16 version, and default),
  * Power or dBFS (full scale).  Output can be bin by bin or a pointer to
  * the output array is available.  Several window functions are provided by
  * in-class design, or a custom window can be provided from the INO.
@@ -51,7 +63,17 @@
  *   float* getData(void)
  *   float* getWindow(void)
  *   void putWindow(float *pwin)
+ *   void setNAverage(int NAve)   // >=1
  *   void setOutputType(int _type)
+ *   void setXAxis(uint8_t _xAxis)  // 0, 1, 2, 3
+ *
+ * x-Axis direction and offset per setXAxis(xAxis) for sine to I
+ * and cosine to Q.
+ *   If xAxis=0  f=fs/2 in middle, f=0 on right edge
+ *   If xAxis=1  f=fs/2 in middle, f=0 on left edge
+ *   If xAxis=2  f=fs/2 on left edge, f=0 in middle
+ *   If xAxis=3  f=fs/2 on right edgr, f=0 in middle
+ * If there is 180 degree phase shift to I or Q these all get reversed.
  *
  * Timing, max is longest update() time:
  *   T3.6 Windowed, RMS out,  - uSec max
@@ -75,13 +97,13 @@
 #ifndef analyze_fft256iq_h_
 #define analyze_fft256iq_h_
 
-//#include "AudioStream.h"
-//#include "arm_math.h"
-
 #include "Arduino.h"
 #include "AudioStream_F32.h"
 #include "arm_math.h"
 #include "mathDSP_F32.h"
+#if defined(__IMXRT1062__)
+#include "arm_const_structs.h"
+#endif
 
 #define FFT_RMS 0
 #define FFT_POWER 1
@@ -97,10 +119,21 @@ class AudioAnalyzeFFT256_IQ_F32 : public AudioStream_F32  {
 //GUI: inputs:2, outputs:4  //this line used for automatic generation of GUI node
 //GUI: shortName:AnalyzeFFT256IQ
 public:
-    AudioAnalyzeFFT256_IQ_F32() : AudioStream_F32(2, inputQueueArray) {   // NEEDS SETTINGS etc  <<<<<<<<
-        arm_cfft_radix4_init_f32(&fft_inst, 256, 0, 1);
+    AudioAnalyzeFFT256_IQ_F32() : AudioStream_F32(2, inputQueueArray) {
+        // __MK20DX128__ T_LC;  __MKL26Z64__ T3.0;  __MK20DX256__T3.1 and T3.2
+        // __MK64FX512__) T3.5; __MK66FX1M0__ T3.6; __IMXRT1062__ T4.0 and T4.1
+#if defined(__IMXRT1062__)
+        // Teensy4 core library has the right files for new FFT
+        // arm CMSIS library has predefined structures of type arm_cfft_instance_f32
+        Sfft = arm_cfft_sR_f32_len256;   // This is one of the structures
+#else
+        arm_cfft_radix4_init_f32(&fft_inst, 256, 0, 1); // for T3.x
+#endif
         useHanningWindow();
     }
+    // There is no varient for "settings," as blocks other than 128 are
+    // not supported and, nothing depends on sample rate so we don't need that.
+
 
     bool available() {
         if (outputflag == true) {
@@ -181,7 +214,18 @@ public:
        outputType = _type;
        }
 
-  virtual void update(void);
+    // Output power (non-coherent) averaging
+    // i.e., the number of FFT powers averaged in the output
+    void setNAverage(int _nAverage)  {
+       nAverage = _nAverage;
+       }
+
+    // xAxis, bit 0 left/right;  bit 1 low to high;  default 0X03
+    void setXAxis(uint8_t _xAxis)  {
+       xAxis = _xAxis;
+       }
+
+    virtual void update(void);
 
 private:
   float output[256];
@@ -193,10 +237,17 @@ private:
   bool outputflag = false;
   audio_block_f32_t *inputQueueArray[2];
   audio_block_f32_t *prevblock_i,*prevblock_q;
+#if defined(__IMXRT1062__)
+  // For T4.x
+  // const static arm_cfft_instance_f32   arm_cfft_sR_f32_len256;
+  arm_cfft_instance_f32 Sfft;
+#else
   arm_cfft_radix4_instance_f32 fft_inst;
+#endif
   int outputType = FFT_RMS;  //Same type as I16 version init
   int count = 0;
   int nAverage = 1;
+  uint8_t xAxis = 3;
 
     // The Hann window is a good all-around window
     void useHanningWindow(void) {
@@ -238,8 +289,6 @@ private:
        kbes = 1.0f / mathEqualizer.i0f(beta);      // An additional derived parameter used in loop
        for (int n=0; n<128; n++) {
           xn2 = 0.5f+(float32_t)n;
-          // 4/(1023^2)=0.00000382215877f
-          // xn2 = 0.00000382215877f*xn2*xn2;
           // 4/(255^2)=0.000061514802f
           xn2 = 0.000061514802f*xn2*xn2;
           window[127 - n]=kbes*(mathEqualizer.i0f(beta*sqrtf(1.0-xn2)));
diff --git a/examples/TestFFT256iq/TestFFT256iq.ino b/examples/TestFFT256iq/TestFFT256iq.ino
index a7ffd83..2454014 100644
--- a/examples/TestFFT256iq/TestFFT256iq.ino
+++ b/examples/TestFFT256iq/TestFFT256iq.ino
@@ -10,12 +10,11 @@
 #include <SerialFlash.h>
 
 // GUItool: begin automatically generated code
-AudioSynthSineCosine_F32 sine_cos1;      //xy=76,532
+AudioSynthSineCosine_F32  sine_cos1;      //xy=76,532
 AudioAnalyzeFFT256_IQ_F32 FFT256iq1;      //xy=243,532
-AudioOutputI2S_F32       audioOutI2S1;   //xy=246,591
-AudioConnection_F32      patchCord1(sine_cos1, 0, FFT256iq1, 0);
-AudioConnection_F32      patchCord2(sine_cos1, 1, FFT256iq1, 1);
-//AudioControlSGTL5000     sgtl5000_1;
+AudioOutputI2S_F32        audioOutI2S1;   //xy=246,591
+AudioConnection_F32       patchCord1(sine_cos1, 0, FFT256iq1, 0);
+AudioConnection_F32       patchCord2(sine_cos1, 1, FFT256iq1, 1);
 // GUItool: end automatically generated code
 
 void setup(void) {
@@ -24,27 +23,55 @@ void setup(void) {
   Serial.begin(9600);
   delay(1000);
   AudioMemory_F32(20);
-  Serial.println("FFT256IQ Test");
-//  sgtl5000_1.enable();                   //start the audio board
- // sgtl5000_1.inputSelect(AUDIO_INPUT_LINEIN);       // or AUDIO_INPUT_MIC
+  Serial.println("FFT256IQ Test   v2");
 
-  sine_cos1.amplitude(0.5);         // Initialize Waveform Generator
+  sine_cos1.amplitude(1.0);         // Initialize Waveform Generator
   // bin spacing = 44117.648/256 = 172.335   172.3 * 4 = 689.335 Hz (T3.6)
   // Half bin higher is 775.3 for testing windows
   //sine_cos1.frequency(689.34f);
-  sine_cos1.frequency(1723.35f);
 
+  // Pick T3.6 bin center
+  //sine_cos1.frequency(689.33);
+
+  // or pick T4.x bin center
+  //sine_cos1.frequency(689.0625f);
+
+  // or pick any old frequency
+  sine_cos1.frequency(7100.0);
+
+  // elect the output format
   FFT256iq1.setOutputType(FFT_DBFS);
+
+  // Select the wndow function
+  //FFT256iq1.windowFunction(AudioWindowNone);
+  //FFT256iq1.windowFunction(AudioWindowHanning256);
+  //FFT256iq1.windowFunction(AudioWindowKaiser256, 55.0f);
+  FFT256iq1.windowFunction(AudioWindowBlackmanHarris256);
+
+  // Uncomment to Serial print window function
+  //float* pw = FFT256iq1.getWindow();   // Print window
+  //for (int i=0; i<512; i++) Serial.println(pw[i], 4);
+
+  // xAxis, bit 0 left/right;  bit 1 low to high;  default 0X03
+  FFT256iq1.setXAxis(0X03);
+
   FFT256iq1.windowFunction(AudioWindowBlackmanHarris256);
   //float* pw = FFT256iq1.getWindow();   // Print window
   //for (int i=0; i<256; i++) Serial.println(pw[i], 4);
 
-  delay(1000);
-  if( FFT256iq1.available() )
-      pPwr = FFT256iq1.getData();
+  // Do power averaging (outputs appear less often, as well)
+  FFT256iq1.setNAverage(5);  // nAverage >= 1
 
-  for(int i=0; i<256; i++)
-     Serial.println(*(pPwr + i), 8 );
+  delay(1000);
+  if( FFT256iq1.available() )  {
+     pPwr = FFT256iq1.getData();
+     for(int i=0; i<256; i++) {
+         Serial.print(i);
+         Serial.print(",");
+         Serial.println(*(pPwr + i), 8 );
+         }
+      Serial.print("\n\n");
+      }
   }
 
 void loop(void)  {