diff --git a/src/Makefile b/src/Makefile
index b06b977..2aed327 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -10,7 +10,7 @@ OBJS = main.o kernel.o minidexed.o config.o userinterface.o uimenu.o \
        mididevice.o midikeyboard.o serialmididevice.o pckeyboard.o \
        sysexfileloader.o performanceconfig.o perftimer.o \
        effect_platervbstereo.o uibuttons.o midipin.o \
-       arm_float_to_q23.o \
+       arm_float_to_q23.o arm_scale_zip_f32.o \
        net/ftpdaemon.o net/ftpworker.o net/applemidi.o net/udpmidi.o net/mdnspublisher.o udpmididevice.o
 
 OPTIMIZE = -O3
diff --git a/src/arm_scale_zip_f32.c b/src/arm_scale_zip_f32.c
new file mode 100644
index 0000000..28ff1c7
--- /dev/null
+++ b/src/arm_scale_zip_f32.c
@@ -0,0 +1,85 @@
+#include "arm_scale_zip_f32.h"
+
+/**
+  Scale two vectors and zip after.  For floating-point data, the algorithm used is:
+
+  <pre>
+      pDst[2*n] = pSrc1[n] * scale, pDst[2*n+1] = pSrc2[n] * scale,   0 <= n < blockSize.
+  </pre>
+
+ */
+
+/**
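+* @brief Scale two floating-point vectors with a scalar and interleave (zip) the results.
+* @param[in]  pSrc1      points to the input vector 1 (written to the even output slots)
+* @param[in]  pSrc2      points to the input vector 2 (written to the odd output slots)
+* @param[in]  scale      scalar both input vectors are multiplied by
+* @param[out] pDst       points to the output vector, which receives 2 * blockSize samples
+* @param[in]  blockSize  number of samples in each input vector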
+*/
+
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+void arm_scale_zip_f32(
+  const float32_t * pSrc1,
+  const float32_t * pSrc2,
+        float32_t scale,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    /*
+     * NEON variant: multiplies both inputs by `scale` and writes them
+     * interleaved, i.e. pDst[2*n] = pSrc1[n] * scale and
+     * pDst[2*n+1] = pSrc2[n] * scale for 0 <= n < blockSize,
+     * so pDst receives 2 * blockSize samples.
+     */
+    const uint32_t numPairs = blockSize >> 1U;     /* Passes handling two samples per input */
+    const uint32_t numTail = blockSize & 1;        /* 1 when blockSize is odd, otherwise 0 */
+    uint32_t i;                                    /* Loop counter */
+
+    /* Vector part: consume 2 samples from each input, produce 4 outputs */
+    for (i = 0U; i < numPairs; i++)
+    {
+        f32x2x2_t scaled;
+
+        scaled.val[0] = vmul_n_f32(vld1_f32(pSrc1), scale);
+        scaled.val[1] = vmul_n_f32(vld1_f32(pSrc2), scale);
+
+        /* vst2 interleaves the two vectors on store, giving the order
+        ** val[0][0], val[1][0], val[0][1], val[1][1] */
+        vst2_f32(pDst, scaled);
+
+        /* Advance past the consumed inputs and the written outputs */
+        pSrc1 += 2;
+        pSrc2 += 2;
+        pDst += 4;
+    }
+
+    /* Scalar part: the single leftover sample pair when blockSize is odd */
+    for (i = 0U; i < numTail; i++)
+    {
+        *pDst++ = *pSrc1++ * scale;
+        *pDst++ = *pSrc2++ * scale;
+    }
+}
+#else
+void arm_scale_zip_f32(
+  const float32_t * pSrc1,
+  const float32_t * pSrc2,
+        float32_t scale,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+  /*
+   * Portable variant: for each n in [0, blockSize) writes
+   * pDst[2*n] = pSrc1[n] * scale and pDst[2*n+1] = pSrc2[n] * scale,
+   * so pDst receives 2 * blockSize samples.
+   */
+  uint32_t n;
+
+  for (n = 0U; n < blockSize; n++)
+  {
+      *pDst++ = *pSrc1++ * scale;   /* even slot: sample from vector 1 */
+      *pDst++ = *pSrc2++ * scale;   /* odd slot: sample from vector 2 */
+  }
+}
+#endif
diff --git a/src/arm_scale_zip_f32.h b/src/arm_scale_zip_f32.h
new file mode 100644
index 0000000..6629b22
--- /dev/null
+++ b/src/arm_scale_zip_f32.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "arm_math_types.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+* @brief Scale two floating-point vectors with a scalar and interleave (zip) the results.
+* @param[in]  pSrc1      points to the input vector 1 (written to the even output slots)
+* @param[in]  pSrc2      points to the input vector 2 (written to the odd output slots)
+* @param[in]  scale      scalar both input vectors are multiplied by
+* @param[out] pDst       points to the output vector, which receives 2 * blockSize samples
+* @param[in]  blockSize  number of samples in each input vector
+*/
+void arm_scale_zip_f32(const float32_t * pSrc1, const float32_t * pSrc2, float32_t scale, float32_t * pDst, uint32_t blockSize);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/minidexed.cpp b/src/minidexed.cpp
index acd79a2..257ef55 100644
--- a/src/minidexed.cpp
+++ b/src/minidexed.cpp
@@ -30,6 +30,7 @@
 #include <stdio.h>
 #include <assert.h>
 #include "arm_float_to_q23.h"
+#include "arm_scale_zip_f32.h"
 
 const char WLANFirmwarePath[] = "SD:firmware/";
 const char WLANConfigFile[]   = "SD:wpa_supplicant.conf";
@@ -1441,11 +1442,7 @@ void CMiniDexed::ProcessSound (void)
 			}
 
 			// Convert dual float array (left, right) to single int16 array (left/right)
-			for(uint16_t i=0; i<nFrames;i++)
-			{
-				tmp_float[i*2]=SampleBuffer[indexL][i] * nMasterVolume;
-				tmp_float[(i*2)+1]=SampleBuffer[indexR][i] * nMasterVolume;
-			}
+			arm_scale_zip_f32(SampleBuffer[indexL], SampleBuffer[indexR], nMasterVolume, tmp_float, nFrames);
 
 			arm_float_to_q23(tmp_float,tmp_int,nFrames*2);