From c80da12e63b8281d7e1a241f86d8e3cf93d113be Mon Sep 17 00:00:00 2001
From: Gergo Koteles <soyer@irl.hu>
Date: Wed, 9 Jul 2025 18:22:58 +0200
Subject: [PATCH 1/7] reverb: do not use the reverb mixer if reverb is disabled

---
 src/minidexed.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/src/minidexed.cpp b/src/minidexed.cpp
index 51e6444..0e1c283 100644
--- a/src/minidexed.cpp
+++ b/src/minidexed.cpp
@@ -1409,7 +1409,6 @@ void CMiniDexed::ProcessSound (void)
 				for (uint8_t i = 0; i < m_nToneGenerators; i++)
 				{
 					tg_mixer->doAddMix(i,m_OutputLevel[i]);
-					reverb_send_mixer->doAddMix(i,m_OutputLevel[i]);
 				}
 				// END TG mixing
 
@@ -1425,6 +1424,11 @@ void CMiniDexed::ProcessSound (void)
 				{
 					float32_t ReverbBuffer[2][nFrames];
 					float32_t ReverbSendBuffer[2][nFrames];
+					
+					for (uint8_t i = 0; i < m_nToneGenerators; i++)
+					{
+						reverb_send_mixer->doAddMix(i,m_OutputLevel[i]);
+					}
 
 					arm_fill_f32(0.0f, ReverbBuffer[indexL], nFrames);
 					arm_fill_f32(0.0f, ReverbBuffer[indexR], nFrames);

From 2e90cb459ef180fe054d6b629616a8f6a55616b7 Mon Sep 17 00:00:00 2001
From: Gergo Koteles <soyer@irl.hu>
Date: Wed, 9 Jul 2025 18:24:59 +0200
Subject: [PATCH 2/7] reverb: do not fill the reverb buffers unnecessarily

they will be overwritten later
---
 src/minidexed.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/minidexed.cpp b/src/minidexed.cpp
index 0e1c283..1b5b4f6 100644
--- a/src/minidexed.cpp
+++ b/src/minidexed.cpp
@@ -1430,11 +1430,6 @@ void CMiniDexed::ProcessSound (void)
 						reverb_send_mixer->doAddMix(i,m_OutputLevel[i]);
 					}
 
-					arm_fill_f32(0.0f, ReverbBuffer[indexL], nFrames);
-					arm_fill_f32(0.0f, ReverbBuffer[indexR], nFrames);
-					arm_fill_f32(0.0f, ReverbSendBuffer[indexR], nFrames);
-					arm_fill_f32(0.0f, ReverbSendBuffer[indexL], nFrames);
-
 					m_ReverbSpinLock.Acquire ();
 
 					reverb_send_mixer->getMix(ReverbSendBuffer[indexL], ReverbSendBuffer[indexR]);

From f1dce8f6f022d80f2fadd2fd036acd2da68798af Mon Sep 17 00:00:00 2001
From: Gergo Koteles <soyer@irl.hu>
Date: Wed, 9 Jul 2025 19:15:03 +0200
Subject: [PATCH 3/7] mixer: use the buffer directly, do not copy

---
 src/effect_mixer.hpp | 14 ++++++++++++++
 src/minidexed.cpp    | 22 ++++++++++++----------
 2 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/src/effect_mixer.hpp b/src/effect_mixer.hpp
index 44184ab..78efbb4 100644
--- a/src/effect_mixer.hpp
+++ b/src/effect_mixer.hpp
@@ -168,6 +168,20 @@ public:
 			arm_fill_f32(0.0f, sumbufR, buffer_length);
 	}
 
+	void getBuffers(float32_t (*buffers[2])) // hand out the mixer's own sum buffers instead of copying them
+	{
+		buffers[0] = sumbufL; // same storage that doAddMix() accumulates into and zeroFill() clears
+		buffers[1] = sumbufR;
+	}
+
+	void zeroFill() // reset both sum buffers to 0.0f over buffer_length samples
+	{
+		if(sumbufL) // a null buffer pointer is skipped
+			arm_fill_f32(0.0f, sumbufL, buffer_length);
+		if(sumbufR) // a null buffer pointer is skipped
+			arm_fill_f32(0.0f, sumbufR, buffer_length);
+	}
+
 protected:
 	using AudioMixer<NN>::sumbufL;
 	using AudioMixer<NN>::multiplier;
diff --git a/src/minidexed.cpp b/src/minidexed.cpp
index 1b5b4f6..d8e64df 100644
--- a/src/minidexed.cpp
+++ b/src/minidexed.cpp
@@ -1406,25 +1406,28 @@ void CMiniDexed::ProcessSound (void)
 
 			if(nMasterVolume > 0.0)
 			{
+				// get the mix buffer of all TGs
+				float32_t *SampleBuffer[2];
+				tg_mixer->getBuffers(SampleBuffer);
+
+				tg_mixer->zeroFill();
+
 				for (uint8_t i = 0; i < m_nToneGenerators; i++)
 				{
 					tg_mixer->doAddMix(i,m_OutputLevel[i]);
 				}
 				// END TG mixing
 
-				// BEGIN create SampleBuffer for holding audio data
-				float32_t SampleBuffer[2][nFrames];
-				// END create SampleBuffer for holding audio data
-
-				// get the mix of all TGs
-				tg_mixer->getMix(SampleBuffer[indexL], SampleBuffer[indexR]);
-
 				// BEGIN adding reverb
 				if (m_nParameter[ParameterReverbEnable])
 				{
 					float32_t ReverbBuffer[2][nFrames];
-					float32_t ReverbSendBuffer[2][nFrames];
-					
+
+					float32_t *ReverbSendBuffer[2];
+					reverb_send_mixer->getBuffers(ReverbSendBuffer);
+
+					reverb_send_mixer->zeroFill();
+
 					for (uint8_t i = 0; i < m_nToneGenerators; i++)
 					{
 						reverb_send_mixer->doAddMix(i,m_OutputLevel[i]);
@@ -1432,7 +1435,6 @@ void CMiniDexed::ProcessSound (void)
 
 					m_ReverbSpinLock.Acquire ();
 
-					reverb_send_mixer->getMix(ReverbSendBuffer[indexL], ReverbSendBuffer[indexR]);
 					reverb->doReverb(ReverbSendBuffer[indexL],ReverbSendBuffer[indexR],ReverbBuffer[indexL], ReverbBuffer[indexR],nFrames);
 
 					// scale down and add left reverb buffer by reverb level 

From f264aae58cea7effe2f8156feee6583bb7cf35bc Mon Sep 17 00:00:00 2001
From: Gergo Koteles <soyer@irl.hu>
Date: Wed, 9 Jul 2025 20:47:53 +0200
Subject: [PATCH 4/7] mixer: remove the unused and buggy 3 parameter doAddMix()

When multiplier[channel]==UNITY_GAIN the input is never written to tmp, so an uninitialized buffer is added to the mix.
---
 src/effect_mixer.hpp | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/src/effect_mixer.hpp b/src/effect_mixer.hpp
index 78efbb4..2cf3586 100644
--- a/src/effect_mixer.hpp
+++ b/src/effect_mixer.hpp
@@ -135,23 +135,6 @@ public:
 		arm_add_f32(sumbufR, tmp, sumbufR, buffer_length);
 	}
 
-	void doAddMix(uint8_t channel, float32_t* inL, float32_t* inR)
-	{
-		float32_t tmp[buffer_length];
-
-		assert(inL);
-		assert(inR);
-
-		// left
-		if(multiplier[channel]!=UNITY_GAIN)
-			arm_scale_f32(inL,multiplier[channel],tmp,buffer_length);
-		arm_add_f32(sumbufL, tmp, sumbufL, buffer_length);
-		// right
-		if(multiplier[channel]!=UNITY_GAIN)
-			arm_scale_f32(inR,multiplier[channel],tmp,buffer_length);
-		arm_add_f32(sumbufR, tmp, sumbufR, buffer_length);
-	}
-
 	void getMix(float32_t* bufferL, float32_t* bufferR)
 	{
 		assert(bufferR);

From ce9f82ca51ea493f12da693f9cf25351486ef31d Mon Sep 17 00:00:00 2001
From: Gergo Koteles <soyer@irl.hu>
Date: Wed, 9 Jul 2025 20:52:56 +0200
Subject: [PATCH 5/7] mixer: prescale the scale parameter so one scale per
 element is enough

---
 src/effect_mixer.hpp | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/effect_mixer.hpp b/src/effect_mixer.hpp
index 2cf3586..9cf5070 100644
--- a/src/effect_mixer.hpp
+++ b/src/effect_mixer.hpp
@@ -124,14 +124,10 @@ public:
 		assert(in);
 
 		// left
-		arm_scale_f32(in, panorama[channel][0], tmp, buffer_length);
-		if(multiplier[channel]!=UNITY_GAIN)
-			arm_scale_f32(tmp,multiplier[channel],tmp,buffer_length);
+		arm_scale_f32(in, panorama[channel][0] * multiplier[channel], tmp, buffer_length);
 		arm_add_f32(sumbufL, tmp, sumbufL, buffer_length);
 		// right
-		arm_scale_f32(in, panorama[channel][1], tmp, buffer_length);
-		if(multiplier[channel]!=UNITY_GAIN)
-			arm_scale_f32(tmp,multiplier[channel],tmp,buffer_length);
+		arm_scale_f32(in, panorama[channel][1] * multiplier[channel], tmp, buffer_length);
 		arm_add_f32(sumbufR, tmp, sumbufR, buffer_length);
 	}
 

From 7c68142b6b5d8b0a7f40725fd296dab7b9ce67f7 Mon Sep 17 00:00:00 2001
From: Gergo Koteles <soyer@irl.hu>
Date: Tue, 22 Jul 2025 00:02:45 +0200
Subject: [PATCH 6/7] ProcessSound: do not optimize for special values

this makes the code easier to understand and the execution more predictable
---
 src/minidexed.cpp | 131 ++++++++++++++++++----------------------------
 1 file changed, 52 insertions(+), 79 deletions(-)

diff --git a/src/minidexed.cpp b/src/minidexed.cpp
index d8e64df..acd79a2 100644
--- a/src/minidexed.cpp
+++ b/src/minidexed.cpp
@@ -1354,33 +1354,20 @@ void CMiniDexed::ProcessSound (void)
 			float32_t tmp_float[nFrames*Channels];
 			int32_t tmp_int[nFrames*Channels];
 
-			if(nMasterVolume > 0.0)
+			// Convert dual float array (8 chan) to single int16 array (8 chan)
+			for(uint16_t i=0; i<nFrames;i++)
 			{
-				// Convert dual float array (8 chan) to single int16 array (8 chan)
-				for(uint16_t i=0; i<nFrames;i++)
+				// TGs will alternate on L/R channels for each output
+				// reading directly from the TG OutputLevel buffer with
+				// no additional processing.
+				for (uint8_t tg = 0; tg < Channels; tg++)
 				{
-					// TGs will alternate on L/R channels for each output
-					// reading directly from the TG OutputLevel buffer with
-					// no additional processing.
-					for (uint8_t tg = 0; tg < Channels; tg++)
-					{
-						if(nMasterVolume >0.0 && nMasterVolume <1.0)
-						{
-							tmp_float[(i*Channels)+tg]=m_OutputLevel[tg][i] * nMasterVolume;
-						}
-						else if(nMasterVolume == 1.0)
-						{
-							tmp_float[(i*Channels)+tg]=m_OutputLevel[tg][i];
-						}
-					}
+					tmp_float[(i*Channels)+tg]=m_OutputLevel[tg][i] * nMasterVolume;
 				}
-				arm_float_to_q23(tmp_float,tmp_int,nFrames*Channels);
-			}
-			else
-			{
-				arm_fill_q31(0, tmp_int, nFrames*Channels);
 			}
 
+			arm_float_to_q23(tmp_float,tmp_int,nFrames*Channels);
+
 			// Prevent PCM510x analog mute from kicking in
 			for (uint8_t tg = 0; tg < Channels; tg++) 
 			{
@@ -1404,78 +1391,64 @@ void CMiniDexed::ProcessSound (void)
 			float32_t tmp_float[nFrames*2];
 			int32_t tmp_int[nFrames*2];
 
-			if(nMasterVolume > 0.0)
-			{
-				// get the mix buffer of all TGs
-				float32_t *SampleBuffer[2];
-				tg_mixer->getBuffers(SampleBuffer);
+			// get the mix buffer of all TGs
+			float32_t *SampleBuffer[2];
+			tg_mixer->getBuffers(SampleBuffer);
 
-				tg_mixer->zeroFill();
+			tg_mixer->zeroFill();
 
-				for (uint8_t i = 0; i < m_nToneGenerators; i++)
-				{
-					tg_mixer->doAddMix(i,m_OutputLevel[i]);
-				}
-				// END TG mixing
-
-				// BEGIN adding reverb
-				if (m_nParameter[ParameterReverbEnable])
-				{
-					float32_t ReverbBuffer[2][nFrames];
+			for (uint8_t i = 0; i < m_nToneGenerators; i++)
+			{
+				tg_mixer->doAddMix(i,m_OutputLevel[i]);
+			}
+			// END TG mixing
 
-					float32_t *ReverbSendBuffer[2];
-					reverb_send_mixer->getBuffers(ReverbSendBuffer);
+			// BEGIN adding reverb
+			if (m_nParameter[ParameterReverbEnable])
+			{
+				float32_t ReverbBuffer[2][nFrames];
 
-					reverb_send_mixer->zeroFill();
+				float32_t *ReverbSendBuffer[2];
+				reverb_send_mixer->getBuffers(ReverbSendBuffer);
 
-					for (uint8_t i = 0; i < m_nToneGenerators; i++)
-					{
-						reverb_send_mixer->doAddMix(i,m_OutputLevel[i]);
-					}
+				reverb_send_mixer->zeroFill();
 
-					m_ReverbSpinLock.Acquire ();
+				for (uint8_t i = 0; i < m_nToneGenerators; i++)
+				{
+					reverb_send_mixer->doAddMix(i,m_OutputLevel[i]);
+				}
 
-					reverb->doReverb(ReverbSendBuffer[indexL],ReverbSendBuffer[indexR],ReverbBuffer[indexL], ReverbBuffer[indexR],nFrames);
+				m_ReverbSpinLock.Acquire ();
 
-					// scale down and add left reverb buffer by reverb level 
-					arm_scale_f32(ReverbBuffer[indexL], reverb->get_level(), ReverbBuffer[indexL], nFrames);
-					arm_add_f32(SampleBuffer[indexL], ReverbBuffer[indexL], SampleBuffer[indexL], nFrames);
-					// scale down and add right reverb buffer by reverb level 
-					arm_scale_f32(ReverbBuffer[indexR], reverb->get_level(), ReverbBuffer[indexR], nFrames);
-					arm_add_f32(SampleBuffer[indexR], ReverbBuffer[indexR], SampleBuffer[indexR], nFrames);
+				reverb->doReverb(ReverbSendBuffer[indexL],ReverbSendBuffer[indexR],ReverbBuffer[indexL], ReverbBuffer[indexR],nFrames);
 
-					m_ReverbSpinLock.Release ();
-				}
-				// END adding reverb
+				// scale down and add left reverb buffer by reverb level 
+				arm_scale_f32(ReverbBuffer[indexL], reverb->get_level(), ReverbBuffer[indexL], nFrames);
+				arm_add_f32(SampleBuffer[indexL], ReverbBuffer[indexL], SampleBuffer[indexL], nFrames);
+				// scale down and add right reverb buffer by reverb level 
+				arm_scale_f32(ReverbBuffer[indexR], reverb->get_level(), ReverbBuffer[indexR], nFrames);
+				arm_add_f32(SampleBuffer[indexR], ReverbBuffer[indexR], SampleBuffer[indexR], nFrames);
 
-				// swap stereo channels if needed prior to writing back out
-				if (m_bChannelsSwapped)
-				{
-					indexL=1;
-					indexR=0;
-				}
+				m_ReverbSpinLock.Release ();
+			}
+			// END adding reverb
 
-				// Convert dual float array (left, right) to single int16 array (left/right)
-				for(uint16_t i=0; i<nFrames;i++)
-				{
-					if(nMasterVolume >0.0 && nMasterVolume <1.0)
-					{
-						tmp_float[i*2]=SampleBuffer[indexL][i] * nMasterVolume;
-						tmp_float[(i*2)+1]=SampleBuffer[indexR][i] * nMasterVolume;
-					}
-					else if(nMasterVolume == 1.0)
-					{
-						tmp_float[i*2]=SampleBuffer[indexL][i];
-						tmp_float[(i*2)+1]=SampleBuffer[indexR][i];
-					}
-				}
-				arm_float_to_q23(tmp_float,tmp_int,nFrames*2);
+			// swap stereo channels if needed prior to writing back out
+			if (m_bChannelsSwapped)
+			{
+				indexL=1;
+				indexR=0;
 			}
-			else
+
+			// Convert dual float array (left, right) to single int16 array (left/right)
+			for(uint16_t i=0; i<nFrames;i++)
 			{
-				arm_fill_q31(0, tmp_int, nFrames * 2);
+				tmp_float[i*2]=SampleBuffer[indexL][i] * nMasterVolume;
+				tmp_float[(i*2)+1]=SampleBuffer[indexR][i] * nMasterVolume;
 			}
 
+			arm_float_to_q23(tmp_float,tmp_int,nFrames*2);
+
 			// Prevent PCM510x analog mute from kicking in
 			if (tmp_int[nFrames * 2 - 1] == 0)
 			{

From d5ddae6ac0e49da748e93b46f89ab249b92ece36 Mon Sep 17 00:00:00 2001
From: Gergo Koteles <soyer@irl.hu>
Date: Mon, 21 Jul 2025 23:20:48 +0200
Subject: [PATCH 7/7] ProcessSound: use arm_scale_zip_f32

---
 src/Makefile            |  2 +-
 src/arm_scale_zip_f32.c | 85 +++++++++++++++++++++++++++++++++++++++++
 src/arm_scale_zip_f32.h | 22 +++++++++++
 src/minidexed.cpp       |  7 +---
 4 files changed, 110 insertions(+), 6 deletions(-)
 create mode 100644 src/arm_scale_zip_f32.c
 create mode 100644 src/arm_scale_zip_f32.h

diff --git a/src/Makefile b/src/Makefile
index b06b977..2aed327 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -10,7 +10,7 @@ OBJS = main.o kernel.o minidexed.o config.o userinterface.o uimenu.o \
        mididevice.o midikeyboard.o serialmididevice.o pckeyboard.o \
        sysexfileloader.o performanceconfig.o perftimer.o \
        effect_platervbstereo.o uibuttons.o midipin.o \
-       arm_float_to_q23.o \
+       arm_float_to_q23.o arm_scale_zip_f32.o \
        net/ftpdaemon.o net/ftpworker.o net/applemidi.o net/udpmidi.o net/mdnspublisher.o udpmididevice.o
 
 OPTIMIZE = -O3
diff --git a/src/arm_scale_zip_f32.c b/src/arm_scale_zip_f32.c
new file mode 100644
index 0000000..28ff1c7
--- /dev/null
+++ b/src/arm_scale_zip_f32.c
@@ -0,0 +1,85 @@
+#include "arm_scale_zip_f32.h"
+
+/**
+  Scale two vectors and interleave (zip) the results.  For floating-point data, the algorithm used is:
+
+  <pre>
+      pDst[2n] = pSrc1[n] * scale, pDst[2n+1] = pSrc2[n] * scale   0 <= n < blockSize.
+  </pre>
+
+ */
+
+/**
+* @brief Scale two floating-point vectors by a scalar and interleave (zip) the results.
+* @param[in]  pSrc1      points to input vector 1 (written to even output slots)
+* @param[in]  pSrc2      points to input vector 2 (written to odd output slots)
+* @param[in]  scale      scalar applied to every sample
+* @param[out] pDst       points to the output vector; receives 2 * blockSize samples
+* @param[in]  blockSize  number of samples in each input vector
+*/
+
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+void arm_scale_zip_f32(                            /* NEON variant: scale both inputs and interleave them into pDst */
+  const float32_t * pSrc1,
+  const float32_t * pSrc2,
+        float32_t scale,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Loop counter */
+
+    f32x2x2_t res;                                 /* val[0]: scaled pSrc1 pair, val[1]: scaled pSrc2 pair */
+
+    /* Process 2 samples from each input per iteration (4 interleaved outputs) */
+    blkCnt = blockSize >> 1U;
+
+    while (blkCnt > 0U)
+    {
+        res.val[0] = vmul_n_f32(vld1_f32(pSrc1), scale);
+        res.val[1] = vmul_n_f32(vld1_f32(pSrc2), scale);
+        vst2_f32(pDst, res);                       /* interleaving store: val[0][0], val[1][0], val[0][1], val[1][1] */
+
+        /* Advance pointers: 2 samples consumed per input, 4 outputs written */
+        pSrc1 += 2;
+        pSrc2 += 2;
+        pDst += 4;
+        
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* If the blockSize is odd, one sample per input remains; compute it here.
+    ** Plain scalar code, runs at most once. */
+    blkCnt = blockSize & 1;
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = *pSrc1++ * scale;
+        *pDst++ = *pSrc2++ * scale;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+}
+#else
+void arm_scale_zip_f32(                          /* Scalar fallback: scale both inputs and interleave them into pDst */
+  const float32_t * pSrc1,
+  const float32_t * pSrc2,
+        float32_t scale,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t blkCnt;                               /* Loop counter: one iteration per input sample pair */
+
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+      *pDst++ = *pSrc1++ * scale;                /* even output slot <- scaled pSrc1 sample */
+      *pDst++ = *pSrc2++ * scale;                /* odd output slot  <- scaled pSrc2 sample */
+      
+      /* Decrement the loop counter */
+      blkCnt--;
+  }
+}
+#endif
diff --git a/src/arm_scale_zip_f32.h b/src/arm_scale_zip_f32.h
new file mode 100644
index 0000000..6629b22
--- /dev/null
+++ b/src/arm_scale_zip_f32.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "arm_math_types.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+* @brief Scale two floating-point vectors by a scalar and interleave (zip) the results.
+* @param[in]  pSrc1      points to input vector 1 (written to even output slots)
+* @param[in]  pSrc2      points to input vector 2 (written to odd output slots)
+* @param[in]  scale      scalar applied to every sample
+* @param[out] pDst       points to the output vector; receives 2 * blockSize samples
+* @param[in]  blockSize  number of samples in each input vector
+*/
+void arm_scale_zip_f32(const float32_t * pSrc1, const float32_t * pSrc2, float32_t scale, float32_t * pDst, uint32_t blockSize);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/minidexed.cpp b/src/minidexed.cpp
index acd79a2..257ef55 100644
--- a/src/minidexed.cpp
+++ b/src/minidexed.cpp
@@ -30,6 +30,7 @@
 #include <stdio.h>
 #include <assert.h>
 #include "arm_float_to_q23.h"
+#include "arm_scale_zip_f32.h"
 
 const char WLANFirmwarePath[] = "SD:firmware/";
 const char WLANConfigFile[]   = "SD:wpa_supplicant.conf";
@@ -1441,11 +1442,7 @@ void CMiniDexed::ProcessSound (void)
 			}
 
 			// Convert dual float array (left, right) to single int16 array (left/right)
-			for(uint16_t i=0; i<nFrames;i++)
-			{
-				tmp_float[i*2]=SampleBuffer[indexL][i] * nMasterVolume;
-				tmp_float[(i*2)+1]=SampleBuffer[indexR][i] * nMasterVolume;
-			}
+			arm_scale_zip_f32(SampleBuffer[indexL], SampleBuffer[indexR], nMasterVolume, tmp_float, nFrames);
 
 			arm_float_to_q23(tmp_float,tmp_int,nFrames*2);