From acf9e11d5f9f60e188635de1714064fc00aa02f7 Mon Sep 17 00:00:00 2001
From: soyer <soyer@irl.hu>
Date: Mon, 21 Apr 2025 23:39:42 +0200
Subject: [PATCH] Use SoundFormatSigned24_32 with NEON (#852)

* use SoundFormatSigned24_32 format instead of SoundFormatSigned16

More detailed, and not much slower.

* fix ARM_MATH_NEON defines

---------

Co-authored-by: probonopd <probonopd@users.noreply.github.com>
---
 src/Makefile           |  1 +
 src/Synth_Dexed.mk     |  3 +-
 src/arm_float_to_q23.c | 88 ++++++++++++++++++++++++++++++++++++++++++
 src/arm_float_to_q23.h | 22 +++++++++++
 src/minidexed.cpp      | 19 ++++-----
 5 files changed, 123 insertions(+), 10 deletions(-)
 create mode 100644 src/arm_float_to_q23.c
 create mode 100644 src/arm_float_to_q23.h

diff --git a/src/Makefile b/src/Makefile
index 7882018..73dbddc 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -10,6 +10,7 @@ OBJS = main.o kernel.o minidexed.o config.o userinterface.o uimenu.o \
        mididevice.o midikeyboard.o serialmididevice.o pckeyboard.o \
        sysexfileloader.o performanceconfig.o perftimer.o \
        effect_compressor.o effect_platervbstereo.o uibuttons.o midipin.o \
+       arm_float_to_q23.o \
        net/ftpdaemon.o net/ftpworker.o net/applemidi.o net/udpmidi.o net/mdnspublisher.o udpmididevice.o
 
 OPTIMIZE = -O3
diff --git a/src/Synth_Dexed.mk b/src/Synth_Dexed.mk
index 6aa4a49..4d42e67 100644
--- a/src/Synth_Dexed.mk
+++ b/src/Synth_Dexed.mk
@@ -40,8 +40,9 @@ INCLUDE += -I $(CMSIS_DSP_COMPUTELIB_INCLUDE_DIR)
 
 DEFINE += -DUSE_FX
 
-ifeq ($(strip $(AARCH)),64)
+ifeq ($(RPI), $(filter $(RPI), 3 4 5))
 DEFINE += -DARM_MATH_NEON
+DEFINE += -DARM_MATH_NEON_EXPERIMENTAL
 DEFINE += -DHAVE_NEON
 endif
 
diff --git a/src/arm_float_to_q23.c b/src/arm_float_to_q23.c
new file mode 100644
index 0000000..8eb21be
--- /dev/null
+++ b/src/arm_float_to_q23.c
@@ -0,0 +1,88 @@
+#include "arm_float_to_q23.h"
+
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+void arm_float_to_q23(const float32_t * pSrc, q23_t * pDst, uint32_t blockSize)
+{
+    const float32_t *pIn = pSrc;                   /* Src pointer */
+    uint32_t blkCnt;                               /* loop counter */
+    
+    float32x4_t inV;
+
+    int32x4_t cvt;
+
+    blkCnt = blockSize >> 2U;
+
+    /* Compute 4 outputs at a time.
+    ** a second loop below computes the remaining 1 to 3 samples. */
+    while (blkCnt > 0U)
+    {
+        /* C = A * 8388608 */
+        /* Convert from float to q23 and then store the results in the destination buffer */
+        inV = vld1q_f32(pIn);
+
+        cvt = vcvtq_n_s32_f32(inV, 23);
+
+        /* saturate */
+        cvt = vminq_s32(cvt, vdupq_n_s32(0x007fffff));
+        cvt = vmaxq_s32(cvt, vdupq_n_s32(0xff800000));
+
+        vst1q_s32(pDst, cvt);
+        pDst += 4;
+        pIn += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+    ** No loop unrolling is used. */
+    blkCnt = blockSize & 3;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A * 8388608 */
+        /* Convert from float to q23 and then store the results in the destination buffer */
+        *pDst++ = (q23_t) __SSAT((q31_t) (*pIn++ * 8388608.0f), 24);
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+}
+#else
+void arm_float_to_q23(const float32_t * pSrc, q23_t * pDst, uint32_t blockSize)
+{
+    uint32_t blkCnt;                /* Loop counter */
+    const float32_t *pIn = pSrc;    /* Source pointer */
+
+    /* Loop unrolling: Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A * 8388608 */
+        /* convert from float to Q23 and store result in destination buffer */
+
+        *pDst++ = (q23_t) __SSAT((q31_t) (*pIn++ * 8388608.0f), 24);
+        *pDst++ = (q23_t) __SSAT((q31_t) (*pIn++ * 8388608.0f), 24);
+        *pDst++ = (q23_t) __SSAT((q31_t) (*pIn++ * 8388608.0f), 24);
+        *pDst++ = (q23_t) __SSAT((q31_t) (*pIn++ * 8388608.0f), 24);
+
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+
+    /* Loop unrolling: Compute remaining outputs */
+    blkCnt = blockSize % 0x4U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A * 8388608 */
+        /* Convert from float to q23 and then store the results in the destination buffer */
+        *pDst++ = (q23_t) __SSAT((q31_t) (*pIn++ * 8388608.0f), 24);
+
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+
+}
+#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
diff --git a/src/arm_float_to_q23.h b/src/arm_float_to_q23.h
new file mode 100644
index 0000000..6a77ea8
--- /dev/null
+++ b/src/arm_float_to_q23.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "arm_math_types.h"
+
+typedef int32_t q23_t;
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * @brief Converts the elements of the floating-point vector to Q23 vector.
+ * @param[in]  pSrc       points to the floating-point input vector
+ * @param[out] pDst       points to the Q23 output vector
+ * @param[in]  blockSize  length of the input vector
+ */
+void arm_float_to_q23(const float32_t * pSrc, q23_t * pDst, uint32_t blockSize);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/minidexed.cpp b/src/minidexed.cpp
index 1cca446..cbed9f9 100644
--- a/src/minidexed.cpp
+++ b/src/minidexed.cpp
@@ -29,6 +29,7 @@
 #include <string.h>
 #include <stdio.h>
 #include <assert.h>
+#include "arm_float_to_q23.h"
 
 const char WLANFirmwarePath[] = "SD:firmware/";
 const char WLANConfigFile[]   = "SD:wpa_supplicant.conf";
@@ -359,7 +360,7 @@ bool CMiniDexed::Initialize (void)
 		return false;
 	}
 
-	m_pSoundDevice->SetWriteFormat (SoundFormatSigned16, Channels);
+	m_pSoundDevice->SetWriteFormat (SoundFormatSigned24_32, Channels);
 
 	m_nQueueSizeFrames = m_pSoundDevice->GetQueueSizeFrames ();
 
@@ -1260,8 +1261,8 @@ void CMiniDexed::ProcessSound (void)
 		m_pTG[0]->getSamples (SampleBuffer, nFrames);
 
 		// Convert single float array (mono) to int16 array
-		int16_t tmp_int[nFrames];
-		arm_float_to_q15(SampleBuffer,tmp_int,nFrames);
+		int32_t tmp_int[nFrames];
+		arm_float_to_q23(SampleBuffer,tmp_int,nFrames);
 
 		if (m_pSoundDevice->Write (tmp_int, sizeof(tmp_int)) != (int) sizeof(tmp_int))
 		{
@@ -1328,7 +1329,7 @@ void CMiniDexed::ProcessSound (void)
 			// Note: one TG per audio channel; output=mono; no processing.
 			const int Channels = 8;  // One TG per channel
 			float32_t tmp_float[nFrames*Channels];
-			int16_t tmp_int[nFrames*Channels];
+			int32_t tmp_int[nFrames*Channels];
 
 			if(nMasterVolume > 0.0)
 			{
@@ -1350,11 +1351,11 @@ void CMiniDexed::ProcessSound (void)
 						}
 					}
 				}
-				arm_float_to_q15(tmp_float,tmp_int,nFrames*Channels);
+				arm_float_to_q23(tmp_float,tmp_int,nFrames*Channels);
 			}
 			else
 			{
-				arm_fill_q15(0, tmp_int, nFrames*Channels);
+				arm_fill_q31(0, tmp_int, nFrames*Channels);
 			}
 
 			// Prevent PCM510x analog mute from kicking in
@@ -1378,7 +1379,7 @@ void CMiniDexed::ProcessSound (void)
 
 			// BEGIN TG mixing
 			float32_t tmp_float[nFrames*2];
-			int16_t tmp_int[nFrames*2];
+			int32_t tmp_int[nFrames*2];
 
 			if(nMasterVolume > 0.0)
 			{
@@ -1444,11 +1445,11 @@ void CMiniDexed::ProcessSound (void)
 						tmp_float[(i*2)+1]=SampleBuffer[indexR][i];
 					}
 				}
-				arm_float_to_q15(tmp_float,tmp_int,nFrames*2);
+				arm_float_to_q23(tmp_float,tmp_int,nFrames*2);
 			}
 			else
 			{
-				arm_fill_q15(0, tmp_int, nFrames * 2);
+				arm_fill_q31(0, tmp_int, nFrames * 2);
 			}
 
 			// Prevent PCM510x analog mute from kicking in