Use SoundFormatSigned24_32 with NEON (#852)

* use SoundFormatSigned24_32 format instead of SoundFormatSigned16 More detailed, and not much slower. * fix ARM_MATH_NEON defines --------- Co-authored-by: probonopd <probonopd@users.noreply.github.com>
2 months ago · acf9e11d5f
parent 9d8ed87aab
commit acf9e11d5f
5 changed files with 123 additions and 10 deletions
--- a/src/Makefile
+++ b/src/Makefile
@ -10,6 +10,7 @@ OBJS = main.o kernel.o minidexed.o config.o userinterface.o uimenu.o \
       mididevice.o midikeyboard.o serialmididevice.o pckeyboard.o \
       sysexfileloader.o performanceconfig.o perftimer.o \
       effect_compressor.o effect_platervbstereo.o uibuttons.o midipin.o \
+       arm_float_to_q23.o \
       net/ftpdaemon.o net/ftpworker.o net/applemidi.o net/udpmidi.o net/mdnspublisher.o udpmididevice.o

 OPTIMIZE = -O3
--- a/src/Synth_Dexed.mk
+++ b/src/Synth_Dexed.mk
@ -40,8 +40,9 @@ INCLUDE += -I $(CMSIS_DSP_COMPUTELIB_INCLUDE_DIR)

 DEFINE += -DUSE_FX

-ifeq ($(strip $(AARCH)),64)
+ifeq ($(RPI), $(filter $(RPI), 3 4 5))
 DEFINE += -DARM_MATH_NEON
+DEFINE += -DARM_MATH_NEON_EXPERIMENTAL
 DEFINE += -DHAVE_NEON
 endif

--- a/src/arm_float_to_q23.c
+++ b/src/arm_float_to_q23.c
@ -0,0 +1,88 @@
+#include "arm_float_to_q23.h"
+
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+void arm_float_to_q23(const float32_t * pSrc, q23_t * pDst, uint32_t blockSize)
+{
+    const float32_t *pIn = pSrc;                   /* Src pointer */
+    uint32_t blkCnt;                               /* loop counter */
+    
+    float32x4_t inV;
+
+    int32x4_t cvt;
+
+    blkCnt = blockSize >> 2U;
+
+    /* Compute 4 outputs at a time.
+    ** a second loop below computes the remaining 1 to 3 samples. */
+    while (blkCnt > 0U)
+    {
+        /* C = A * 8388608 */
+        /* Convert from float to q23 and then store the results in the destination buffer */
+        inV = vld1q_f32(pIn);
+
+        cvt = vcvtq_n_s32_f32(inV, 23);
+
+        /* saturate */
+        cvt = vminq_s32(cvt, vdupq_n_s32(0x007fffff));
+        cvt = vmaxq_s32(cvt, vdupq_n_s32(0xff800000));
+
+        vst1q_s32(pDst, cvt);
+        pDst += 4;
+        pIn += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+    ** No loop unrolling is used. */
+    blkCnt = blockSize & 3;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A * 8388608 */
+        /* Convert from float to q23 and then store the results in the destination buffer */
+        *pDst++ = (q23_t) __SSAT((q31_t) (*pIn++ * 8388608.0f), 24);
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+}
+#else
+void arm_float_to_q23(const float32_t * pSrc, q23_t * pDst, uint32_t blockSize)
+{
+    uint32_t blkCnt;                /* Loop counter */
+    const float32_t *pIn = pSrc;    /* Source pointer */
+
+    /* Loop unrolling: Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A * 8388608 */
+        /* convert from float to Q23 and store result in destination buffer */
+
+        *pDst++ = (q23_t) __SSAT((q31_t) (*pIn++ * 8388608.0f), 24);
+        *pDst++ = (q23_t) __SSAT((q31_t) (*pIn++ * 8388608.0f), 24);
+        *pDst++ = (q23_t) __SSAT((q31_t) (*pIn++ * 8388608.0f), 24);
+        *pDst++ = (q23_t) __SSAT((q31_t) (*pIn++ * 8388608.0f), 24);
+
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+
+    /* Loop unrolling: Compute remaining outputs */
+    blkCnt = blockSize % 0x4U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A * 8388608 */
+        /* Convert from float to q23 and then store the results in the destination buffer */
+        *pDst++ = (q23_t) __SSAT((q31_t) (*pIn++ * 8388608.0f), 24);
+
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+
+}
+#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
--- a/src/arm_float_to_q23.h
+++ b/src/arm_float_to_q23.h
@ -0,0 +1,22 @@
+#pragma once
+
+#include "arm_math_types.h"
+
+typedef int32_t q23_t;
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * @brief Converts the elements of the floating-point vector to Q23 vector.
+ * @param[in]  pSrc       points to the floating-point input vector
+ * @param[out] pDst       points to the Q23 output vector
+ * @param[in]  blockSize  length of the input vector
+ */
+void arm_float_to_q23(const float32_t * pSrc, q23_t * pDst, uint32_t blockSize);
+
+#ifdef __cplusplus
+}
+#endif
--- a/src/minidexed.cpp
+++ b/src/minidexed.cpp
@ -29,6 +29,7 @@
 #include <string.h>
 #include <stdio.h>
 #include <assert.h>
+#include "arm_float_to_q23.h"

 const char WLANFirmwarePath[] = "SD:firmware/";
 const char WLANConfigFile[]   = "SD:wpa_supplicant.conf";
@ -359,7 +360,7 @@ bool CMiniDexed::Initialize (void)
 		return false;
 	}

-	m_pSoundDevice->SetWriteFormat (SoundFormatSigned16, Channels);
+	m_pSoundDevice->SetWriteFormat (SoundFormatSigned24_32, Channels);

 	m_nQueueSizeFrames = m_pSoundDevice->GetQueueSizeFrames ();

@ -1260,8 +1261,8 @@ void CMiniDexed::ProcessSound (void)
 		m_pTG[0]->getSamples (SampleBuffer, nFrames);

 		// Convert single float array (mono) to int16 array
-		int16_t tmp_int[nFrames];
-		arm_float_to_q15(SampleBuffer,tmp_int,nFrames);
+		int32_t tmp_int[nFrames];
+		arm_float_to_q23(SampleBuffer,tmp_int,nFrames);

 		if (m_pSoundDevice->Write (tmp_int, sizeof(tmp_int)) != (int) sizeof(tmp_int))
 		{
@ -1328,7 +1329,7 @@ void CMiniDexed::ProcessSound (void)
 			// Note: one TG per audio channel; output=mono; no processing.
 			const int Channels = 8;  // One TG per channel
 			float32_t tmp_float[nFrames*Channels];
-			int16_t tmp_int[nFrames*Channels];
+			int32_t tmp_int[nFrames*Channels];

 			if(nMasterVolume > 0.0)
 			{
@ -1350,11 +1351,11 @@ void CMiniDexed::ProcessSound (void)
 						}
 					}
 				}
-				arm_float_to_q15(tmp_float,tmp_int,nFrames*Channels);
+				arm_float_to_q23(tmp_float,tmp_int,nFrames*Channels);
 			}
 			else
 			{
-				arm_fill_q15(0, tmp_int, nFrames*Channels);
+				arm_fill_q31(0, tmp_int, nFrames*Channels);
 			}

 			// Prevent PCM510x analog mute from kicking in
@ -1378,7 +1379,7 @@ void CMiniDexed::ProcessSound (void)

 			// BEGIN TG mixing
 			float32_t tmp_float[nFrames*2];
-			int16_t tmp_int[nFrames*2];
+			int32_t tmp_int[nFrames*2];

 			if(nMasterVolume > 0.0)
 			{
@ -1444,11 +1445,11 @@ void CMiniDexed::ProcessSound (void)
 						tmp_float[(i*2)+1]=SampleBuffer[indexR][i];
 					}
 				}
-				arm_float_to_q15(tmp_float,tmp_int,nFrames*2);
+				arm_float_to_q23(tmp_float,tmp_int,nFrames*2);
 			}
 			else
 			{
-				arm_fill_q15(0, tmp_int, nFrames * 2);
+				arm_fill_q31(0, tmp_int, nFrames * 2);
 			}

 			// Prevent PCM510x analog mute from kicking in