Use SoundFormatSigned24_32 with NEON (#852)

* Use the SoundFormatSigned24_32 format instead of SoundFormatSigned16: more detailed output, and not much slower. * Fix the ARM_MATH_NEON defines. --------- Co-authored-by: probonopd <probonopd@users.noreply.github.com>
2 months ago · acf9e11d5f
parent 9d8ed87aab
commit acf9e11d5f
5 changed files with 123 additions and 10 deletions
--- a/src/Makefile
+++ b/src/Makefile
@ -10,6 +10,7 @@ OBJS = main.o kernel.o minidexed.o config.o userinterface.o uimenu.o \
       mididevice.o midikeyboard.o serialmididevice.o pckeyboard.o \
       sysexfileloader.o performanceconfig.o perftimer.o \
       effect_compressor.o effect_platervbstereo.o uibuttons.o midipin.o \
       arm_float_to_q23.o \
       net/ftpdaemon.o net/ftpworker.o net/applemidi.o net/udpmidi.o net/mdnspublisher.o udpmididevice.o
 OPTIMIZE = -O3
--- a/src/Synth_Dexed.mk
+++ b/src/Synth_Dexed.mk
@ -40,8 +40,9 @@ INCLUDE += -I $(CMSIS_DSP_COMPUTELIB_INCLUDE_DIR)
 DEFINE += -DUSE_FX
-ifeq ($(strip $(AARCH)),64)
+ifeq ($(RPI), $(filter $(RPI), 3 4 5))
 DEFINE += -DARM_MATH_NEON
 DEFINE += -DARM_MATH_NEON_EXPERIMENTAL
 DEFINE += -DHAVE_NEON
 endif
--- a/src/arm_float_to_q23.c
+++ b/src/arm_float_to_q23.c
@ -0,0 +1,88 @@
 #include "arm_float_to_q23.h"
 #if defined(ARM_MATH_NEON_EXPERIMENTAL)
 /*
  * Converts a single float sample to Q23 (scale by 2^23, truncate toward zero,
  * saturate to the signed 24-bit range).
  *
  * The clamp is done in the float domain *before* the integer conversion:
  * casting a float whose integral part does not fit the target integer type
  * is undefined behaviour in C, so out-of-range input must never reach the
  * cast. NaN is mapped to 0.
  */
 static inline q23_t float_to_q23_sat(float32_t sample)
 {
    const float32_t scaled = sample * 8388608.0f;

    if (scaled >= 8388607.0f)
    {
        return (q23_t) 8388607;
    }
    if (scaled <= -8388608.0f)
    {
        return (q23_t) -8388608;
    }
    if (scaled != scaled)
    {
        /* NaN compares unequal to itself */
        return (q23_t) 0;
    }
    return (q23_t) scaled;
 }

 /*
  * NEON implementation: converts blockSize float samples to Q23.
  *
  * pSrc       points to the floating-point input vector
  * pDst       points to the Q23 output vector (one 32-bit word per sample)
  * blockSize  number of samples to convert
  *
  * Each output is the input multiplied by 8388608 (2^23), saturated to
  * [-8388608, 8388607]. Four samples are converted per vector iteration;
  * the remaining 0 to 3 samples are converted one at a time.
  */
 void arm_float_to_q23(const float32_t * pSrc, q23_t * pDst, uint32_t blockSize)
 {
    const float32_t *pIn = pSrc;                        /* Src pointer */
    /* Saturation bounds, built once instead of on every iteration */
    const int32x4_t maxQ23 = vdupq_n_s32(8388607);      /* 0x007fffff */
    const int32x4_t minQ23 = vdupq_n_s32(-8388608);     /* 0xff800000 */
    uint32_t blkCnt;                                    /* loop counter */

    /* Vector part: 4 outputs per iteration */
    for (blkCnt = blockSize >> 2U; blkCnt > 0U; blkCnt--)
    {
        /* C = A * 8388608, as a fixed-point conversion with 23 fractional bits */
        int32x4_t cvt = vcvtq_n_s32_f32(vld1q_f32(pIn), 23);

        /* saturate to 24 bits */
        cvt = vminq_s32(cvt, maxQ23);
        cvt = vmaxq_s32(cvt, minQ23);

        vst1q_s32(pDst, cvt);
        pIn += 4;
        pDst += 4;
    }

    /* Scalar tail: blockSize is not a multiple of 4 */
    for (blkCnt = blockSize & 3U; blkCnt > 0U; blkCnt--)
    {
        *pDst++ = float_to_q23_sat(*pIn++);
    }
 }
 #else
 /*
  * Converts a single float sample to Q23 (scale by 2^23, truncate toward zero,
  * saturate to the signed 24-bit range).
  *
  * The clamp is done in the float domain *before* the integer conversion:
  * casting a float whose integral part does not fit the target integer type
  * is undefined behaviour in C, so out-of-range input must never reach the
  * cast. NaN is mapped to 0.
  */
 static inline q23_t float_to_q23_sat(float32_t sample)
 {
    const float32_t scaled = sample * 8388608.0f;

    if (scaled >= 8388607.0f)
    {
        return (q23_t) 8388607;
    }
    if (scaled <= -8388608.0f)
    {
        return (q23_t) -8388608;
    }
    if (scaled != scaled)
    {
        /* NaN compares unequal to itself */
        return (q23_t) 0;
    }
    return (q23_t) scaled;
 }

 /*
  * Portable implementation: converts blockSize float samples to Q23.
  *
  * pSrc       points to the floating-point input vector
  * pDst       points to the Q23 output vector (one 32-bit word per sample)
  * blockSize  number of samples to convert
  *
  * Each output is the input multiplied by 8388608 (2^23), saturated to
  * [-8388608, 8388607].
  */
 void arm_float_to_q23(const float32_t * pSrc, q23_t * pDst, uint32_t blockSize)
 {
    const float32_t *pIn = pSrc;    /* Source pointer */
    uint32_t blkCnt;                /* Loop counter */

    /* One sample per iteration; unrolling is left to the compiler */
    for (blkCnt = blockSize; blkCnt > 0U; blkCnt--)
    {
        /* C = A * 8388608, saturated to 24 bits */
        *pDst++ = float_to_q23_sat(*pIn++);
    }
 }
 #endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
--- a/src/arm_float_to_q23.h
+++ b/src/arm_float_to_q23.h
@ -0,0 +1,22 @@
 #pragma once
 #include "arm_math_types.h"
 typedef int32_t q23_t;
 #ifdef __cplusplus
 extern "C"
 {
 #endif
 /**
 * @brief Converts the elements of the floating-point vector to Q23 vector.
 *
 * Each input sample is multiplied by 8388608 (2^23) and saturated to the
 * signed 24-bit range [-8388608, 8388607] (0xFF800000 .. 0x007FFFFF). Every
 * result is stored in its own 32-bit q23_t (int32_t) element.
 *
 * @param[in]  pSrc       points to the floating-point input vector
 * @param[out] pDst       points to the Q23 output vector
 * @param[in]  blockSize  length of the input vector, in samples; pDst must
 *                        have room for the same number of q23_t elements
 */
 void arm_float_to_q23(const float32_t * pSrc, q23_t * pDst, uint32_t blockSize);
 #ifdef __cplusplus
 }
 #endif
--- a/src/minidexed.cpp
+++ b/src/minidexed.cpp
@ -29,6 +29,7 @@
 #include <string.h>
 #include <stdio.h>
 #include <assert.h>
 #include "arm_float_to_q23.h"
 const char WLANFirmwarePath[] = "SD:firmware/";
 const char WLANConfigFile[]   = "SD:wpa_supplicant.conf";
@ -359,7 +360,7 @@ bool CMiniDexed::Initialize (void)
 		return false;
 	}
-	m_pSoundDevice->SetWriteFormat (SoundFormatSigned16, Channels);
+	m_pSoundDevice->SetWriteFormat (SoundFormatSigned24_32, Channels);
 	m_nQueueSizeFrames = m_pSoundDevice->GetQueueSizeFrames ();
@ -1260,8 +1261,8 @@ void CMiniDexed::ProcessSound (void)
 		m_pTG[0]->getSamples (SampleBuffer, nFrames);
 		// Convert single float array (mono) to Q23 samples in an int32 array
-		int16_t tmp_int[nFrames];
+		int32_t tmp_int[nFrames];
-		arm_float_to_q15(SampleBuffer,tmp_int,nFrames);
+		arm_float_to_q23(SampleBuffer,tmp_int,nFrames);
 		if (m_pSoundDevice->Write (tmp_int, sizeof(tmp_int)) != (int) sizeof(tmp_int))
 		{
@ -1328,7 +1329,7 @@ void CMiniDexed::ProcessSound (void)
 			// Note: one TG per audio channel; output=mono; no processing.
 			const int Channels = 8;  // One TG per channel
 			float32_t tmp_float[nFrames*Channels];
-			int16_t tmp_int[nFrames*Channels];
+			int32_t tmp_int[nFrames*Channels];
 			if(nMasterVolume > 0.0)
 			{
@ -1350,11 +1351,11 @@ void CMiniDexed::ProcessSound (void)
 						}
 					}
 				}
-				arm_float_to_q15(tmp_float,tmp_int,nFrames*Channels);
+				arm_float_to_q23(tmp_float,tmp_int,nFrames*Channels);
 			}
 			else
 			{
-				arm_fill_q15(0, tmp_int, nFrames*Channels);
+				arm_fill_q31(0, tmp_int, nFrames*Channels);
 			}
 			// Prevent PCM510x analog mute from kicking in
@ -1378,7 +1379,7 @@ void CMiniDexed::ProcessSound (void)
 			// BEGIN TG mixing
 			float32_t tmp_float[nFrames*2];
-			int16_t tmp_int[nFrames*2];
+			int32_t tmp_int[nFrames*2];
 			if(nMasterVolume > 0.0)
 			{
@ -1444,11 +1445,11 @@ void CMiniDexed::ProcessSound (void)
 						tmp_float[(i*2)+1]=SampleBuffer[indexR][i];
 					}
 				}
-				arm_float_to_q15(tmp_float,tmp_int,nFrames*2);
+				arm_float_to_q23(tmp_float,tmp_int,nFrames*2);
 			}
 			else
 			{
-				arm_fill_q15(0, tmp_int, nFrames * 2);
+				arm_fill_q31(0, tmp_int, nFrames * 2);
 			}
 			// Prevent PCM510x analog mute from kicking in