From acf9e11d5f9f60e188635de1714064fc00aa02f7 Mon Sep 17 00:00:00 2001 From: soyer Date: Mon, 21 Apr 2025 23:39:42 +0200 Subject: [PATCH] Use SoundFormatSigned24_32 with NEON (#852) * use SoundFormatSigned24_32 format instead of SoundFormatSigned16 More detailed, and not much slower. * fix ARM_MATH_NEON defines --------- Co-authored-by: probonopd --- src/Makefile | 1 + src/Synth_Dexed.mk | 3 +- src/arm_float_to_q23.c | 88 ++++++++++++++++++++++++++++++++++++++++++ src/arm_float_to_q23.h | 22 +++++++++++ src/minidexed.cpp | 19 ++++----- 5 files changed, 123 insertions(+), 10 deletions(-) create mode 100644 src/arm_float_to_q23.c create mode 100644 src/arm_float_to_q23.h diff --git a/src/Makefile b/src/Makefile index 7882018..73dbddc 100644 --- a/src/Makefile +++ b/src/Makefile @@ -10,6 +10,7 @@ OBJS = main.o kernel.o minidexed.o config.o userinterface.o uimenu.o \ mididevice.o midikeyboard.o serialmididevice.o pckeyboard.o \ sysexfileloader.o performanceconfig.o perftimer.o \ effect_compressor.o effect_platervbstereo.o uibuttons.o midipin.o \ + arm_float_to_q23.o \ net/ftpdaemon.o net/ftpworker.o net/applemidi.o net/udpmidi.o net/mdnspublisher.o udpmididevice.o OPTIMIZE = -O3 diff --git a/src/Synth_Dexed.mk b/src/Synth_Dexed.mk index 6aa4a49..4d42e67 100644 --- a/src/Synth_Dexed.mk +++ b/src/Synth_Dexed.mk @@ -40,8 +40,9 @@ INCLUDE += -I $(CMSIS_DSP_COMPUTELIB_INCLUDE_DIR) DEFINE += -DUSE_FX -ifeq ($(strip $(AARCH)),64) +ifeq ($(RPI), $(filter $(RPI), 3 4 5)) DEFINE += -DARM_MATH_NEON +DEFINE += -DARM_MATH_NEON_EXPERIMENTAL DEFINE += -DHAVE_NEON endif diff --git a/src/arm_float_to_q23.c b/src/arm_float_to_q23.c new file mode 100644 index 0000000..8eb21be --- /dev/null +++ b/src/arm_float_to_q23.c @@ -0,0 +1,88 @@ +#include "arm_float_to_q23.h" + +#if defined(ARM_MATH_NEON_EXPERIMENTAL) +void arm_float_to_q23(const float32_t * pSrc, q23_t * pDst, uint32_t blockSize) +{ + const float32_t *pIn = pSrc; /* Src pointer */ + uint32_t blkCnt; /* loop counter */ + + float32x4_t inV; + + int32x4_t cvt; + + blkCnt = blockSize >> 2U; + + /* Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (blkCnt > 0U) + { + /* C = A * 8388608 */ + /* Convert from float to q23 and then store the results in the destination buffer */ + inV = vld1q_f32(pIn); + + cvt = vcvtq_n_s32_f32(inV, 23); + + /* saturate */ + cvt = vminq_s32(cvt, vdupq_n_s32(0x007fffff)); + cvt = vmaxq_s32(cvt, vdupq_n_s32(0xff800000)); + + vst1q_s32(pDst, cvt); + pDst += 4; + pIn += 4; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize & 3; + + while (blkCnt > 0U) + { + /* C = A * 8388608 */ + /* Convert from float to q23 and then store the results in the destination buffer */ + *pDst++ = (q23_t) __SSAT((q31_t) (*pIn++ * 8388608.0f), 24); + + /* Decrement the loop counter */ + blkCnt--; + } +} +#else +void arm_float_to_q23(const float32_t * pSrc, q23_t * pDst, uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + const float32_t *pIn = pSrc; /* Source pointer */ + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = A * 8388608 */ + /* convert from float to Q23 and store result in destination buffer */ + + *pDst++ = (q23_t) __SSAT((q31_t) (*pIn++ * 8388608.0f), 24); + *pDst++ = (q23_t) __SSAT((q31_t) (*pIn++ * 8388608.0f), 24); + *pDst++ = (q23_t) __SSAT((q31_t) (*pIn++ * 8388608.0f), 24); + *pDst++ = (q23_t) __SSAT((q31_t) (*pIn++ * 8388608.0f), 24); + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + + while (blkCnt > 0U) + { + /* C = A * 8388608 */ + /* Convert from float to q23 and then store the results in the destination buffer */ + *pDst++ = (q23_t) __SSAT((q31_t) (*pIn++ * 8388608.0f), 24); + + /* Decrement loop counter */ + blkCnt--; + } + +} +#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */ diff --git a/src/arm_float_to_q23.h b/src/arm_float_to_q23.h new file mode 100644 index 0000000..6a77ea8 --- /dev/null +++ b/src/arm_float_to_q23.h @@ -0,0 +1,22 @@ +#pragma once + +#include "arm_math_types.h" + +typedef int32_t q23_t; + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** + * @brief Converts the elements of the floating-point vector to Q23 vector. + * @param[in] pSrc points to the floating-point input vector + * @param[out] pDst points to the Q23 output vector + * @param[in] blockSize length of the input vector + */ +void arm_float_to_q23(const float32_t * pSrc, q23_t * pDst, uint32_t blockSize); + +#ifdef __cplusplus +} +#endif diff --git a/src/minidexed.cpp b/src/minidexed.cpp index 1cca446..cbed9f9 100644 --- a/src/minidexed.cpp +++ b/src/minidexed.cpp @@ -29,6 +29,7 @@ #include #include #include +#include "arm_float_to_q23.h" const char WLANFirmwarePath[] = "SD:firmware/"; const char WLANConfigFile[] = "SD:wpa_supplicant.conf"; @@ -359,7 +360,7 @@ bool CMiniDexed::Initialize (void) return false; } - m_pSoundDevice->SetWriteFormat (SoundFormatSigned16, Channels); + m_pSoundDevice->SetWriteFormat (SoundFormatSigned24_32, Channels); m_nQueueSizeFrames = m_pSoundDevice->GetQueueSizeFrames (); @@ -1260,8 +1261,8 @@ void CMiniDexed::ProcessSound (void) m_pTG[0]->getSamples (SampleBuffer, nFrames); // Convert single float array (mono) to int16 array - int16_t tmp_int[nFrames]; - arm_float_to_q15(SampleBuffer,tmp_int,nFrames); + int32_t tmp_int[nFrames]; + arm_float_to_q23(SampleBuffer,tmp_int,nFrames); if (m_pSoundDevice->Write (tmp_int, sizeof(tmp_int)) != (int) sizeof(tmp_int)) { @@ -1328,7 +1329,7 @@ void CMiniDexed::ProcessSound (void) // Note: one TG per audio channel; output=mono; no processing. const int Channels = 8; // One TG per channel float32_t tmp_float[nFrames*Channels]; - int16_t tmp_int[nFrames*Channels]; + int32_t tmp_int[nFrames*Channels]; if(nMasterVolume > 0.0) { @@ -1350,11 +1351,11 @@ void CMiniDexed::ProcessSound (void) } } } - arm_float_to_q15(tmp_float,tmp_int,nFrames*Channels); + arm_float_to_q23(tmp_float,tmp_int,nFrames*Channels); } else { - arm_fill_q15(0, tmp_int, nFrames*Channels); + arm_fill_q31(0, tmp_int, nFrames*Channels); } // Prevent PCM510x analog mute from kicking in @@ -1378,7 +1379,7 @@ void CMiniDexed::ProcessSound (void) // BEGIN TG mixing float32_t tmp_float[nFrames*2]; - int16_t tmp_int[nFrames*2]; + int32_t tmp_int[nFrames*2]; if(nMasterVolume > 0.0) { @@ -1444,11 +1445,11 @@ void CMiniDexed::ProcessSound (void) tmp_float[(i*2)+1]=SampleBuffer[indexR][i]; } } - arm_float_to_q15(tmp_float,tmp_int,nFrames*2); + arm_float_to_q23(tmp_float,tmp_int,nFrames*2); } else { - arm_fill_q15(0, tmp_int, nFrames * 2); + arm_fill_q31(0, tmp_int, nFrames * 2); } // Prevent PCM510x analog mute from kicking in