add some code

2025-09-05 13:25:11 +08:00
parent 9ff0a99e7a
commit 3cf1229a85
8911 changed files with 2535396 additions and 0 deletions
--- a/managed_components/espressif__esp-dsp/modules/fir/fixed/dsps_fir_s16_m_ae32.S
+++ b/managed_components/espressif__esp-dsp/modules/fir/fixed/dsps_fir_s16_m_ae32.S
@@ -0,0 +1,94 @@
+/*
+ * SPDX-FileCopyrightText: 2022 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+.macro fir_s16_ae32_mul x1, x2, count, ID
+// This macro calculates fixed point dot product for ((count + 1)*4) int16 samples
+// using the MAC16 registers m0..m3 and the acchi || acclo accumulator.
+// x1 - input array1 register (samples), moved forward by the ldinc loads
+// x2 - input array2 register (coefficients) the array is inverted and is being decremented
+// count - counter register (for example a7)
+// count - (samples_count / 4) - 1, the number of repetitions of the 4-instruction loop body
+// ID - suffix appended to the local labels so that every expansion gets its own label names
+// acc += x1[i + 0]*x2[N - i - 1] + x1[i + 1]*x2[N - i - 2] + x1[i + 2]*x2[N - i - 3] + x1[i + 3]*x2[N - i - 4]; i: 0..count
+// acchi, and acclo have to be initialized before
+// Result - acchi || acclo
+// Modifies:
+// m0, m1, m2, m3, the x1 and x2 address registers
+// acchi || acclo - must be loaded before (for example 0x3fff to acclo).
+// NOTE(review): the callers in this component bias x1 by -4 and x2 by +4 before expanding the macro,
+// which suggests ldinc/lddec adjust the address before loading - confirm against the Xtensa ISA manual.
+
+		/*
+		 * Data schedule. Each line represents instruction and columns represent
+		 * register contents. Last column (MUL) shows the multiplication which
+		 * takes place. Values loaded in the given cycle are shown in square brackets.
+		 *
+		 *  m0     m1         m2              m3          MUL
+		 * -----------------  pre-load  --------------------------
+		 *[x0 x1]								                  (no MULs in the first 3 instructions)
+		 * x0 x1        [y(N-1) y(N-2)]
+		 * x0 x1 [x2 x3] y(N-1) y(N-2)
+		 * x0 x1  x2 x3  y(N-1) y(N-2) [y(N-3) y(N-4)] x0*y(N-1)
+		 * --------------------   loop  ------------------------	 (the following 4 instructions are
+		 *[x4 x5] x2 x3  y(N-1) y(N-2)  y(N-3) y(N-4)  x1*y(N-2)     repeated as much as needed)
+		 * x4 x5  x2 x3 [y(N-5) y(N-6)] y(N-3) y(N-4)  x2*y(N-3)
+		 * x4 x5 [x6 x7] y(N-5) y(N-6)  y(N-3) y(N-4)  x3*y(N-4)
+		 * x4 x5  x6 x7  y(N-5) y(N-6) [y(N-7) y(N-8)] x4*y(N-5)
+		 * -------------------  finalize  ----------------------
+		 * x4 x5  x6 x7  y(N-5) y(N-6)  y(N-7) y(N-8)  x5*y(N-6)	(nothing is loaded)
+		 * x4 x5  x6 x7  y(N-5) y(N-6)  y(N-7) y(N-8)  x6*y(N-7)
+		 * x4 x5  x6 x7  y(N-5) y(N-6)  y(N-7) y(N-8)  x7*y(N-8)
+		 */
+
+		// pre-load: fill m0, m2 and m1, no multiplication yet
+		ldinc m0, \x1
+		lddec m2, \x2
+		ldinc m1, \x1
+	
+		// first multiplication, combined with the load of m3
+		mula.dd.lh.lddec m3, \x2, m0, m2
+		// steady state: every instruction multiplies one pair and reloads one register
+		loopnez \count, .loop_end_\ID
+		.loop_\ID:
+			mula.dd.hl.ldinc m0, \x1, m0, m2
+			mula.dd.lh.lddec m2, \x2, m1, m3
+			mula.dd.hl.ldinc m1, \x1, m1, m3
+			mula.dd.lh.lddec m3, \x2, m0, m2
+		.loop_end_\ID:
+	
+		// finalize: consume the three pairs that are still held in m0..m3
+		mula.dd.hl m0, m2
+		mula.dd.lh m1, m3
+		mula.dd.hl m1, m3
+
+.endm // fir_s16_ae32_mul
+
+.macro fir_s16_ae32_full x1, x2, count, full_count, ID
+// This macro calculates fixed point dot product for full_count int16 samples:
+// blocks of four samples go through fir_s16_ae32_mul, the remaining 2 and/or 1
+// samples are handled by the two bit tests on full_count that follow it.
+// x1 - input array1 register (for example a2)
+// x2 - input array2 register (for example a3)
+// count - counter register (for example a7)
+// count -   samples_count / 4 - 1
+// full_count - samples_count
+// ID - suffix appended to the local labels so that every expansion gets its own label names
+// acc += x1[i + 0]*x2[N - i - 1] + x1[i + 1]*x2[N - i - 2] + x1[i + 2]*x2[N - i - 3] + x1[i + 3]*x2[N - i - 4]; i: 0..count
+// acchi, and acclo have to be initialized before
+// Result - acchi || acclo
+// Modifies:
+// m0, m1, m2, m3, the x1 and x2 address registers
+// acchi || acclo - must be loaded before (for example 0x3fff to acclo).
+
+		// the main mac16 multiplication loop is skipped for cases with less than 4 samples
+		blti \full_count, 4, .less_than_4_operands_\ID
+        	fir_s16_ae32_mul \x1, \x2, \count, \ID
+
+		.less_than_4_operands_\ID:
+
+        // bit 1 of full_count set - two more samples: one load per array, two multiplications
+        bbci  \full_count, 1, .mod2chk_\ID
+		    ldinc m0, \x1
+		    lddec m2, \x2
+		    mula.dd.hl m0, m2
+		    mula.dd.lh m0, m2
+	    .mod2chk_\ID:
+
+		// bit 0 of full_count set - one more sample: one load per array, one multiplication
+		bbci  \full_count, 0, .mod1chk_\ID
+		    ldinc m0, \x1
+		    lddec m2, \x2
+		    mula.dd.lh m0, m2
+	    .mod1chk_\ID:
+
+.endm // fir_s16_ae32_full
--- a/managed_components/espressif__esp-dsp/modules/fir/fixed/dsps_fird_init_s16.c
+++ b/managed_components/espressif__esp-dsp/modules/fir/fixed/dsps_fird_init_s16.c
@@ -0,0 +1,157 @@
+/*
+ * SPDX-FileCopyrightText: 2022-2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "dsps_fir.h"
+#include "malloc.h"
+#include <string.h>
+#include "dsp_tests.h"
+
+#define ROUNDING_VALUE  0x7fff
+
+/**
+ * Initialize a 16-bit fixed point decimating FIR filter structure.
+ *
+ * @param fir        filter structure to fill in
+ * @param coeffs     filter coefficients
+ * @param delay      delay line; on the esp32s3 path NULL requests an internally allocated one
+ * @param coeffs_len number of coefficients, must be at least 2
+ * @param decim      decimation factor
+ * @param start_pos  initial decimation position, must be lower than decim
+ * @param shift      result shift, must be within -40..40
+ *
+ * @return ESP_OK on success,
+ *         ESP_ERR_DSP_INVALID_LENGTH / ESP_ERR_DSP_PARAM_OUTOFRANGE for invalid parameters,
+ *         ESP_ERR_DSP_ARRAY_NOT_ALIGNED when an array is not 16-byte aligned (esp32s3 path),
+ *         ESP_ERR_NO_MEM when an internal buffer cannot be allocated.
+ *
+ * Buffers allocated here are recorded in fir->free_status as soon as they are owned by the
+ * structure, so dsps_fird_s16_aexx_free() releases them both after normal use and after a
+ * failed initialization.
+ */
+esp_err_t dsps_fird_init_s16(fir_s16_t *fir, int16_t *coeffs, int16_t *delay, int16_t coeffs_len, int16_t decim, int16_t start_pos, int16_t shift)
+{
+    fir->coeffs = coeffs;
+    fir->delay = delay;
+    fir->coeffs_len = coeffs_len;
+    fir->pos = 0;
+    fir->decim = decim;
+    fir->d_pos = start_pos;
+    fir->shift = shift;
+    fir->rounding_val = (int16_t)(ROUNDING_VALUE);
+    fir->free_status = 0;
+
+    if (fir->coeffs_len < 2) {                                          // number of coefficients must be higher than 1
+        return ESP_ERR_DSP_INVALID_LENGTH;
+    }
+
+    if ((fir->shift > 40) || (fir->shift < -40)) {                      // shift amount must be within a range from -40 to 40
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+
+    if (fir->d_pos >= fir->decim) {                                     // start position must be lower than decimation
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+
+#if CONFIG_DSP_OPTIMIZED
+
+    // Rounding value buffer primary for a purpose of ee.ld.accx.ip, but used for both the esp32 and esp32s3
+    // dsps_fird_s16_aexx_free() must be called to free the memory after the FIR function is finished
+    int32_t *aexx_rounding_buff = (int32_t *)memalign(16, 2 * sizeof(int32_t));
+    if (aexx_rounding_buff == NULL) {
+        return ESP_ERR_NO_MEM;
+    }
+
+    // Hand the buffer over to the structure right away, every later error path can then rely on free_status
+    fir->rounding_buff = aexx_rounding_buff;
+    fir->free_status |= 0x0004;
+
+    long long rounding = (long long)(fir->rounding_val);
+
+    if (fir->shift >= 0) {
+        rounding = (rounding >> fir->shift);
+    } else {
+        rounding = (rounding << (-fir->shift));
+    }
+
+    aexx_rounding_buff[0] = (int32_t)(rounding);                        // 32 lower bits (acclo) type reassignment to 32-bit
+    aexx_rounding_buff[1] = (int32_t)((rounding >> 32) & 0xFF);         // 8 higher bits (acchi) shift by 32 and apply the mask
+
+#if dsps_fird_s16_arp4_enabled
+    fir->pos = start_pos;
+
+    // The delay line is always re-allocated here with 8 extra samples in front of and behind it
+    int16_t *new_delay_buff = (int16_t *)memalign(16, (coeffs_len + 8 * 2) * sizeof(int16_t));
+    if (new_delay_buff == NULL) {
+        return ESP_ERR_NO_MEM;
+    }
+    for (int i = 0 ; i < (coeffs_len + 8 * 2) ; i++) {
+        new_delay_buff[i] = 0;
+    }
+    fir->delay = &new_delay_buff[8];
+    fir->free_status |= 0x0001;
+
+#endif // dsps_fird_s16_arp4_enabled
+
+#if dsps_fird_s16_aes3_enabled
+
+    int16_t *owned_delay = NULL;                                // delay line allocated below, stays NULL if the caller supplied one
+
+    if (fir->delay == NULL) {                                   // New delay buffer is allocated if the current delay line is NULL
+        owned_delay = (int16_t *)memalign(16, coeffs_len * sizeof(int16_t));
+        if (owned_delay == NULL) {
+            return ESP_ERR_NO_MEM;
+        }
+        fir->delay = owned_delay;
+        fir->free_status |= 0x0001;
+    } else {
+        if ((int)fir->delay & 0xf) {                            // Delay line array must be aligned
+            return ESP_ERR_DSP_ARRAY_NOT_ALIGNED;
+        }
+    }
+
+    if ((int)fir->coeffs & 0xf) {                               // Coefficients array must be aligned
+        return ESP_ERR_DSP_ARRAY_NOT_ALIGNED;
+    }
+
+    // If the number of coefficients is not divisible by 8, a new delay line and a new coefficients array are allocated,
+    // the newly allocated arrays are divisible by 8. Coefficients are copied from the original fir structure to
+    // the new coeffs array and the remaining space is filled with zeroes
+    // dsps_fird_s16_aexx_free() must be called to free the memory after the FIR function is finished
+    if (fir->coeffs_len % 8) {                                           // Number of coefficients must be divisible by 8
+        int16_t zero_coeffs = (8 - (fir->coeffs_len % 8));
+        int16_t new_coeffs_len = fir->coeffs_len + zero_coeffs;
+        int16_t *aes3_delay_buff = (int16_t *)memalign(16, new_coeffs_len * sizeof(int16_t));
+        int16_t *aes3_coeffs_buff = (int16_t *)memalign(16, new_coeffs_len * sizeof(int16_t));
+
+        if ((aes3_delay_buff == NULL) || (aes3_coeffs_buff == NULL)) {
+            free(aes3_delay_buff);                                       // free(NULL) is a no-op
+            free(aes3_coeffs_buff);
+            return ESP_ERR_NO_MEM;
+        }
+
+        for (int i = 0; i < fir->coeffs_len; i++) {                      // copy fir->coeffs to aes3_coeffs_buff
+            aes3_coeffs_buff[i] = fir->coeffs[i];
+        }
+
+        for (int i = fir->coeffs_len; i < new_coeffs_len; i++) {         // add zeroes to the end
+            aes3_coeffs_buff[i] = 0;
+        }
+
+        // The padded delay line replaces the one allocated above (if any); release it so it is not leaked.
+        // A caller supplied delay line is left untouched (owned_delay is NULL in that case).
+        free(owned_delay);
+
+        fir->delay = aes3_delay_buff;
+        fir->coeffs = aes3_coeffs_buff;
+        fir->coeffs_len = new_coeffs_len;
+        fir->free_status |= 0x0002;
+    }
+
+#endif      // dsps_fird_s16_aes3_enabled
+#endif      // CONFIG_DSP_OPTIMIZED
+
+    for (int i = 0; i < fir->coeffs_len; i++) {                                  // Initialize the delay line to zero
+        fir->delay[i] = 0;
+    }
+
+    return ESP_OK;
+}
+
+/**
+ * Release the buffers that the s16 decimating FIR initialization allocated.
+ *
+ * fir->free_status is a bit mask: 0x0001 / 0x0002 mark an allocated delay line
+ * (0x0002 additionally marks allocated coefficients), 0x0004 marks the rounding buffer.
+ * The mask is cleared afterwards, so a second call does nothing.
+ *
+ * @param fir filter structure whose buffers are released
+ * @return always ESP_OK
+ */
+esp_err_t dsps_fird_s16_aexx_free(fir_s16_t *fir)
+{
+    const int owned = fir->free_status;
+
+    if (owned != 0) {
+        if (owned & 0x0002) {
+            free(fir->coeffs);
+        }
+
+        if (owned & 0x0003) {
+#if dsps_fird_s16_arp4_enabled
+            // step back to the start of the allocation, the delay line begins 8 samples into it
+            fir->delay = &fir->delay[-8];
+#endif
+            free(fir->delay);
+        }
+
+        if (owned & 0x0004) {
+            free(fir->rounding_buff);
+        }
+
+        fir->free_status = 0;
+    }
+
+    return ESP_OK;
+}
+
+
+/**
+ * Reverse an array of 16-bit values in place.
+ *
+ * @param arr array to reverse
+ * @param len number of elements in arr; nothing is touched when len is lower than 2
+ * @return always ESP_OK
+ */
+esp_err_t dsps_16_array_rev(int16_t *arr, int16_t len)
+{
+    int head = 0;
+    int tail = (int)len - 1;
+
+    // swap the outermost pair and walk both indices towards the middle
+    while (head < tail) {
+        const int16_t held = arr[head];
+        arr[head++] = arr[tail];
+        arr[tail--] = held;
+    }
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/fir/fixed/dsps_fird_s16_ae32.S
+++ b/managed_components/espressif__esp-dsp/modules/fir/fixed/dsps_fird_s16_ae32.S
@@ -0,0 +1,181 @@
+/*
+ * SPDX-FileCopyrightText: 2022 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "dsps_fir_platform.h"
+#if (dsps_fird_s16_ae32_enabled == 1)
+
+#include "dsps_fir_s16_m_ae32.S"
+ 
+// This is FIR filter for ESP32 processor.
+	.text
+	.align  4
+	.global dsps_fird_s16_ae32
+	.type   dsps_fird_s16_ae32,@function
+// The function implements the following C code:
+//int32_t dsps_fird_s16_ansi(fir_s16_t *fir, const int16_t *input, int16_t *output, int32_t len)
+//
+// Per output sample the routine copies input samples into the circular delay line,
+// preloads the MAC16 accumulator with the rounding value, runs two dot products
+// (delay[pos..N-1] and delay[0..pos-1]) against the coefficients walked backwards,
+// shifts the accumulator by (shift - 15) and stores the low 16 bits to the output.
+// The value returned in a2 is the len argument saved on entry.
+//
+// Offsets into the fir structure used below (taken from the loads in this routine):
+//   0 - coeffs, 4 - delay, 8 - coeffs_len (N), 10 - pos, 12 - decim, 14 - d_pos,
+//   16 - shift, 20 - pointer to the rounding buffer (acclo word, acchi word)
+//
+// NOTE(review): no instruction in this routine stores a7 (pos) or d_pos back into the
+// fir structure, while the ANSI implementation updates fir->pos and fir->d_pos - confirm
+// whether the state is intentionally not carried over between calls.
+
+
+dsps_fird_s16_ae32: 
+// Input params					Variables
+//
+// fir      - a2				N			- a6
+// input    - a3				pos			- a7
+// output   - a4				rounding_lo - a8
+// len      - a5				d_pos		- a9
+//								coeffs ptr	- a10
+//								delay		- a11
+//								decim		- a12
+//								rounding_hi - a13	
+//								final_shift - a14 (shift)
+
+	entry    a1,    32				
+
+	l16si    a7,    a2,    10					// a7  - pos
+	l16si	 a6,    a2,    8					// a6  - N
+	l32i	 a10,   a2,    0					// a10 - coeffs
+	addx2	 a10,   a6,    a10					// a10 = coeffs + N * 2 bytes
+	addi     a10,   a10,  -4					// a10 = a10 - 4 bytes (two int16 coefficients back)
+	s32i	 a10,   a1,    0					// save the coefficient pointer to the stack frame, reloaded for every output sample
+	l32i     a11,   a2,    4         			// a11 - delay line
+	l16si    a12,   a2,    12        			// a12 - decimation		
+	l16si    a9,    a2,    14        			// a9  - d_pos			
+	l16si    a14,   a2,    16					// a14 - shift		    
+		
+	// prepare rounding value		
+	l32i     a15,   a2,    20                   // get address of rounding array to a15
+	l32i 	 a8,    a15,    0					// a8 =  lower 32 bits of the rounding value (acclo)
+	l32i     a13,   a15,    4					// a13 = higher 8 bits of the rounding value (acchi), offset 4 (32 bits)
+		
+	// prepare final_shift value					
+	addi 	 a14,   a14,  -15					// shift - 15
+	abs		 a15, 	a14
+	blti	 a15,    32,   _shift_lower_than_32_init		// check if lower than 32
+
+												// greater than 32 could only be negative shift ((-40 to +40) - 15) -> -55 to +25 
+	addi	 a14, 	a14,   32					// if greater than 32, add 32 (SRC is not defined for SAR greater than 32)
+	_shift_lower_than_32_init:
+
+	bltz	 a14,   _shift_negative_init		// branch if lower than zero (not including zero)
+	beqz	 a14,	_shift_negative_init		// branch if equal to zero (add zero to the previous statement)
+	ssl		 a14								// if positive, set SAR register to left shift value (SAR = 32 - shift)
+	
+	j _end_of_shift_init
+
+	_shift_negative_init:						// negative shift
+	abs		 a14,   a14							// absolute value
+	ssr		 a14								// SAR = -shift
+	// final_shift is saved to SAR register, SAR is not being changed during the execution
+
+	_end_of_shift_init:	
+	l16si    a14,   a2,    16					// a14 - load shift value again, a14 was modified while SAR was set up
+	addi 	 a14,   a14,  -15					// shift - 15
+	
+	s32i     a5,    a1,    4             		// save len to a1, used as the return value
+
+
+ 	// first delay line load (decim - d_pos) when d_pos is not 0
+	beqz	a9,     _fird_loop_len
+	sub		a15,    a12, a9						// a15 = decim - d_pos
+
+	loopnez a15,  ._loop_d_pos
+		 
+		blt    a7,   a6,   reset_fir_pos_d_pos	// skip the reset while fir->pos < fir->N
+			movi.n   a7,   0					// fir->pos = 0
+			l32i     a11,  a2,   4      		// reset delay line to the beginning
+		reset_fir_pos_d_pos:		
+
+		l16si	 a15,  a3,   0					// load 16 bits from input (a3) to a15
+		addi 	 a7,   a7,   1					// fir->pos++
+		s16i	 a15,  a11,  0					// save 16 bits from a15 to delay line (a11)
+		addi	 a3,   a3,   2					// increment input pointer
+		addi	 a11, a11,   2					// increment delay line pointer
+	._loop_d_pos:		
+
+	j .fill_delay_line							// skip the first iteration of the delay line filling routine
+
+	// outer loop
+	_fird_loop_len:
+
+		loopnez a12, .fill_delay_line
+
+			blt a7, a6, reset_fir_pos			// skip the reset while fir->pos < fir->N
+				movi.n   a7,   0				// fir->pos = 0
+				l32i	 a11,  a2,  4       	// reset delay line to the beginning
+			reset_fir_pos:		
+
+			l16si	 a15,  a3,    0				// load 16 bits from input (a3) to a15
+			addi 	 a7,   a7,    1				// fir->pos++
+			s16i	 a15,  a11,   0				// save 16 bits from a15 to delay line (a11)
+			addi	 a3,   a3,    2				// increment input pointer
+			addi	 a11,  a11,   2				// increment delay line pointer
+		.fill_delay_line:
+
+		// prepare MAC unit
+		wsr	    a8,   acclo						// acclo = a8
+		wsr		a13,  acchi						// acchi = a13
+
+		addi    a11,  a11,  -4 					// preset delay line pointer, samples (array is being incremented)
+		sub     a9,   a6,    a7   				// a9 = full_count = fir->N - fir->pos
+
+		// (Count / 4) - 1		
+		srli    a15,  a9,    2					// a15 = count = full_count /4
+		addi    a10,  a10,   4 					// preset coeffs pointer, samples (array is being decremented)
+		addi    a15,  a15,  -1					// count - 1
+
+		// first part: delay[pos .. N-1]
+		// x1, x2, count, full_count, ID
+		fir_s16_ae32_full a11, a10, a15, a9, __LINE__
+
+		l32i	a10,  a2,    0         			// load coeffs
+		l32i 	a11,  a2,    4					// reset delay line to the beginning
+		addx2	a10,  a7,    a10				// a10 = coeffs + pos * 2 bytes, the second part walks backwards from here
+		
+		srli 	a15,  a7,    2					// a15 = count = full_count (fir->pos) / 4
+		addi    a11,  a11,  -4 					// preset delay line pointer, samples (array is being incremented)
+		addi    a15,  a15,  -1					// count - 1
+
+		// second part: delay[0 .. pos-1]
+		// x1, x2, count, full_count, ID
+		fir_s16_ae32_full a11, a10, a15, a7, __LINE__
+
+		// SAR already set from the beginning to final_shift value
+		abs		a15,  a14						// absolute value of shift
+		l32i	a10,  a1, 	 0					// reset coefficient pointer
+		blti    a15,  32,   _shift_lower_than_32
+		rsr 	a9,   acchi						// get only higher 8 bits of the acc register
+		movi.n	a15,  0xFF						// higher 8 bits mask
+		and		a9,   a9,  a15					// apply mask
+		srl		a15,  a9	
+		j 		_shift_set
+
+		_shift_lower_than_32:
+		rsr 	a9,   acchi						// get higher 8 bits of the acc register
+		movi.n	a11,  0xFF						// higher 8 bits mask
+		rsr 	a15,  acclo						// get lower 32 bits of the acc register
+		and		a9,   a9,  a11					// apply mask
+
+
+		bltz	a14,  _shift_negative 			// branch if lower than zero (if negative)
+		beqz	a14,  _shift_negative
+		src		a15,  a15,  a9					// funnel shift left
+		j 		_shift_set
+
+		_shift_negative:						// negative shift
+		src		a15,  a9,  a15					// funnel shift right
+
+		_shift_set:
+		
+		l32i    a11,  a2,    4					// Load initial position of the delay line
+		s16i	a15,  a4, 	 0					// save the shifted value to the output array (a4)
+		addi 	a5,   a5,   -1					// len--
+		addi	a4,   a4, 	 2					// increase pointer of the output array		
+		addx2	a11,  a7,    a11				// p_delay[fir->pos] - (two times the fir->pos)	
+
+		// counter				
+		bnez    a5,   _fird_loop_len			// next output sample while len != 0
+
+	l32i.n     a2,  a1,  4                     	// load return value to a2
+	retw.n
+
+#endif // dsps_fird_s16_ae32_enabled
--- a/managed_components/espressif__esp-dsp/modules/fir/fixed/dsps_fird_s16_aes3.S
+++ b/managed_components/espressif__esp-dsp/modules/fir/fixed/dsps_fird_s16_aes3.S
--- a/managed_components/espressif__esp-dsp/modules/fir/fixed/dsps_fird_s16_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/fir/fixed/dsps_fird_s16_ansi.c
@@ -0,0 +1,54 @@
+/*
+ * SPDX-FileCopyrightText: 2022-2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "dsps_fir.h"
+
+/**
+ * Reference (ANSI C) implementation of the 16-bit fixed point decimating FIR filter.
+ *
+ * For every output sample (decim - d_pos) input samples are pushed into the circular
+ * delay line, the delay line is multiplied with the coefficients taken in reverse order,
+ * and the accumulator (preloaded with the rounding value) is shifted by (shift - 15).
+ *
+ * @param fir    initialized filter structure; fir->pos and fir->d_pos are updated
+ * @param input  input samples
+ * @param output output samples
+ * @param len    length of the output array (length of the input array / decimation)
+ * @return number of samples written to output
+ */
+int32_t dsps_fird_s16_ansi(fir_s16_t *fir, const int16_t *input, int16_t *output, int32_t len)
+{
+    int32_t result = 0;
+    int32_t input_pos = 0;
+    long long rounding = 0;
+    const int32_t final_shift = fir->shift - 15;
+
+    rounding = (long long)(fir->rounding_val);
+
+    if (fir->shift >= 0) {
+        rounding = (rounding >> fir->shift) & 0xFFFFFFFFFF;         // 40-bit mask
+    } else {
+        rounding = (rounding << (-fir->shift)) & 0xFFFFFFFFFF;      // 40-bit mask
+    }
+
+    // len is already a length of the *output array, calculated as (length of the input array / decimation)
+    for (int i = 0; i < len; i++) {
+
+        for (int j = 0; j < fir->decim - fir->d_pos; j++) {
+
+            if (fir->pos >= fir->coeffs_len) {
+                fir->pos = 0;
+            }
+            fir->delay[fir->pos++] = input[input_pos++];
+        }
+        fir->d_pos = 0;                                             // the start offset only applies to the first output sample
+
+        long long acc = rounding;
+        int16_t coeff_pos = fir->coeffs_len - 1;
+
+        for (int n = fir->pos; n < fir->coeffs_len ; n++) {
+            acc += (int32_t)fir->coeffs[coeff_pos--] * (int32_t)fir->delay[n];
+        }
+        for (int n = 0; n < fir->pos ; n++) {
+            acc += (int32_t)fir->coeffs[coeff_pos--] * (int32_t)fir->delay[n];
+        }
+
+        if (final_shift > 0) {
+            // Left-shifting a negative signed value is undefined behaviour in C, so the shift is
+            // done on the unsigned representation; the low 16 bits that are kept are the same.
+            output[result++] = (int16_t)((unsigned long long)acc << final_shift);
+        } else {
+            output[result++] = (int16_t)(acc >> (-final_shift));
+        }
+
+    }
+    return result;
+}
--- a/managed_components/espressif__esp-dsp/modules/fir/fixed/dsps_fird_s16_arp4.S
+++ b/managed_components/espressif__esp-dsp/modules/fir/fixed/dsps_fird_s16_arp4.S
@@ -0,0 +1,150 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dsps_fir_platform.h"
+#if (dsps_fird_s16_arp4_enabled == 1)
+
+// This is FIR filter for esp32p4 processor.
+    .text
+    .align  4
+    .global dsps_fird_s16_arp4
+    .global dsps_fird_s16_ansi
+    .type   dsps_fird_s16_arp4,@function
+// The function implements the following C code:
+// int32_t dsps_fird_s16_arp4(fir_s16_t *fir, const int16_t *input, int16_t *output, int32_t len);
+//
+// a0 - fir, a1 - input, a2 - output, a3 - len
+// Returns in a0 the number of produced output samples (len).
+// Filters whose length is not a multiple of 8 are forwarded to dsps_fird_s16_ansi.
+//
+// Offsets into the fir structure used below (taken from the loads in this routine):
+//   0 - coeffs, 4 - delay, 8 - coeffs_len, 10 - pos, 12 - decim, 16 - shift, 20 - rounding_buff
+
+dsps_fird_s16_arp4:
+    // In case of filter length different then 8*K
+    lh  t2, 8(a0)   // t2 - coeffs_len
+    andi t2, t2, 7
+    beqz    t2, .dsps_fird_s16_arp4_body
+    j   dsps_fird_s16_ansi      // tail call, the arguments in a0..a3 are still untouched
+
+.dsps_fird_s16_arp4_body:
+    // Save the callee-saved registers s0..s11
+    add sp,sp,-48
+    sw  s0, 0(sp)
+    sw  s1, 4(sp)
+    sw  s2, 8(sp)
+    sw  s3, 12(sp)
+    sw  s4, 16(sp)
+    sw  s5, 20(sp)
+    sw  s6, 24(sp)
+    sw  s7, 28(sp)
+    sw  s8, 32(sp)
+    sw  s9, 36(sp)
+    sw  s10, 40(sp)
+    sw  s11, 44(sp)
+
+    // Enable unaligned data access
+    esp.movx.r.cfg t6
+    or t6, t6, 2
+    esp.movx.w.cfg t6
+
+    // Keep len for the return value: a3 is counted down to zero by the main loop and
+    // a6 is not written anywhere else in this routine.
+    mv  a6, a3          // a6 - len (return value)
+
+    lw  t1, 4(a0)       // t1 - delay_line
+    lh  t2, 8(a0)       // t2 - coeffs_len
+    lh  t3, 10(a0)      // t3 - pos
+    lh  t6, 16(a0)      // t6 - shift
+    add t6, t6, -15
+    neg t6, t6          // t6 = 15 - shift, shift amount applied to the accumulator
+    lw  t5, 20(a0)      // t5 - rounding_buff
+    lw  s2, 4(a0)       // s2 - delay_line* current position
+    add s2, s2, t3      // s2 = delay_line + pos*2
+    add s2, s2, t3      //
+    add s4, t2, t2      // s4 = coeff_len*2
+    add s0, t1, s4      // s0 - &delay[coeffs_len]
+
+    lh  a4,  0(t1)      // first delay line sample, a4 is not read afterwards
+.loop_len:
+         lh  t4, 12(a0)          // t4 - decim
+        .loop_decim_copy:
+            lh   s1, 0(a1)      // load input data
+            add  a1, a1, 2
+
+            sh   s1, 0(s2)            
+            add  s2, s2, 2     // preincrement of delay line
+            bgt  s0, s2, .skeep_reset
+                lw  s2, 4(a0)       // s2 - delay_line, wrap around to the start
+            .skeep_reset:
+            add  t4, t4, -1
+        bgtz t4, .loop_decim_copy
+
+        // s5 - count1 = length - pos
+        // s6 = count1 >> 3 :  
+        sub  t3, s2, t1
+        srli t3, t3, 1          // t3 = (pos*2)>>1
+        sub  s5, t2, t3
+        srli s6, s5, 3          // s6 = (coeff_len - pos)>>3
+
+        srli s7, t3, 3          // s7 = pos>>3
+        and  s8, t3, 0x07       // s8 = pos&0x07
+
+        esp.ld.xacc.ip           t5, 0                                          // load rounding value to accx
+
+        lw  s10, 0(a0)          // s10 - coeffs
+        esp.vld.128.ip          q0, s10, 16 //q0 - coeffs
+        mv      s9, s2          // s9 - pointer to delay line
+        esp.vld.128.ip          q1, s9, 16  // q1 - delay line data
+
+        // part 1: from the current position up to the end of the delay line
+        beqz s6, .skip_main_loop1
+        esp.lp.setup    0, s6, .main_loop1
+            esp.vmulas.s16.xacc.ld.ip     q0, s10, 16, q0, q1   // q0 - coeffs, q1 - data
+        .main_loop1:     esp.vld.128.ip  q1, s9, 16              // Load delay line    
+.skip_main_loop1: nop
+
+
+        // step the data pointer back by one vector and by the delay line length (s4 bytes)
+        add     s9, s9, -16
+        sub     s9, s9, s4
+        // pos is not a multiple of 8: one vector straddles the end of the delay line
+        beqz s8, .skip_rest_add
+            esp.vld.128.ip          q2, s9, 16
+            esp.vadd.s16            q1, q2, q1
+            esp.vmulas.s16.xacc.ld.ip     q0, s10, 16, q0, q1   // q0 - coeffs, q1 - data
+        .skip_rest_add: 
+        esp.vld.128.ip          q1, s9, 16
+        
+        // part 2: from the start of the delay line up to the current position
+        beqz s7, .skip_main_loop3
+        esp.lp.setup    1, s7, .main_loop3
+            esp.vmulas.s16.xacc.ld.ip     q0, s10, 16, q0, q1 // q0 - coeffs, q1 - data
+            esp.vld.128.ip  q1, s9, 16
+        .main_loop3: nop
+.skip_main_loop3: nop
+
+        // Shift and Store result
+        esp.srs.s.xacc       s11, t6   // shift accx register by the final shift amount (t6), result goes to s11
+        sh  s11, 0(a2)      // store result to output buffer 
+        add a2, a2, 2
+
+        add  a3, a3, -1
+    bgtz a3, .loop_len
+    sh   t3, 10(a0)         // fir->pos = t3, keep the delay line position for the next call
+
+.fast_exit:
+    mv  a0, a6              // return len
+
+    // Restore the callee-saved registers
+    lw  s0, 0(sp)
+    lw  s1, 4(sp)
+    lw  s2, 8(sp)
+    lw  s3, 12(sp)
+    lw  s4, 16(sp)
+    lw  s5, 20(sp)
+    lw  s6, 24(sp)
+    lw  s7, 28(sp)
+    lw  s8, 32(sp)
+    lw  s9, 36(sp)
+    lw  s10, 40(sp)
+    lw  s11, 44(sp)
+
+    add sp,sp,48
+    ret
+
+#endif // dsps_fird_s16_arp4_enabled
--- a/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fir_f32_ae32.S
+++ b/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fir_f32_ae32.S
@@ -0,0 +1,95 @@
+// Copyright 2018-2023 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dsps_fir_platform.h"
+#if (dsps_fir_f32_ae32_enabled == 1)
+
+#include "dsps_dotprod_f32_m_ae32.S"
+
+// This is FIR filter for ESP32 processor.
+	.text
+	.align  4
+	.global dsps_fir_f32_ae32
+	.type   dsps_fir_f32_ae32,@function
+// The function implements the following C code:
+//esp_err_t dsps_fir_f32_ae32(fir_f32_t* fir, const float* input, float* output, int len);
+//
+// For every input sample: store it into the circular delay line, advance the position
+// with wrap-around, then accumulate coeffs[k] * delay[...] over delay[pos..N-1] followed
+// by delay[0..pos-1] and store the sum to the output.
+// Offsets into the fir structure used below: 0 - coeffs, 4 - delay, 8 - N, 12 - pos.
+
+dsps_fir_f32_ae32: 
+// fir      - a2
+// input    - a3
+// output   - a4
+// len      - a5
+
+	entry	a1, 16
+	// Array increment for floating point data should be 4
+	l32i    a7,  a2, 12 // a7  - pos
+	movi    a10, 4
+	mull    a13, a7, a10// a13 = pos*4, byte offset of the current position in the delay line
+	l32i    a6,  a2, 8  // a6  - N
+	mull    a6, a6, a10// a6 = N*4 - NOTE(review): this value is overwritten by the reload of N two lines below
+	l32i    a10, a2, 0  // a10 - coeffs
+	l32i    a6,  a2, 8  // a6  - N
+
+	movi.n a9, 0	// a9 = 0, moved into f2 to clear the accumulator
+	movi.n a8, 4	// a8 = 4, step of the coefficient pointer
+	movi.n a12, 4	// a12 = 4, step of the delay line pointer
+
+//  a13 - delay index
+fir_loop_len:
+		// Store to delay line
+		l32i    a11, a2, 4      // a11 - delay line
+		lsi     f0, a3, 0       // f0 = x[i]
+		addi    a3, a3, 4       // x++
+		ssx     f0, a11, a13    // delay[a13] = f0;
+		addi    a13, a13, 4     // a13 += 4, next byte offset
+		addi    a7, a7, 1       // a7++
+		// verify delay line: wrap the position when it reaches N
+		blt     a7, a6, do_not_reset_a13
+			movi    a13, 0
+			movi    a7,  0
+	do_not_reset_a13:
+		// Calc amount for delay line before end
+		mov     a15, a10        // a15 - coeffs
+		wfr	    f2, a9 // f2 = 0;
+		sub   a14, a6, a7   // a14 = N-pos
+
+		// a11 = &delay[pos]
+		add     a11, a11, a13
+
+		loopnez  a14, first_fir_loop // pos...N-1
+			lsxp     f1, a15, a8     // f1 = *coeffs, coefficient pointer stepped by a8 (= 4)
+			lsxp     f0, a11, a12    // load delay f0 = *(delay++)
+			madd.s  f2, f0, f1       // f2 += f0*f1
+first_fir_loop:
+		l32i    a11, a2, 4           // a11 - delay line
+		loopnez  a7, second_fir_loop // 0..pos
+			lsxp     f1, a15, a8     // f1 = *coeffs, coefficient pointer stepped by a8 (= 4)
+			lsxp     f0, a11, a12    // load delay f0 = *(delay++)
+			madd.s  f2, f0, f1      // f2 += f0*f1
+second_fir_loop:
+
+		// and after end
+		// Store result
+		ssi     f2, a4, 0
+		addi    a4, a4, 4 // y++ - increment output pointer
+		// Check loop 
+		addi   a5, a5, -1
+	bnez    a5, fir_loop_len
+	// store state
+
+	s32i    a7,  a2, 12 // pos = a7
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+#endif // dsps_fir_f32_ae32_enabled
--- a/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fir_f32_aes3.S
+++ b/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fir_f32_aes3.S
@@ -0,0 +1,233 @@
+// Copyright 2018-2023 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dsps_fir_platform.h"
+#if (dsps_fir_f32_aes3_enabled == 1)
+
+// This is FIR filter for Esp32s3 processor.
+    .text
+    .align  4
+    .global dsps_fir_f32_aes3
+    .type   dsps_fir_f32_aes3,@function
+// The function implements the following C code:
+//esp_err_t dsps_fir_f32_aes3(fir_f32_t* fir, const float* input, float* output, int len);
+//
+// For every input sample the value is stored into the circular delay line and the
+// dot product with the coefficients is accumulated four taps at a time in f4..f7.
+// The delay line is read with 128-bit loads from a 16-byte rounded address, therefore
+// one of four code paths (.offset_0 .. .offset_3) is selected by the position of the
+// current delay line pointer inside its 16-byte block.
+// NOTE(review): the loop counts are computed with a shift right by 2, which looks like
+// the filter length is expected to be a multiple of 4 - confirm against the init function.
+
+dsps_fir_f32_aes3:
+// fir      - a2
+// input    - a3
+// output   - a4
+// len      - a5
+
+// a2 - fir structure
+// a3 - input
+// a4 - output
+// a5 - length
+
+// a6 - fir length
+// a7 - position in delay line
+// a8 - temp
+// a9 - const 0
+// a10 - coeffs ptr
+// a11 - delay line ptr
+// a12 - const -16 (mask that rounds an address down to 16 bytes)
+// a13 - const 15 (mask that extracts the offset inside a 16-byte block)
+// a14 - temp for loops
+// a15 - delay line rounded to 16
+
+    entry	a1, 16
+    // Array increment for floating point data should be 4
+    l32i    a7,  a2, 12 // a7  - pos
+
+    l32i    a6,  a2, 8  // a6  - N - amount of coefficients
+    l32i    a10, a2, 0  // a10 - coeffs
+    l32i    a11, a2, 4  // a11 - delay line
+    addx4	a11, a7, a11 // a11 = a11 + a7*4	
+    l32i    a6,  a2, 8   // a6  - N
+
+    movi.n	a9, 0
+    movi.n	a12, 3	// NOTE(review): overwritten by the next instruction before it is used
+
+    movi.n	a12, -16
+    movi.n	a13, 15
+// Main loop for input samples
+.fir_loop_len:
+        // Store to delay line
+        lsip	f15,  a3, 4		// a3  += 4, f15 = input[n]
+        ssip	f15, a11, 4		// a11 += 4, *a11 = f15
+        addi    a7,  a7,  1     // a7++ - position in delay line
+
+        // wrap the position and the delay line pointer when the end is reached
+        blt     a7, a6, .do_not_reset_a11
+            l32i    a11, a2, 4	// Load delay line
+            movi    a7,  0
+    .do_not_reset_a11:
+        // Load rounded delay line address
+        and     a15, a11, a12
+
+        l32i    a10, a2, 0  // a10 - coeffs
+
+        // Clear the four accumulators f4..f7 for multiplications
+        const.s f4, 0
+        const.s f5, 0
+        const.s f6, 0
+        const.s f7, 0
+
+        // dispatch on the byte offset of a11 inside its 16-byte block: 0, 4, 8 or 12
+        and		a8, a11, a13		// a8 = a11 & 15
+        beqz   	a8, .offset_0
+        addi   	a8, a8, -4
+        beqz   	a8, .offset_1
+        addi   	a8, a8, -4
+        beqz   	a8, .offset_2
+        addi   	a8, a8, -4
+        beqz   	a8, .offset_3
+
+// a10 - coeffs
+// a11 - delay line
+.offset_0:
+        sub   a14, a6, a7   // a14 = N-pos
+        srli  a14, a14, 2
+        loopnez  a14, .first_fir_loop_0 // pos...N-1
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+            madd.s  f4, f0, f8
+            madd.s  f5, f1, f9
+            madd.s  f6, f2, f10
+            madd.s  f7, f3, f11
+        .first_fir_loop_0:
+        
+        l32i    a15, a2, 4  // a15 - delay line [0]	
+        srli  a14, a7, 2
+        loopnez  a14, .second_fir_loop_0 // 0..pos
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+            madd.s  f4, f0, f8
+            madd.s  f5, f1, f9
+            madd.s  f6, f2, f10
+            madd.s  f7, f3, f11
+        .second_fir_loop_0:
+        j    .store_fir_result;
+
+.offset_1:
+        sub   a14, a6, a7   // a14 = N-pos
+        addi  a14, a14, 3
+        srli  a14, a14, 2   // a14 = (N-pos+3)/4, rounded up
+        EE.LDF.128.IP f11, f10, f9, f12, a15, 16 // Load data from delay line
+        // f12 - delay[N-1], store for the last operation
+        // f9..f11 - delay[0..2]
+        loopnez  a14, .first_fir_loop_1 // pos...N-1
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            madd.s  f4, f0, f9
+            madd.s  f5, f1, f10
+            madd.s  f6, f2, f11
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+            madd.s  f7, f3, f8
+        .first_fir_loop_1:
+        
+        l32i    a15, a2, 4  // a15 - delay line [0]
+        EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line		
+        srli  a14, a7, 2
+        loopnez  a14, .second_fir_loop_1 // 0..pos
+            madd.s  f4, f3, f8
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            madd.s  f5, f0, f9
+            madd.s  f6, f1, f10
+            madd.s  f7, f2, f11
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+        .second_fir_loop_1:
+
+        // the sample kept aside in f12 is multiplied last
+        madd.s  f4, f3, f12
+        j    .store_fir_result;
+
+.offset_2:
+        sub   a14, a6, a7   // a14 = N-pos
+        addi  a14, a14, 3
+        srli  a14, a14, 2   // a14 = (N-pos+3)/4, rounded up
+        EE.LDF.128.IP f11, f10, f13, f12, a15, 16 // Load data from delay line
+        // f12, f13 - delay[N-1], delay[N-2], store for the last operation
+        // f10..f11 - delay[0..1]
+        loopnez  a14, .first_fir_loop_2 // pos...N-1
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            madd.s  f4, f0, f10
+            madd.s  f5, f1, f11
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+            madd.s  f6, f2, f8
+            madd.s  f7, f3, f9
+        .first_fir_loop_2:
+        
+        l32i    a15, a2, 4  // a15 - delay line [0]
+        EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line		
+        srli  a14, a7, 2
+        loopnez  a14, .second_fir_loop_2 // 0..pos
+            madd.s  f4, f2, f8
+            madd.s  f5, f3, f9
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            madd.s  f6, f0, f10
+            madd.s  f7, f1, f11
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+        .second_fir_loop_2:
+
+        // the two samples kept aside in f12, f13 are multiplied last
+        madd.s  f4, f2, f12
+        madd.s  f5, f3, f13
+        j    .store_fir_result;
+
+.offset_3:
+        sub   a14, a6, a7   // a14 = N-pos
+        addi  a14, a14, 3
+        srli  a14, a14, 2   // a14 = (N-pos+3)/4, rounded up
+        EE.LDF.128.IP f11, f14, f13, f12, a15, 16 // Load data from delay line
+        // f12, f13, f14 - delay[N-1], delay[N-2], delay[N-3], store for the last operation
+        // f11 - delay[0]
+        loopnez  a14, .first_fir_loop_3 // pos...N-1
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            madd.s  f4, f0, f11
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+            madd.s  f5, f1, f8
+            madd.s  f6, f2, f9
+            madd.s  f7, f3, f10
+        .first_fir_loop_3:
+        
+        l32i    a15, a2, 4  // a15 - delay line [0]
+        EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line		
+        srli  a14, a7, 2
+        loopnez  a14, .second_fir_loop_3 // 0..pos
+            madd.s  f4, f1, f8
+            madd.s  f5, f2, f9
+            madd.s  f6, f3, f10
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            madd.s  f7, f0, f11
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+        .second_fir_loop_3:
+
+        // the three samples kept aside in f12..f14 are multiplied last
+        madd.s  f4, f1, f12
+        madd.s  f5, f2, f13
+        madd.s  f4, f3, f14
+
+.store_fir_result:
+
+    // reduce the four partial sums into f4
+    add.s   f4, f4, f5
+    add.s   f6, f6, f7
+    add.s   f4, f4, f6
+
+    // Store result
+    ssip     f4, a4, 4  // y++ - save result and increment output pointer
+    // Check loop length
+    addi   a5, a5, -1
+    bnez    a5, .fir_loop_len
+    // store state
+
+    s32i    a7,  a2, 12 // pos = a7
+    movi.n	a2, 0 // return status ESP_OK
+    retw.n
+
+#endif // dsps_fir_f32_aes3_enabled
--- a/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fir_f32_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fir_f32_ansi.c
@@ -0,0 +1,36 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dsps_fir.h"
+
+// Portable C implementation of the 32-bit floating point FIR filter.
+// For every input sample the sample is written into the circular delay line,
+// the write position is advanced (wrapping at fir->N), and the dot product of
+// the coefficients with the delay line is stored to the output.
+// Always returns ESP_OK.
+esp_err_t dsps_fir_f32_ansi(fir_f32_t *fir, const float *input, float *output, int len)
+{
+    int sample = 0;
+    while (sample < len) {
+        // Push the new sample and move the write position, wrapping to the
+        // start of the buffer once the end is reached.
+        fir->delay[fir->pos] = input[sample];
+        fir->pos += 1;
+        if (!(fir->pos < fir->N)) {
+            fir->pos = 0;
+        }
+
+        float sum = 0;
+        int tap = 0;
+        // First part: from the write position up to the end of the delay line ...
+        for (int idx = fir->pos; idx < fir->N; idx++, tap++) {
+            sum += fir->coeffs[tap] * fir->delay[idx];
+        }
+        // ... second part: from the start of the delay line up to the write position.
+        for (int idx = 0; idx < fir->pos; idx++, tap++) {
+            sum += fir->coeffs[tap] * fir->delay[idx];
+        }
+
+        output[sample] = sum;
+        sample++;
+    }
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fir_init_f32.c
+++ b/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fir_init_f32.c
@@ -0,0 +1,67 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dsps_fir.h"
+#include "malloc.h"
+
+
+// Initialize the state of a 32-bit floating point FIR filter.
+//
+// fir        - filter structure to fill in
+// coeffs     - coefficient array of coeffs_len elements (not copied)
+// delay      - delay line of coeffs_len + 4 elements, or NULL to have it
+//              allocated here (release it with dsps_fir_f32_free())
+// coeffs_len - amount of coefficients
+//
+// Returns ESP_OK on success, ESP_ERR_NO_MEM when the delay line could not be
+// allocated and, on ESP32-S3 only, ESP_ERR_DSP_INVALID_LENGTH /
+// ESP_ERR_DSP_ARRAY_NOT_ALIGNED when the length is not a multiple of 4 or an
+// array is not aligned to 16 bytes.
+esp_err_t dsps_fir_init_f32(fir_f32_t *fir, float *coeffs, float *delay, int coeffs_len)
+{
+    // The delay line holds coeffs_len + 4 elements, the same length the public
+    // header requires for a caller supplied buffer.
+    const int delay_len = coeffs_len + 4;
+
+    // Allocate delay line in case if it's NULL
+    if (delay == NULL) {
+#ifdef CONFIG_IDF_TARGET_ESP32S3
+        delay = (float *)memalign(16, delay_len * sizeof(float));
+#else
+        delay = (float *)malloc(delay_len * sizeof(float));
+#endif // CONFIG_IDF_TARGET_ESP32S3
+        if (delay == NULL) {
+            // Out of memory: leave the structure in a state that is safe to
+            // pass to dsps_fir_f32_free() and report the failure instead of
+            // writing through a NULL pointer below.
+            fir->delay = NULL;
+            fir->use_delay = 0;
+            return ESP_ERR_NO_MEM;
+        }
+        fir->use_delay = 1;
+    } else {
+        fir->use_delay = 0;
+    }
+
+    // Clear the complete delay line, including the 4 extra elements
+    for (int i = 0; i < delay_len; i++) {
+        delay[i] = 0;
+    }
+    fir->coeffs = coeffs;
+    fir->delay = delay;
+    fir->N = coeffs_len;
+    fir->pos = 0;
+
+#ifdef CONFIG_IDF_TARGET_ESP32S3
+    // The amount of coefficients must be a multiple of 4
+    if (fir->N % 4 != 0) {
+        return ESP_ERR_DSP_INVALID_LENGTH;
+    }
+    // The coeffs array should be aligned to 16
+    if (((uint32_t)coeffs) & 0x0f) {
+        return ESP_ERR_DSP_ARRAY_NOT_ALIGNED;
+    }
+    // The delay array should be aligned to 16
+    if (((uint32_t)delay) & 0x0f) {
+        return ESP_ERR_DSP_ARRAY_NOT_ALIGNED;
+    }
+#endif // CONFIG_IDF_TARGET_ESP32S3
+
+    return ESP_OK;
+}
+
+// Release the delay line if (and only if) it was allocated by
+// dsps_fir_init_f32(). A caller supplied delay line is left untouched.
+// Always returns ESP_OK.
+esp_err_t dsps_fir_f32_free(fir_f32_t *fir)
+{
+    if (fir->use_delay != 0) {
+        fir->use_delay = 0;
+        free(fir->delay);
+        // Do not keep a dangling pointer to the released buffer
+        fir->delay = NULL;
+    }
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fird_f32_ae32.S
+++ b/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fird_f32_ae32.S
@@ -0,0 +1,98 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dsps_fir_platform.h"
+#if (dsps_fird_f32_ae32_enabled == 1)
+
+#include "dsps_dotprod_f32_m_ae32.S"
+
+// This is FIR filter for ESP32 processor.
+	.text
+	.align  4
+	.global dsps_fird_f32_ae32
+	.type   dsps_fird_f32_ae32,@function
+// The function implements the following C code:
+//int dsps_fird_f32_ae32(fir_f32_t* fir, const float* input, float* output, int len);
+
+dsps_fird_f32_ae32: 
+// fir      - a2
+// input    - a3
+// output   - a4
+// len      - a5
+//
+// Decimating FIR filter. For each of the len output samples, decim input
+// samples are pushed into the circular delay line and then the dot product of
+// the N coefficients with the delay line is stored to the output.
+// The fir structure is accessed by byte offset: 0 - coeffs, 4 - delay,
+// 8 - N, 12 - pos, 16 - decim.
+// The value returned in a2 is the number of output samples written (a8).
+// NOTE(review): the output loop is tested at the bottom (addi/bnez), so
+// len == 0 is not treated as "no work" - confirm callers never pass 0.
+
+	entry	a1, 16
+	// Array increment for floating point data should be 4
+	l32i    a7,  a2, 12 // a7  - pos
+	movi    a10, 4
+	mull    a13, a7, a10// a13 - a7*4, byte offset of pos inside the delay line
+	// NOTE: the N*4 value computed by the next two instructions is never used,
+	// a6 is reloaded with N a few lines below.
+	l32i    a6,  a2, 8  // a6  - N
+	mull    a6, a6, a10// a6 = a6*4
+	l32i    a10, a2, 0  // a10 - coeffs
+	l32i    a11, a2, 4  // a11 - delay line
+	l32i    a6,  a2, 8  // a6  - N (element count, compared with a7)
+	l32i    a12, a2, 16  // a12  - decimation
+	movi    a8, 0         // a8 - number of output samples produced so far
+
+//  a13 - delay index (in bytes)
+fird_loop_len:
+		// Store to delay line
+		
+        loopnez  a12, .fird_load_data // K loops (K = decimation)
+			lsip    f0, a3, 4       // f0 = x[i++]
+			ssx     f0, a11, a13    // delay[a13] = f0;
+			addi    a13, a13, 4     // a13++
+			addi    a7, a7, 1       // a7++
+			// verify delay line: wrap to the start when the end is reached
+			blt     a7, a6, do_not_reset_a13
+				movi    a13, 0
+				movi    a7,  0
+			do_not_reset_a13:
+			// Clear the accumulator. This is the last instruction of the loop
+			// body, so it is executed only when decimation != 0.
+        	const.s f2, 0
+		.fird_load_data:
+
+		addi    a8, a8, 1       // one more output sample
+
+		// Calc amount for delay line before end
+		mov     a15, a10        // a15 - coeffs
+		sub   a14, a6, a7   // a14 = N-pos
+		loopnez  a14, first_fird_loop // pos...N-1
+			lsip    f1, a15, 4		// f1 = *coeffs, a15++
+			lsx     f0, a11, a13    // load delay f0 = delay[pos]
+			addi    a13, a13, 4     // a13++, pos++
+			madd.s  f2, f0, f1      // f2 += f0*f1
+first_fird_loop:
+		movi a13, 0    // load delay line counter to 0
+		loopnez  a7, second_fird_loop // 0..pos
+			lsip    f1, a15, 4		// f1 = *coeffs, a15++
+			lsx     f0, a11, a13    // load delay f0 = delay[a13]
+			addi    a13, a13, 4     // a13++
+			madd.s  f2, f0, f1      // f2 += f0*f1
+second_fird_loop:
+		// Here a13 == pos*4 again, ready for the next store to the delay line
+
+		// and after end
+		// Store result
+		ssi     f2, a4, 0
+		addi    a4, a4, 4 // y++ - increment output pointer
+next_itt_fir32:        
+		// Check loop 
+		addi   a5, a5, -1
+	bnez    a5, fird_loop_len
+	// store state
+
+	s32i    a7,  a2, 12 // pos = a7
+	
+	mov 	a2, a8 // return the number of output samples
+	retw.n
+
+#endif // dsps_fird_f32_ae32_enabled
--- a/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fird_f32_aes3.S
+++ b/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fird_f32_aes3.S
@@ -0,0 +1,239 @@
+// Copyright 2018-2023 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dsps_fir_platform.h"
+#if (dsps_fird_f32_aes3_enabled == 1)
+
+// This is FIR filter for Esp32s3 processor.
+    .text
+    .align  4
+    .global dsps_fird_f32_aes3
+    .type   dsps_fird_f32_aes3,@function
+// The function implements the following C code:
+//int dsps_fird_f32_aes3(fir_f32_t* fir, const float* input, float* output, int len);
+
+dsps_fird_f32_aes3:
+// fir      - a2
+// input    - a3
+// output   - a4
+// len      - a5
+//
+// Decimating FIR filter. For each of the len output samples, decim input
+// samples are stored to the circular delay line, then the dot product of the
+// coefficients with the delay line is computed 4 elements at a time with
+// 128-bit loads. Four variants of the dot product (.offset_0 ... .offset_3)
+// are selected by the byte offset of the write pointer inside a 16-byte
+// group. The function returns the original len (saved in a9).
+// NOTE(review): the output loop is tested at the bottom (addi/bnez), so
+// len == 0 is not treated as "no work" - confirm callers never pass 0.
+
+// a2 - fir structure
+// a3 - input
+// a4 - output
+// a5 - length
+
+// a6 - fir length
+// a7 - position in delay line
+// a8 - temp
+// a9 - saved length, used as the return value
+// a10 - coeffs ptr
+// a11 - delay line ptr (write pointer)
+// a12 - const -16, mask to round an address down to 16 bytes
+// a13 - const 15, mask for the byte offset inside a 16-byte group
+// a14 - temp for loops
+// a15 - delay line rounded to 16
+
+    entry	a1, 16
+    // Array increment for floating point data should be 4
+    l32i    a7,  a2, 12 // a7  - pos
+
+    l32i    a6,  a2, 8  // a6  - N - amount of coefficients
+    l32i    a10, a2, 0  // a10 - coeffs
+    l32i    a11, a2, 4  // a11 - delay line
+    addx4	a11, a7, a11 // a11 = a11 + a7*4
+    l32i    a6,  a2, 8   // a6  - N (same value as loaded above)
+
+    mov.n    a9, a5     // a9 - keep len for the return value
+    movi.n	a12, 3      // NOTE: dead value, a12 is overwritten by the next instruction
+
+    movi.n	a12, -16
+    movi.n	a13, 15
+// Main loop for input samples
+.fird_loop_len:
+        // Store K values from input to delay line:
+
+        l32i    a14,  a2, 16   // a14  - decimation
+        loopnez  a14, .fird_load_data // K loops
+            // Store to delay line
+            lsip	f15,  a3, 4		// a3  += 4, f15 = input[n]
+            ssip	f15, a11, 4		// *a11 = f15, a11 += 4
+            addi    a7,  a7,  1     // a7++ - position in delay line
+
+            blt     a7, a6, .do_not_reset_a11
+                l32i    a11, a2, 4	// Wrap: reload the start of the delay line
+                movi    a7,  0
+            .do_not_reset_a11:
+            and     a15, a11, a12   // a15 = a11 rounded down to 16 bytes
+        .fird_load_data:
+        //
+        // Process data
+        //
+        // Load rounded delay line address
+
+        l32i    a10, a2, 0  // a10 - coeffs
+
+        // Clear f4..f7, the four partial accumulators
+        const.s f4, 0
+        const.s f5, 0
+        const.s f6, 0
+        const.s f7, 0
+
+        // Select the variant by the byte offset (0, 4, 8 or 12) of the write
+        // pointer inside its 16-byte group.
+        and		a8, a11, a13		// a8 = a11 & 15
+        beqz   	a8, .offset_0
+        addi   	a8, a8, -4
+        beqz   	a8, .offset_1
+        addi   	a8, a8, -4
+        beqz   	a8, .offset_2
+        addi   	a8, a8, -4
+        beqz   	a8, .offset_3
+        // No match falls through to .offset_0
+
+// a10 - coeffs
+// a11 - delay line
+// Write pointer is 16-byte aligned: groups of 4 delay elements are multiplied
+// directly with groups of 4 coefficients.
+.offset_0:
+        sub   a14, a6, a7   // a14 = N-pos
+        srli  a14, a14, 2   // count of 4-element groups
+        loopnez  a14, .first_fir_loop_0 // pos...N-1
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+            madd.s  f4, f0, f8
+            madd.s  f5, f1, f9
+            madd.s  f6, f2, f10
+            madd.s  f7, f3, f11
+        .first_fir_loop_0:
+        
+        l32i    a15, a2, 4  // a15 - start of the delay line
+        srli  a14, a7, 2    // count of 4-element groups
+        loopnez  a14, .second_fir_loop_0 // 0..pos
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+            madd.s  f4, f0, f8
+            madd.s  f5, f1, f9
+            madd.s  f6, f2, f10
+            madd.s  f7, f3, f11
+        .second_fir_loop_0:
+        j    .store_fir_result;
+
+// Write pointer is 4 bytes past a 16-byte boundary: the first rounded load
+// also fetches one element that precedes the write position. It is kept in
+// f12 and multiplied with the last coefficient after the second loop.
+.offset_1:
+        sub   a14, a6, a7   // a14 = N-pos
+        addi  a14, a14, 3
+        srli  a14, a14, 2   // count of 4-element groups, rounded up
+        EE.LDF.128.IP f11, f10, f9, f12, a15, 16 // Load data from delay line
+        // f12 - delay[N-1], store for the last operation
+        // f9..f11 - delay[0..2]
+        loopnez  a14, .first_fir_loop_1 // pos...N-1
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            madd.s  f4, f0, f9
+            madd.s  f5, f1, f10
+            madd.s  f6, f2, f11
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+            madd.s  f7, f3, f8
+        .first_fir_loop_1:
+        
+        l32i    a15, a2, 4  // a15 - start of the delay line
+        EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+        srli  a14, a7, 2
+        loopnez  a14, .second_fir_loop_1 // 0..pos
+            madd.s  f4, f3, f8
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            madd.s  f5, f0, f9
+            madd.s  f6, f1, f10
+            madd.s  f7, f2, f11
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+        .second_fir_loop_1:
+
+        madd.s  f4, f3, f12     // last coefficient with the saved element
+        j    .store_fir_result;
+
+// Write pointer is 8 bytes past a 16-byte boundary: two elements that precede
+// the write position are kept in f12, f13 for the last operations.
+.offset_2:
+        sub   a14, a6, a7   // a14 = N-pos
+        addi  a14, a14, 3
+        srli  a14, a14, 2   // count of 4-element groups, rounded up
+        EE.LDF.128.IP f11, f10, f13, f12, a15, 16 // Load data from delay line
+        // f12, f13 - delay[N-1], delay[N-2], store for the last operation
+        // f10..f11 - delay[0..1]
+        loopnez  a14, .first_fir_loop_2 // pos...N-1
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            madd.s  f4, f0, f10
+            madd.s  f5, f1, f11
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+            madd.s  f6, f2, f8
+            madd.s  f7, f3, f9
+        .first_fir_loop_2:
+        
+        l32i    a15, a2, 4  // a15 - start of the delay line
+        EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+        srli  a14, a7, 2
+        loopnez  a14, .second_fir_loop_2 // 0..pos
+            madd.s  f4, f2, f8
+            madd.s  f5, f3, f9
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            madd.s  f6, f0, f10
+            madd.s  f7, f1, f11
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+        .second_fir_loop_2:
+
+        madd.s  f4, f2, f12     // last two coefficients with the saved elements
+        madd.s  f5, f3, f13
+        j    .store_fir_result;
+
+// Write pointer is 12 bytes past a 16-byte boundary: three elements that
+// precede the write position are kept in f12, f13, f14 for the last operations.
+.offset_3:
+        sub   a14, a6, a7   // a14 = N-pos
+        addi  a14, a14, 3
+        srli  a14, a14, 2   // count of 4-element groups, rounded up
+        EE.LDF.128.IP f11, f14, f13, f12, a15, 16 // Load data from delay line
+        // f12, f13, f14 - delay[N-1], delay[N-2], delay[N-3], store for the last operation
+        // f11 - delay[0]
+        loopnez  a14, .first_fir_loop_3 // pos...N-1
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            madd.s  f4, f0, f11
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+            madd.s  f5, f1, f8
+            madd.s  f6, f2, f9
+            madd.s  f7, f3, f10
+        .first_fir_loop_3:
+        
+        l32i    a15, a2, 4  // a15 - start of the delay line
+        EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+        srli  a14, a7, 2
+        loopnez  a14, .second_fir_loop_3 // 0..pos
+            madd.s  f4, f1, f8
+            madd.s  f5, f2, f9
+            madd.s  f6, f3, f10
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            madd.s  f7, f0, f11
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+        .second_fir_loop_3:
+
+        madd.s  f4, f1, f12     // last three coefficients with the saved elements
+        madd.s  f5, f2, f13
+        madd.s  f4, f3, f14
+
+.store_fir_result:
+
+    // Combine the four partial accumulators into f4
+    add.s   f4, f4, f5
+    add.s   f6, f6, f7
+    add.s   f4, f4, f6
+
+    // Store result
+    ssip     f4, a4, 4  // y++ - save result and increment output pointer
+    // Check loop length
+    addi   a5, a5, -1
+    bnez    a5, .fird_loop_len
+    // store state
+
+    s32i    a7,  a2, 12 // pos = a7
+    mov.n    a2,  a9    // return the original len
+    retw.n
+
+#endif // dsps_fird_f32_aes3_enabled
--- a/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fird_f32_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fird_f32_ansi.c
@@ -0,0 +1,38 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dsps_fir.h"
+
+// Portable C implementation of the 32-bit floating point decimating FIR filter.
+// For every output sample, fir->decim input samples are pushed into the
+// circular delay line and one dot product of the coefficients with the delay
+// line is written to the output.
+// Returns the number of samples written to the output array.
+int dsps_fird_f32_ansi(fir_f32_t *fir, const float *input, float *output, int len)
+{
+    int produced = 0;
+    while (produced < len) {
+        // Feed the next group of input samples into the delay line,
+        // wrapping the write position at the end of the buffer.
+        for (int remaining = fir->decim; remaining > 0; remaining--) {
+            fir->delay[fir->pos] = *input;
+            input++;
+            fir->pos++;
+            if (!(fir->pos < fir->N)) {
+                fir->pos = 0;
+            }
+        }
+
+        float sum = 0;
+        int tap = 0;
+        // First part: from the write position up to the end of the delay line ...
+        for (int idx = fir->pos; idx < fir->N; idx++, tap++) {
+            sum += fir->coeffs[tap] * fir->delay[idx];
+        }
+        // ... second part: from the start of the delay line up to the write position.
+        for (int idx = 0; idx < fir->pos; idx++, tap++) {
+            sum += fir->coeffs[tap] * fir->delay[idx];
+        }
+
+        output[produced] = sum;
+        produced++;
+    }
+    return produced;
+}
--- a/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fird_f32_arp4.S
+++ b/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fird_f32_arp4.S
@@ -0,0 +1,99 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dsps_fir_platform.h"
+#if (dsps_fird_f32_arp4_enabled == 1)
+
+// This is FIR filter for esp32p4 processor.
+    .text
+    .align  4
+    .global dsps_fird_f32_arp4
+    .type   dsps_fird_f32_arp4,@function
+// The function implements the following C code:
+//int dsps_fird_f32_arp4(fir_f32_t* fir, const float* input, float* output, int len);
+
+// Decimating FIR filter.
+// a0 - fir structure, a1 - input, a2 - output, a3 - len
+// For each of the len output samples, decim input samples are stored to the
+// circular delay line and the dot product of the coefficients with the delay
+// line is stored to the output. The fir structure is accessed by byte offset:
+// 0 - coeffs, 4 - delay, 8 - N, 12 - pos, 16 - decim.
+// The function returns the original len (saved in a6).
+dsps_fird_f32_arp4:
+    add sp,sp,-16
+    
+    mv  a6, a3      // a6 - keep len for the return value
+    lw  t1, 4(a0)   // t1 - delay
+    lw  a4, 4(a0)   // a4 - delay (start of the delay line, never modified)
+    lw  t2, 8(a0)   // t2 - N :FIR filter coefficients amount
+    lw  t3, 12(a0)  // t3 - pos
+    lw  t4, 16(a0)  // t4 - decim
+    slli    t3, t3, 2   // t3 = pos*4 (bytes)
+    add     t1, t1, t3  // t1 = &delay[pos], write pointer
+    slli    t6, t2, 2   // t6 = N*4 (bytes)
+    add     t3, a4, t6  // t3 = &delay[N], end of the delay line
+
+    nop
+.fird_loop_len:
+//    p.lw      a1, 4(a1)
+//fmv.w.x   fa5,zero
+    flw         fa0, 0(a1)  // f0 = x[i],  first load
+    // NOTE(review): the last instruction of the loop below preloads the next
+    // input sample, so after the final iteration one element past the consumed
+    // input is read - confirm that this over-read is acceptable.
+    esp.lp.setup    0, t4, .fird_load_data      // label to the last executed instruction
+        add         a1, a1, 4                   // i++
+        fsw         fa0, 0(t1)                  // delay[pos]
+        add         t1, t1, 4
+        blt         t1, t3, .do_not_reset_pos # if t1 < t3 (end not reached) do not wrap
+            lw  t1, 4(a0)   // t1 - delay, wrap to the start
+    .do_not_reset_pos:
+    .fird_load_data:    flw         fa0, 0(a1)                  // f0 = x[i]
+
+    lw      t0, 0(a0)   // t0 - coeffs
+    sub     t5, t3, t1  // (last_pos - pos)*4
+    srli    t5, t5, 2   // N-pos
+    sub     t6, t1, a4
+    srli    t6, t6, 2   // pos
+
+    fmv.w.x fa2,zero    // fa2 - accumulator
+    
+    lw  a5, 0(a0)   // a5 - coeffs 
+    // First part: delay[pos ... N-1]
+    esp.lp.setup    0, t5, .first_fird_loop
+        flw     fa1, 0(a5)
+        flw     fa0, 0(t1)
+        addi    a5, a5, 4
+        fmadd.s   fa2, fa1, fa0, fa2
+.first_fird_loop:  addi      t1, t1, 4
+
+
+    lw  t1, 4(a0)   // t1 - delay
+
+    // Second part: delay[0 ... pos-1], skipped when pos == 0
+    beqz    t6, .skeep_loop
+    esp.lp.setup    0, t6, .second_fird_loop
+        flw     fa1, 0(a5)
+        flw     fa0, 0(t1)
+        addi    a5, a5, 4
+        fmadd.s   fa2, fa1, fa0, fa2
+.second_fird_loop:   addi      t1, t1, 4
+
+.skeep_loop:
+    // Here t1 == &delay[pos] again, ready for the next store to the delay line
+    // Store result
+
+    fsw     fa2, 0(a2)
+    addi    a2, a2, 4
+
+    addi    a3, a3, -1
+    BNEZ    a3, .fird_loop_len// Jump if != 0
+
+    sub     t6,  t1, a4
+    srli    t6, t6, 2   // pos
+
+    sw  t6, 12(a0)  // fir->pos = t6
+
+    mv  a0, a6      // return the original len
+    add sp,sp,16
+    ret
+
+#endif // dsps_fird_f32_arp4_enabled
--- a/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fird_init_f32.c
+++ b/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fird_init_f32.c
@@ -0,0 +1,46 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dsps_fir.h"
+
+
+// Initialize the state of a 32-bit floating point decimating FIR filter.
+//
+// fir    - filter structure to fill in
+// coeffs - coefficient array of N elements (not copied)
+// delay  - caller supplied delay line of N elements, cleared here
+// N      - amount of coefficients
+// decim  - decimation factor
+//
+// Returns ESP_OK on success and, on ESP32-S3 only,
+// ESP_ERR_DSP_INVALID_LENGTH / ESP_ERR_DSP_ARRAY_NOT_ALIGNED when N is not a
+// multiple of 4 or an array is not aligned to 16 bytes.
+esp_err_t dsps_fird_init_f32(fir_f32_t *fir, float *coeffs, float *delay, int N, int decim)
+{
+    fir->coeffs = coeffs;
+    fir->delay = delay;
+    fir->N = N;
+    fir->pos = 0;
+    fir->decim = decim;
+    // The delay line always belongs to the caller here. Mark it so, otherwise
+    // dsps_fir_f32_free() would read an uninitialized flag and could free it.
+    fir->use_delay = 0;
+
+#ifdef CONFIG_IDF_TARGET_ESP32S3
+    // The amount of coefficients must be a multiple of 4,
+    // if it is not, pad the filter with zero coefficients up to a multiple of 4
+    if (fir->N % 4 != 0) {
+        return ESP_ERR_DSP_INVALID_LENGTH;
+    }
+    // The coeffs array should be aligned to 16
+    if (((uint32_t)coeffs) & 0x0f) {
+        return ESP_ERR_DSP_ARRAY_NOT_ALIGNED;
+    }
+    // The delay array should be aligned to 16
+    if (((uint32_t)delay) & 0x0f) {
+        return ESP_ERR_DSP_ARRAY_NOT_ALIGNED;
+    }
+#endif // CONFIG_IDF_TARGET_ESP32S3
+
+    for (int i = 0 ; i < N; i++) {
+        fir->delay[i] = 0;
+    }
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/fir/include/dsps_fir.h
+++ b/managed_components/espressif__esp-dsp/modules/fir/include/dsps_fir.h
@@ -0,0 +1,279 @@
+// Copyright 2018-2022 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef _dsps_fir_H_
+#define _dsps_fir_H_
+
+
+#include "dsp_err.h"
+
+#include "dsps_fir_platform.h"
+#include "dsp_common.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * @brief Data struct of f32 fir filter
+ *
+ * This structure is used by a filter internally. A user should access this structure only in case of
+ * extensions for the DSP Library.
+ * All fields of this structure are initialized by the dsps_fir_init_f32(...) function.
+ */
+typedef struct fir_f32_s {
+    /*
+     * NOTE: the assembly implementations read the first five fields by fixed
+     * byte offset (0, 4, 8, 12 and 16), so their order and size must be kept.
+     */
+    float  *coeffs;     /*!< Pointer to the coefficient buffer.*/
+    float  *delay;      /*!< Pointer to the delay line buffer.*/
+    int     N;          /*!< FIR filter coefficients amount.*/
+    int     pos;        /*!< Position in delay line.*/
+    int     decim;      /*!< Decimation factor.*/
+    int16_t use_delay;  /*!< Non-zero if the delay line was allocated by the init function and has to be released by dsps_fir_f32_free().*/
+} fir_f32_t;
+
+/**
+ * @brief Data struct of s16 fir filter
+ *
+ * This structure is used by a filter internally. A user should access this structure only in case of
+ * extensions for the DSP Library.
+ * All fields of this structure are initialized by the dsps_fir_init_s16(...) function.
+ */
+typedef struct fir_s16_s {
+    /* Lengths and counters are stored as int16_t, which bounds the filter length. */
+    int16_t    *coeffs;         /*!< Pointer to the coefficient buffer.*/
+    int16_t    *delay;          /*!< Pointer to the delay line buffer.*/
+    int16_t     coeffs_len;     /*!< FIR filter coefficients amount.*/
+    int16_t     pos;            /*!< Position in delay line.*/
+    int16_t     decim;          /*!< Decimation factor.*/
+    int16_t     d_pos;          /*!< Actual decimation counter.*/
+    int16_t     shift;          /*!< Shift value of the result.*/
+    int32_t    *rounding_buff;  /*!< Rounding buffer for the purposes of esp32s3 ee.ld.accx.ip assembly instruction */
+    int32_t     rounding_val;   /*!< Rounding value*/
+    int16_t     free_status;    /*!< Indicator for dsps_fird_s16_aes3_free() function*/
+} fir_s16_t;
+
+/**
+ * @brief   initialize structure for 32 bit FIR filter
+ *
+ * Function initialize structure for 32 bit floating point FIR filter
+ * The implementation use ANSI C and could be compiled and run on any platform
+ *
+ * @param fir: pointer to fir filter structure, that must be preallocated
+ * @param coeffs: array with FIR filter coefficients. Must be length N
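+ * @param delay: array for FIR filter delay line. Must have a length = coeffs_len + 4.
+ *               If NULL, the delay line is allocated by this function and has to be released with dsps_fir_f32_free().
+ * @param coeffs_len: FIR filter length. Length of coeffs array. For esp32s3 the length must be a multiple of 4, and the coeffs and delay arrays must be aligned to 16 bytes.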
+ *
+ * @return
+ *      - ESP_OK on success
+ *      - One of the error codes from DSP library
+ */
+esp_err_t dsps_fir_init_f32(fir_f32_t *fir, float *coeffs, float *delay, int coeffs_len);
+
+/**
+ * @brief   initialize structure for 32 bit Decimation FIR filter
+ * Function initialize structure for 32 bit floating point FIR filter with decimation
+ * The implementation use ANSI C and could be compiled and run on any platform
+ *
+ * @param fir: pointer to fir filter structure, that must be preallocated
+ * @param coeffs: array with FIR filter coefficients. Must be length N
+ * @param delay: array for FIR filter delay line. Must be length N
+ * @param N: FIR filter length. Length of coeffs and delay arrays.
+ * @param decim: decimation factor.
+ *
+ * @return
+ *      - ESP_OK on success
+ *      - One of the error codes from DSP library
+ */
+esp_err_t dsps_fird_init_f32(fir_f32_t *fir, float *coeffs, float *delay, int N, int decim);
+
+/**
+ * @brief   initialize structure for 16 bit Decimation FIR filter
+ * Function initialize structure for 16 bit signed fixed point FIR filter with decimation
+ * The implementation use ANSI C and could be compiled and run on any platform
+ *
+ * @param fir: pointer to fir filter structure, that must be preallocated
+ * @param coeffs: array with FIR filter coefficients. Must be length N
+ * @param delay: array for FIR filter delay line. Must be length N
+ * @param coeffs_len: FIR filter length. Length of coeffs and delay arrays.
+ * @param decim: decimation factor.
+ * @param start_pos: initial value of decimation counter. Must be [0..d)
+ * @param shift: shift position of the result
+ *
+ * @return
+ *      - ESP_OK on success
+ *      - One of the error codes from DSP library
+ */
+esp_err_t dsps_fird_init_s16(fir_s16_t *fir, int16_t *coeffs, int16_t *delay, int16_t coeffs_len, int16_t decim, int16_t start_pos, int16_t shift);
+
+
+/**@{*/
+/**
+ * @brief   32 bit floating point FIR filter
+ *
+ * Function implements FIR filter
+ * The extension (_ansi) uses ANSI C and could be compiled and run on any platform.
+ * The extension (_ae32) is optimized for ESP32 chip.
+ *
+ * @param fir: pointer to fir filter structure, that must be initialized before
+ * @param[in] input: input array
+ * @param[out] output: array with the result of FIR filter
+ * @param[in] len: length of input and result arrays
+ *
+ * @return
+ *      - ESP_OK on success
+ *      - One of the error codes from DSP library
+ */
+esp_err_t dsps_fir_f32_ansi(fir_f32_t *fir, const float *input, float *output, int len);
+esp_err_t dsps_fir_f32_ae32(fir_f32_t *fir, const float *input, float *output, int len);
+esp_err_t dsps_fir_f32_aes3(fir_f32_t *fir, const float *input, float *output, int len);
+/**@}*/
+
+/**@{*/
+/**
+ *  @brief   32 bit floating point Decimation FIR filter
+ *
+ * Function implements FIR filter with decimation
+ * The extension (_ansi) uses ANSI C and could be compiled and run on any platform.
+ * The extension (_ae32) is optimized for ESP32 chip.
+ *
+ * @param fir: pointer to fir filter structure, that must be initialized before
+ * @param input: input array
+ * @param output: array with the result of FIR filter
+ * @param len: length of result array
+ *
+ * @return: function returns the number of samples stored in the output array.
+ *          The implementations in this module write exactly len output samples
+ *          and read len * decimation samples from the input array.
+ */
+int dsps_fird_f32_ansi(fir_f32_t *fir, const float *input, float *output, int len);
+int dsps_fird_f32_ae32(fir_f32_t *fir, const float *input, float *output, int len);
+int dsps_fird_f32_aes3(fir_f32_t *fir, const float *input, float *output, int len);
+int dsps_fird_f32_arp4(fir_f32_t *fir, const float *input, float *output, int len);
+/**@}*/
+
+/**@{*/
+/**
+ *  @brief   16 bit signed fixed point Decimation FIR filter
+ *
+ * Function implements FIR filter with decimation
+ * The extension (_ansi) uses ANSI C and could be compiled and run on any platform.
+ * The extension (_ae32) is optimized for ESP32 chip.
+ *
+ * @param fir: pointer to fir filter structure, that must be initialized before
+ * @param input: input array
+ * @param output: array with the result of the FIR filter
+ * @param len: length of the result array
+ *
+ * @return: function returns the number of samples stored in the output array
+ *          depends on the previous state value could be [0..len/decimation]
+ */
+int32_t dsps_fird_s16_ansi(fir_s16_t *fir, const int16_t *input, int16_t *output, int32_t len);
+int32_t dsps_fird_s16_ae32(fir_s16_t *fir, const int16_t *input, int16_t *output, int32_t len);
+int32_t dsps_fird_s16_aes3(fir_s16_t *fir, const int16_t *input, int16_t *output, int32_t len);
+int32_t dsps_fird_s16_arp4(fir_s16_t *fir, const int16_t *input, int16_t *output, int32_t len);
+/**@}*/
+
+
+/**@{*/
+/**
+ * @brief   support arrays freeing function
+ *
+ * Function frees all the arrays, which were created during the initialization of the fir_s16_t structure
+ * 1.   frees allocated memory for rounding buffer, for the purposes of esp32s3 ee.ld.accx.ip assembly instruction
+ * 2.   frees allocated memory in case the delay line is NULL
+ * 3.   frees allocated memory in case the length of the filter (and the delay line) is not divisible by 8
+ *      and new delay line and filter coefficients arrays are created for the purpose of the esp32s3 assembly
+ *
+ * @param fir: pointer to fir filter structure, that must be initialized before
+ *
+ * @return
+ *      - ESP_OK on success
+ */
+esp_err_t dsps_fird_s16_aexx_free(fir_s16_t *fir);
+/**@}*/
+
+
+/**@{*/
+/**
+ * @brief   support arrays freeing function
+ *
+ * Function frees the delay line arrays, if it was allocated by the init functions.
+ *
+ * @param fir: pointer to fir filter structure, that must be initialized before
+ *
+ * @return
+ *      - ESP_OK on success
+ */
+esp_err_t dsps_fir_f32_free(fir_f32_t *fir);
+/**@}*/
+
+
+/**@{*/
+/**
+ * @brief   Array reversal
+ *
+ * Function reverses 16-bit long array members for the purpose of the dsps_fird_s16_aes3 implementation
+ * The function has to be called either during the fir struct initialization or every time the coefficients change
+ *
+ * @param arr: pointer to the array to be reversed
+ * @param len: length of the array to be reversed
+ *
+ * @return
+ *      - ESP_OK on success
+ */
+esp_err_t dsps_16_array_rev(int16_t *arr, int16_t len);
+/**@}*/
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#if CONFIG_DSP_OPTIMIZED
+
+#if (dsps_fir_f32_ae32_enabled == 1)
+#define dsps_fir_f32 dsps_fir_f32_ae32
+#elif (dsps_fir_f32_aes3_enabled == 1)
+#define dsps_fir_f32 dsps_fir_f32_aes3
+#else
+#define dsps_fir_f32 dsps_fir_f32_ansi
+#endif
+
+#if (dsps_fird_f32_aes3_enabled == 1)
+#define dsps_fird_f32 dsps_fird_f32_aes3
+#elif (dsps_fird_f32_ae32_enabled == 1)
+#define dsps_fird_f32 dsps_fird_f32_ae32
+#elif (dsps_fird_f32_arp4_enabled == 1)
+#define dsps_fird_f32 dsps_fird_f32_arp4
+#else
+#define dsps_fird_f32 dsps_fird_f32_ansi
+#endif
+
+#if (dsps_fird_s16_ae32_enabled == 1)
+#define dsps_fird_s16 dsps_fird_s16_ae32
+#elif (dsps_fird_s16_aes3_enabled == 1)
+#define dsps_fird_s16 dsps_fird_s16_aes3
+#elif (dsps_fird_s16_arp4_enabled == 1)
+#define dsps_fird_s16 dsps_fird_s16_arp4
+#else
+#define dsps_fird_s16 dsps_fird_s16_ansi
+#endif
+
+#else // CONFIG_DSP_OPTIMIZED
+
+#define dsps_fir_f32 dsps_fir_f32_ansi
+#define dsps_fird_f32 dsps_fird_f32_ansi
+#define dsps_fird_s16 dsps_fird_s16_ansi
+
+#endif // CONFIG_DSP_OPTIMIZED
+
+#endif // _dsps_fir_H_
--- a/managed_components/espressif__esp-dsp/modules/fir/include/dsps_fir_platform.h
+++ b/managed_components/espressif__esp-dsp/modules/fir/include/dsps_fir_platform.h
@@ -0,0 +1,40 @@
+#ifndef _dsps_fir_platform_H_
+#define _dsps_fir_platform_H_
+// Per-target switches (1 = enabled, 0 = disabled) for the optimized FIR implementations
+#include "sdkconfig.h"
+
+#ifdef __XTENSA__
+#include <xtensa/config/core-isa.h>
+#include <xtensa/config/core-matmap.h>
+
+// The Xtensa switches are only defined when the core configuration reports both XCHAL_HAVE_FP and XCHAL_HAVE_LOOPS
+#if ((XCHAL_HAVE_FP == 1) && (XCHAL_HAVE_LOOPS == 1))
+
+#if CONFIG_IDF_TARGET_ESP32S3
+#define dsps_fird_f32_aes3_enabled  1
+#define dsps_fird_f32_ae32_enabled  1
+#define dsps_fird_s16_aes3_enabled 1
+#define dsps_fird_s16_ae32_enabled 0
+#define dsps_fir_f32_aes3_enabled  1
+#define dsps_fir_f32_ae32_enabled  0
+#else // Xtensa target other than ESP32-S3
+#define dsps_fird_f32_ae32_enabled  1
+#define dsps_fird_s16_aes3_enabled 0
+#define dsps_fird_s16_ae32_enabled 1
+#define dsps_fir_f32_aes3_enabled  0
+#define dsps_fir_f32_ae32_enabled  1
+#endif // CONFIG_IDF_TARGET_ESP32S3
+
+#endif // XCHAL_HAVE_FP && XCHAL_HAVE_LOOPS
+#endif // __XTENSA__
+
+#ifdef CONFIG_IDF_TARGET_ESP32P4
+#ifdef CONFIG_DSP_OPTIMIZED
+#define dsps_fird_f32_arp4_enabled 1
+#define dsps_fird_s16_arp4_enabled 1
+#else
+#define dsps_fird_f32_arp4_enabled 0
+#define dsps_fird_s16_arp4_enabled 0
+#endif // CONFIG_DSP_OPTIMIZED
+#endif // CONFIG_IDF_TARGET_ESP32P4
+#endif // _dsps_fir_platform_H_
--- a/managed_components/espressif__esp-dsp/modules/fir/test/test_dsps_fir_f32_ae32.c
+++ b/managed_components/espressif__esp-dsp/modules/fir/test/test_dsps_fir_f32_ae32.c
@@ -0,0 +1,134 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "esp_dsp.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+
+#include "dsps_tone_gen.h"
+#include "dsps_d_gen.h"
+#include "dsps_fir.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dsps_fir_f32_aexx";
+
+__attribute__((aligned(16)))
+static float x[1024];
+__attribute__((aligned(16)))
+static float y[1024];
+__attribute__((aligned(16)))
+static float y_compare[1024];
+
+__attribute__((aligned(16)))
+static float coeffs[32];
+__attribute__((aligned(16)))
+static float delay[32 + 4];
+__attribute__((aligned(16)))
+static float delay_compare[32];
+
+TEST_CASE("dsps_fir_f32_aexx functionality", "[dsps]")
+{
+    // Stage 1: filter a unit impulse and expect y[i] == i for the first fir_len outputs.
+    // Stage 2: filter a ramp with the selected dsps_fir_f32 implementation and with
+    // dsps_fir_f32_ansi, and expect both output buffers to match.
+    int len = sizeof(x) / sizeof(float);
+    int fir_len = sizeof(coeffs) / sizeof(float);
+
+    fir_f32_t fir1;
+    fir_f32_t fir2;
+    for (int i = 0 ; i < fir_len ; i++) {
+        coeffs[i] = (fir_len - i - 1);
+    }
+
+    for (int i = 0 ; i < len ; i++) {
+        x[i] = 0;
+    }
+    x[0] = 1;
+
+    esp_err_t err = dsps_fir_init_f32(&fir1, coeffs, delay, fir_len);
+    TEST_ESP_OK(err);
+    err = dsps_fir_f32(&fir1, x, y, len);
+    TEST_ESP_OK(err);
+
+    for (int i = 0 ; i < fir_len * 3 ; i++) {
+        ESP_LOGD(TAG, "fir[%i] = %f", i, y[i]);
+    }
+
+    for (int i = 0 ; i < fir_len ; i++) {
+        if (y[i] != i) {
+            TEST_ASSERT_EQUAL(y[i], i);
+        }
+    }
+    // Stage 2 setup: ramp coefficients and (apart from x[0]) ramp input
+    for (int i = 0 ; i < fir_len ; i++) {
+        coeffs[i] = i;
+    }
+    for (int i = 0 ; i < len ; i++) {
+        x[i] = i;
+    }
+    x[0] = 1;
+    // A failed init must stop the test here instead of running the filters on unprepared structs
+    TEST_ESP_OK(dsps_fir_init_f32(&fir1, coeffs, delay, fir_len));
+    TEST_ESP_OK(dsps_fir_init_f32(&fir2, coeffs, delay_compare, fir_len));
+    // Each filter is called three times without re-initialisation; only the last outputs are compared
+    dsps_fir_f32(&fir1, x, y, len);
+    dsps_fir_f32_ansi(&fir2, x, y_compare, len);
+    dsps_fir_f32(&fir1, x, y, len);
+    dsps_fir_f32_ansi(&fir2, x, y_compare, len);
+    dsps_fir_f32(&fir1, x, y, len);
+    dsps_fir_f32_ansi(&fir2, x, y_compare, len);
+
+    for (int i = 0 ; i < len ; i++) {
+        if (y[i] != y_compare[i]) {
+            TEST_ASSERT_EQUAL(y[i], y_compare[i]);
+        }
+    }
+}
+
+TEST_CASE("dsps_fir_f32_aexx benchmark", "[dsps]")
+{
+    // Times dsps_fir_f32 on a unit impulse and checks the cycles spent per input sample
+    const int n_samples = sizeof(x) / sizeof(x[0]);
+    const int n_taps = sizeof(coeffs) / sizeof(coeffs[0]);
+    const int runs = 1;
+
+    for (int k = 0 ; k < n_taps ; k++) {
+        coeffs[k] = k;
+    }
+
+    x[0] = 1;
+    for (int k = 1 ; k < n_samples ; k++) {
+        x[k] = 0;
+    }
+
+    fir_f32_t fir1;
+    dsps_fir_init_f32(&fir1, coeffs, delay, n_taps);
+
+    const unsigned int t_begin = dsp_get_cpu_cycle_count();
+    for (int run = 0 ; run < runs ; run++) {
+        dsps_fir_f32(&fir1, x, y, n_samples);
+    }
+    const unsigned int t_end = dsp_get_cpu_cycle_count();
+
+    // The counter difference is taken in unsigned arithmetic before the conversion to float
+    const float elapsed = t_end - t_begin;
+    const float cycles = elapsed / (n_samples * runs);
+    ESP_LOGI(TAG, "dsps_fir_f32_aexx - %f per sample for for %i coefficients, %f per tap \n", cycles, n_taps, cycles / (float)n_taps);
+
+    const float lower_limit = 3;
+    const float upper_limit = 800;
+    TEST_ASSERT_EXEC_IN_RANGE(lower_limit, upper_limit, cycles);
+}
--- a/managed_components/espressif__esp-dsp/modules/fir/test/test_dsps_fir_f32_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/fir/test/test_dsps_fir_f32_ansi.c
@@ -0,0 +1,124 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "esp_dsp.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+
+#include "dsps_tone_gen.h"
+#include "dsps_d_gen.h"
+#include "dsps_fir.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dsps_fir_f32_ansi";
+
+static float x[1024];
+static float y[1024];
+
+static float coeffs[32];
+static float delay[32 + 4];
+
+TEST_CASE("dsps_fir_f32_ansi functionality", "[dsps]")
+{
+    // Impulse-response check of dsps_fir_f32_ansi: the coefficients count down from
+    // n_taps - 1 to 0, the input is a unit impulse, and output sample k is expected
+    // to equal k for every k below n_taps.
+    const int n_samples = sizeof(x) / sizeof(x[0]);
+    int n_taps = sizeof(coeffs) / sizeof(coeffs[0]);
+    fir_f32_t fir1;
+
+    for (int k = 0 ; k < n_taps ; k++) {
+        coeffs[k] = (n_taps - k - 1);
+    }
+
+    x[0] = 1;
+    for (int k = 1 ; k < n_samples ; k++) {
+        x[k] = 0;
+    }
+
+    dsps_fir_init_f32(&fir1, coeffs, delay, n_taps);
+    dsps_fir_f32_ansi(&fir1, x, y, n_samples);
+
+    for (int k = 0 ; k < n_taps * 3 ; k++) {
+        ESP_LOGD(TAG, "fir[%i] = %f", k, y[k]);
+    }
+    for (int k = 0 ; k < n_taps ; k++) {
+        if (y[k] != k) {
+            TEST_ASSERT_EQUAL(y[k], k);
+        }
+    }
+
+    // Repeat with one tap fewer, i.e. an odd length (ESP32-S3 builds keep the length unchanged)
+#ifndef CONFIG_IDF_TARGET_ESP32S3
+    n_taps--;
+#endif
+    for (int k = 0 ; k < n_taps ; k++) {
+        coeffs[k] = (n_taps - k - 1);
+    }
+
+    x[0] = 1;
+    for (int k = 1 ; k < n_samples ; k++) {
+        x[k] = 0;
+    }
+    dsps_fir_init_f32(&fir1, coeffs, delay, n_taps);
+    dsps_fir_f32_ansi(&fir1, x, y, n_samples);
+
+    for (int k = 0 ; k < n_taps ; k++) {
+        if (y[k] != k) {
+            TEST_ASSERT_EQUAL(y[k], k);
+        }
+    }
+    for (int k = n_taps ; k < n_samples ; k++) {
+        if (y[k] != 0) {
+            TEST_ASSERT_EQUAL(y[k], 0);
+        }
+    }
+}
+
+TEST_CASE("dsps_fir_f32_ansi benchmark", "[dsps]")
+{
+    // Times dsps_fir_f32_ansi on a unit impulse and checks the cycles spent per input sample
+    const int n_samples = sizeof(x) / sizeof(x[0]);
+    const int n_taps = sizeof(coeffs) / sizeof(coeffs[0]);
+    const int runs = 1;
+
+    for (int k = 0 ; k < n_taps ; k++) {
+        coeffs[k] = k;
+    }
+
+    x[0] = 1;
+    for (int k = 1 ; k < n_samples ; k++) {
+        x[k] = 0;
+    }
+
+    fir_f32_t fir1;
+    dsps_fir_init_f32(&fir1, coeffs, delay, n_taps);
+
+    const unsigned int t_begin = dsp_get_cpu_cycle_count();
+    for (int run = 0 ; run < runs ; run++) {
+        dsps_fir_f32_ansi(&fir1, x, y, n_samples);
+    }
+    const unsigned int t_end = dsp_get_cpu_cycle_count();
+
+    // The counter difference is taken in unsigned arithmetic before the conversion to float
+    const float elapsed = t_end - t_begin;
+    const float cycles = elapsed / (n_samples * runs);
+    ESP_LOGI(TAG, "dsps_fir_f32_ansi - %f per sample for for %i coefficients, %f per tap \n", cycles, n_taps, cycles / (float)n_taps);
+
+    const float lower_limit = 10;
+    const float upper_limit = 800;
+    TEST_ASSERT_EXEC_IN_RANGE(lower_limit, upper_limit, cycles);
+}
--- a/managed_components/espressif__esp-dsp/modules/fir/test/test_dsps_fird_f32_ae32.c
+++ b/managed_components/espressif__esp-dsp/modules/fir/test/test_dsps_fird_f32_ae32.c
@@ -0,0 +1,113 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "esp_dsp.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+
+#include "dsps_tone_gen.h"
+#include "dsps_d_gen.h"
+#include "dsps_fir.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dsps_fird_f32";
+
+static float x[1024];
+static float y[1024];
+static float y_compare[1024];
+
+static float coeffs[32];
+static float delay[32];
+static float delay_compare[32];
+
+TEST_CASE("dsps_fird_f32_aexx functionality", "[dsps]")
+{
+    // Compares the selected dsps_fird_f32 implementation with dsps_fird_f32_ansi:
+    // both filters get the same coefficients and the same ramp input and are run
+    // three times in a row, then the two output buffers are compared.
+    const int n_samples = sizeof(x) / sizeof(x[0]);
+    const int n_taps = sizeof(coeffs) / sizeof(coeffs[0]);
+    const int decim = 4;
+
+    // coeffs = {1, 1, 2, ..., n_taps - 1}
+    coeffs[0] = 1;
+    for (int k = 1 ; k < n_taps ; k++) {
+        coeffs[k] = k;
+    }
+    for (int k = 0 ; k < n_samples ; k++) {
+        x[k] = k;
+    }
+
+    fir_f32_t fir1;
+    fir_f32_t fir2;
+    dsps_fird_init_f32(&fir1, coeffs, delay, n_taps, decim);
+    dsps_fird_init_f32(&fir2, coeffs, delay_compare, n_taps, decim);
+
+    int total1 = 0;
+    int total2 = 0;
+    for (int pass = 0 ; pass < 3 ; pass++) {
+        total1 += dsps_fird_f32(&fir1, x, y, n_samples / decim);
+        total2 += dsps_fird_f32_ansi(&fir2, x, y_compare, n_samples / decim);
+    }
+    ESP_LOGI(TAG, "Total result = %i, expected %i from %i", total1, total2, n_samples);
+    TEST_ASSERT_EQUAL(total1, total2);
+    // NOTE(review): total1 sums three calls, yet every call is handed y from index 0 - confirm comparing total1 entries is intended
+    for (int k = 0 ; k < total1 ; k++) {
+        ESP_LOGD(TAG, "data[%i] = %f expected %f\n", k, y[k], y_compare[k]);
+    }
+    for (int k = 0 ; k < total1 ; k++) {
+        if (y[k] != y_compare[k]) {
+            TEST_ASSERT_EQUAL(y[k], y_compare[k]);
+        }
+    }
+}
+
+
+TEST_CASE("dsps_fird_f32_aexx benchmark", "[dsps]")
+{
+    // Times the selected dsps_fird_f32 implementation on a unit impulse with decimation 4
+    int len = sizeof(x) / sizeof(float);
+    int fir_len = sizeof(coeffs) / sizeof(float);
+    int repeat_count = 1;
+    int decim = 4;
+
+    fir_f32_t fir1;
+    for (int i = 0 ; i < fir_len ; i++) {
+        coeffs[i] = i;
+    }
+
+    for (int i = 0 ; i < len ; i++) {
+        x[i] = 0;
+    }
+    x[0] = 1;
+
+    dsps_fird_init_f32(&fir1, coeffs, delay, fir_len, decim);
+
+    unsigned int start_b = dsp_get_cpu_cycle_count();
+    for (int i = 0 ; i < repeat_count ; i++) {
+        dsps_fird_f32(&fir1, x, y, len / decim);
+    }
+    unsigned int end_b = dsp_get_cpu_cycle_count();
+
+    float total_b = end_b - start_b;
+    float cycles = total_b / (len * repeat_count);
+
+    ESP_LOGI(TAG, "dsps_fird_f32_aexx - %f per sample for for %i coefficients, %f per decim tap\n", cycles, fir_len, cycles / (float)fir_len * decim);
+    ESP_LOGI(TAG, "Total cycles = %u", end_b - start_b);    // the difference is an unsigned int
+    float min_exec = 3;
+    float max_exec = 300;
+    TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);
+}
--- a/managed_components/espressif__esp-dsp/modules/fir/test/test_dsps_fird_f32_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/fir/test/test_dsps_fird_f32_ansi.c
@@ -0,0 +1,101 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "esp_dsp.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+
+#include "dsps_tone_gen.h"
+#include "dsps_d_gen.h"
+#include "dsps_fir.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dsps_fird_f32_ansi";
+
+static float x[1024];
+static float y[1024];
+
+static float coeffs[32];
+static float delay[32];
+
+TEST_CASE("dsps_fird_f32_ansi functionality", "[dsps]")
+{
+    // Decimates by `decim` with a single non-zero tap (coeffs[0] = 1) over an input
+    // that repeats 0, 1, ..., decim - 1. The test expects n_samples / decim outputs,
+    // all of them equal to 0.
+    const int n_samples = sizeof(x) / sizeof(x[0]);
+    const int n_taps = sizeof(coeffs) / sizeof(coeffs[0]);
+    const int decim = 4;
+
+    coeffs[0] = 1;
+    for (int k = 1 ; k < n_taps ; k++) {
+        coeffs[k] = 0;
+    }
+    for (int k = 0 ; k < n_samples ; k++) {
+        x[k] = (k) % decim;
+    }
+
+    fir_f32_t fir1;
+    dsps_fird_init_f32(&fir1, coeffs, delay, n_taps, decim);
+    const int total = dsps_fird_f32_ansi(&fir1, x, y, n_samples / decim);
+    ESP_LOGI(TAG, "Total result = %i from %i", total, n_samples);
+    TEST_ASSERT_EQUAL(total, n_samples / decim);
+
+    // The whole output is logged before the first assertion on it runs
+    for (int k = 0 ; k < total ; k++) {
+        ESP_LOGD(TAG, "data[%i] = %f\n", k, y[k]);
+    }
+
+    for (int k = 0 ; k < total ; k++) {
+        TEST_ASSERT_EQUAL(y[k], 0);
+    }
+}
+
+
+TEST_CASE("dsps_fird_f32_ansi benchmark", "[dsps]")
+{
+    // Times dsps_fird_f32_ansi on a unit impulse with decimation 4
+    int len = sizeof(x) / sizeof(float);
+    int fir_len = sizeof(coeffs) / sizeof(float);
+    int repeat_count = 1;
+    int decim = 4;
+
+    fir_f32_t fir1;
+    for (int i = 0 ; i < fir_len ; i++) {
+        coeffs[i] = i;
+    }
+
+    for (int i = 0 ; i < len ; i++) {
+        x[i] = 0;
+    }
+    x[0] = 1;
+
+    dsps_fird_init_f32(&fir1, coeffs, delay, fir_len, decim);
+
+    unsigned int start_b = dsp_get_cpu_cycle_count();
+    for (int i = 0 ; i < repeat_count ; i++) {
+        dsps_fird_f32_ansi(&fir1, x, y, len / decim);
+    }
+    unsigned int end_b = dsp_get_cpu_cycle_count();
+
+    float total_b = end_b - start_b;
+    float cycles = total_b / (len * repeat_count);
+
+    ESP_LOGI(TAG, "dsps_fird_f32_ansi - %f per sample for for %i coefficients, %f per decim tap \n", cycles, fir_len, cycles / (float)fir_len * decim);
+    float min_exec = 10;
+    float max_exec = 300;
+    TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);
+}
--- a/managed_components/espressif__esp-dsp/modules/fir/test/test_dsps_fird_s16_ae32.c
+++ b/managed_components/espressif__esp-dsp/modules/fir/test/test_dsps_fird_s16_ae32.c
@@ -0,0 +1,371 @@
+/*
+ * SPDX-FileCopyrightText: 2022-2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+
+#include <string.h>
+#include <malloc.h>
+#include <stdbool.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+#include "esp_err.h"
+#include "esp_dsp.h"
+
+#include "dsps_tone_gen.h"
+#include "dsps_d_gen.h"
+#include "dsps_fir.h"
+#include "dsp_tests.h"
+#include "dsps_wind.h"
+#include "dsps_view.h"
+#include "dsps_fft2r.h"
+
+#define COEFFS 256
+#define N_IN_SAMPLES 4096
+#define DECIMATION 2
+#define Q15_MAX INT16_MAX
+#define LEAKAGE_BINS 10
+#define FIR_BUFF_LEN 16
+
+#define MAX_FIR_LEN 64
+
+static const char *TAG = "dsps_fird_s16_aexx";
+
+const static int32_t len = N_IN_SAMPLES;
+const static int32_t N_FFT = (N_IN_SAMPLES / DECIMATION);
+const static int16_t decim = DECIMATION;
+const static int16_t fir_len = COEFFS;
+const static int32_t fir_buffer = (N_IN_SAMPLES + FIR_BUFF_LEN);
+
+
+// error messages for the init functions
+static void error_msg_handler(fir_s16_t *fir, esp_err_t status)
+{
+    // Turns a failed init status into a failing assertion with a readable message.
+    // A successful status is ignored.
+    if (status == ESP_OK) {
+        return;
+    }
+
+    // Release the filter first, then fail the test with a message naming the init error
+    dsps_fird_s16_aexx_free(fir);
+
+    const char *reason = "Unspecified error";
+    if (status == ESP_ERR_DSP_INVALID_LENGTH) {
+        reason = "Number of the coefficients must be higher than 1";
+    } else if (status == ESP_ERR_DSP_ARRAY_NOT_ALIGNED) {
+        reason = "Delay line or (and) coefficients arrays not aligned";
+    } else if (status == ESP_ERR_DSP_PARAM_OUTOFRANGE) {
+        reason = "Start position or (and) Decimation ratio or (and) Shift out of range";
+    }
+
+    TEST_ASSERT_MESSAGE(false, reason);
+}
+
+
+
+TEST_CASE("dsps_fird_s16_aexx functionality", "[dsps]")
+{
+    // Cross-checks the selected dsps_fird_s16 implementation against dsps_fird_s16_ansi over decimations, lengths, shifts and start positions
+    const int32_t max_len[2] = {2048, 2520};                        // 2520 can be divided by 3, 6, 9, 12, 15, 18 and 21
+    const int16_t max_dec[2] = {32, 21};
+    const int16_t min_dec[2] = {2, 3};
+    const int16_t shift_vals[] = {-15, 0, 15};                      // kept in ascending order, the log below prints first and last
+    const int16_t n_shifts = sizeof(shift_vals) / sizeof(shift_vals[0]);
+    int16_t *x = (int16_t *)memalign(16, max_len[1] * sizeof(int16_t));
+    int16_t *y = (int16_t *)memalign(16, max_len[1] * sizeof(int16_t));
+    int16_t *y_compare = (int16_t *)memalign(16, max_len[1] * sizeof(int16_t));
+
+    int16_t *coeffs = (int16_t *)memalign(16, MAX_FIR_LEN * sizeof(int16_t));
+    int16_t *coeffs_aexx = (int16_t *)memalign(16, MAX_FIR_LEN * sizeof(int16_t));
+    int16_t *coeffs_ansi = (int16_t *)memalign(16, MAX_FIR_LEN * sizeof(int16_t));
+
+    int16_t *delay = (int16_t *)memalign(16, MAX_FIR_LEN * sizeof(int16_t));
+    int16_t *delay_compare = (int16_t *)memalign(16, MAX_FIR_LEN * sizeof(int16_t));
+
+    int32_t combinations = 0;
+    esp_err_t status1 = ESP_OK, status2 = ESP_OK;
+    fir_s16_t fir1, fir2;
+
+    for (int i = 0 ; i < MAX_FIR_LEN ; i++) {
+        coeffs[i] = i + 0x100;
+    }
+
+    for (int i = 0 ; i < max_len[1] ; i++) {
+        x[i] = 0x10;
+    }
+
+    for (int variations = 0; variations < 2; variations++) {
+
+        ESP_LOGI(TAG, ": %"PRId32" input samples, coefficients range from 2 to %"PRId16", decimation range from %"PRId16" to %"PRId16", shift in range from %"PRId16" to %"PRId16" and start positions within the coeffs range",
+                 max_len[variations], (int16_t)MAX_FIR_LEN, min_dec[variations], max_dec[variations], shift_vals[0], shift_vals[n_shifts - 1]);
+
+        // decimation increment is set as dec * 2 for input data length 2048 (2, 4, 8, 16, 32)
+        //                             as dec + 3 for input data length 2520 (3, 6, 9, 12, 15, 18, 21)
+        for (int16_t dec = min_dec[variations]; dec <= max_dec[variations]; ((variations) ? (dec += 3) : (dec *= 2)) ) {
+
+            const int32_t loop_len = max_len[variations] / dec;
+            const int16_t start_position = 0;
+
+            for (int16_t fir_length = 2; fir_length <= MAX_FIR_LEN; fir_length += 16) {
+
+                for (int16_t shift_idx = 0; shift_idx < n_shifts; shift_idx++) {
+
+                    for (int k = 0 ; k < fir_length; k++) {
+                        coeffs_ansi[k] = coeffs[k];
+                        coeffs_aexx[k] = coeffs[k];
+                    }
+
+                    status1 = dsps_fird_init_s16(&fir1, coeffs_aexx, delay, fir_length, dec, start_position, shift_vals[shift_idx]);
+                    error_msg_handler(&fir1, status1);
+                    status2 = dsps_fird_init_s16(&fir2, coeffs_ansi, delay_compare, fir_length, dec, start_position, shift_vals[shift_idx]);
+                    error_msg_handler(&fir2, status2);
+
+#if(dsps_fird_s16_aes3_enabled)
+                    dsps_16_array_rev(fir1.coeffs, fir1.coeffs_len);        // coefficients are being reverted for the purposes of the aes3 TIE implementation
+#endif
+
+                    for (int16_t start_pos = 0; start_pos < dec; start_pos++) {
+
+                        fir1.d_pos = start_pos;
+                        fir2.d_pos = start_pos;
+
+                        for (int j = 0; j < fir1.coeffs_len; j++) {
+                            fir1.delay[j] = 0;
+                            fir2.delay[j] = 0;
+                        }
+
+                        fir1.pos = 0;
+                        fir2.pos = 0;
+
+                        const int32_t total1 = dsps_fird_s16(&fir1, x, y, loop_len);
+                        const int32_t total2 = dsps_fird_s16_ansi(&fir2, x, y_compare, loop_len);
+
+                        TEST_ASSERT_EQUAL(total1, total2);
+                        for (int i = 0 ; i < total1 ; i++) {
+                            TEST_ASSERT_EQUAL(y[i], y_compare[i]);
+                        }
+
+                        combinations++;
+                    }
+
+                    dsps_fird_s16_aexx_free(&fir1);
+                    dsps_fird_s16_aexx_free(&fir2);
+                }
+            }
+        }
+    }
+
+    ESP_LOGI(TAG, ": %"PRId32" total filter combinations\n", combinations);
+
+    free(x);
+    free(y);
+    free(coeffs);
+    free(delay);
+    free(y_compare);
+    free(coeffs_ansi);
+    free(coeffs_aexx);
+    free(delay_compare);
+
+}
+
+TEST_CASE("dsps_fird_s16_aexx benchmark", "[dsps]")
+{
+    // Times dsps_fird_s16 for decimation factors 2, 4, 8 and 16 with MAX_FIR_LEN coefficients
+    const int16_t local_dec = 2;
+    const int32_t local_len = (len % 16) ? (4096) : (len);                          // length must be devisible by 16
+
+    int16_t *x = (int16_t *)memalign(16, local_len * sizeof(int16_t));
+    int16_t *y = (int16_t *)memalign(16, local_len * sizeof(int16_t));
+    int16_t *coeffs = (int16_t *)memalign(16, MAX_FIR_LEN * sizeof(int16_t));
+    int16_t *delay = (int16_t *)memalign(16, MAX_FIR_LEN * sizeof(int16_t));
+
+    // memalign does not initialise the memory it returns: give the filter defined data to read
+    memset(x, 0, local_len * sizeof(int16_t));
+    memset(coeffs, 0, MAX_FIR_LEN * sizeof(int16_t));
+    memset(delay, 0, MAX_FIR_LEN * sizeof(int16_t));
+
+    const int repeat_count = 100;
+    const int16_t start_pos = 0;
+    const int16_t shift = 0;
+    fir_s16_t fir;
+    esp_err_t status = dsps_fird_init_s16(&fir, coeffs, delay, MAX_FIR_LEN, local_dec, start_pos, shift);
+    error_msg_handler(&fir, status);
+
+#if(dsps_fird_s16_aes3_enabled)
+    dsps_16_array_rev(fir.coeffs, fir.coeffs_len);
+#endif
+
+    // Test for decimations 2, 4, 8, 16
+    for (int dec = local_dec; dec <= 16 ; dec *= 2) {
+
+        const int32_t loop_len = (local_len / dec);
+        fir.decim = dec;
+
+        const unsigned int start_b = dsp_get_cpu_cycle_count();
+        for (int j = 0 ; j < repeat_count ; j++) {
+            dsps_fird_s16(&fir, x, y, loop_len);
+        }
+        const unsigned int end_b = dsp_get_cpu_cycle_count();
+
+        const float total_b = end_b - start_b;
+        const float cycles = total_b / (float)(repeat_count);
+        const float cycles_per_sample = total_b / (float)(local_len * repeat_count);
+        const float cycles_per_decim_tap = cycles_per_sample / (float)(fir.coeffs_len * fir.decim);
+
+        ESP_LOGI(TAG, ": %.2f total cycles, %.2f cycles per sample, %.2f per decim tap, for %"PRId32" input samples, %"PRId16" coefficients and decimation %"PRId16"\n",
+                 cycles, cycles_per_sample, cycles_per_decim_tap, local_len, (int16_t)MAX_FIR_LEN, fir.decim);
+
+        const float min_exec = (((local_len / fir.decim) * fir.coeffs_len) / 2);
+        const float max_exec = min_exec * 20;
+        TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);
+
+    }
+
+    dsps_fird_s16_aexx_free(&fir);
+    free(x);
+    free(y);
+    free(coeffs);
+    free(delay);
+}
+
+TEST_CASE("dsps_fird_s16_aexx noise_snr", "[dsps]")
+{
+    // In the SNR-noise test we generate a sine wave signal, filter it with a fixed point FIRD filter
+    // and take the FFT of the filtered signal. Afterwards, the noise and the SNR are calculated from the FFT spectrum
+
+    // FIR Coeffs
+    int16_t *s_coeffs = (int16_t *)memalign(16, fir_len * sizeof(int16_t));         // fixed point coefficients
+    int16_t *delay_line = (int16_t *)memalign(16, fir_len * sizeof(int16_t));       // fixed point delay line
+    float *f_coeffs = (float *)memalign(16, fir_len * sizeof(float));               // floating point coefficients
+
+    // Coefficients windowing
+    dsps_wind_hann_f32(f_coeffs, fir_len);
+    const float fir_order = (float)fir_len - 1;
+    const float ft = 0.25;                                                          // Transition frequency
+    for (int i = 0; i < fir_len; i++) {
+        f_coeffs[i] *= sinf((2 * M_PI * ft * (i - fir_order / 2))) / (M_PI * (i - fir_order / 2));
+    }
+
+    // FIR coefficients conversion to q15
+    for (int i = 0; i < fir_len; i++) {
+        s_coeffs[i] = f_coeffs[i] * Q15_MAX;
+    }
+
+    free(f_coeffs);
+
+    // Signal generation
+    const float amplitude = 0.9;
+    const float frequency = 0.05;
+    const float phase = 0;
+    float *f_in_signal = (float *)memalign(16, fir_buffer * sizeof(float));
+    dsps_tone_gen_f32(f_in_signal, fir_buffer, amplitude, frequency, phase);
+
+    // Input signal conversion to q15
+    int16_t *fir_x = (int16_t *)memalign(16, fir_buffer * sizeof(int16_t));
+    int16_t *fir_y = (int16_t *)memalign(16, fir_buffer * sizeof(int16_t));
+    int16_t *fir_y2 = (int16_t *)memalign(16, fir_buffer * sizeof(int16_t));
+    for (int i = 0; i < fir_buffer; i++) {
+        fir_x[i] = f_in_signal[i] * (int16_t)Q15_MAX;
+    }
+
+    free(f_in_signal);
+
+    // FIR - NOTE(review): fir and fir2 are given the same s_coeffs and delay_line buffers; confirm the ANSI run may start from the state left by the first run
+    const int16_t start_pos = 0;
+    const int16_t shift = 0;
+    const int32_t loop_len = (int32_t)(fir_buffer / decim);                         // loop_len result must be without remainder
+    fir_s16_t fir;
+    esp_err_t status = dsps_fird_init_s16(&fir, s_coeffs, delay_line, fir_len, decim, start_pos, shift);
+    fir_s16_t fir2;
+    esp_err_t status2 = dsps_fird_init_s16(&fir2, s_coeffs, delay_line, fir_len, decim, start_pos, shift);
+    error_msg_handler(&fir, status);
+    error_msg_handler(&fir2, status2);
+
+#if(dsps_fird_s16_aes3_enabled || dsps_fird_s16_arv4_enabled) // NOTE(review): dsps_fir_platform.h defines ..._arp4_enabled, not ..._arv4_enabled - confirm the intended macro
+    dsps_16_array_rev(fir.coeffs, fir.coeffs_len);
+#endif
+
+    dsps_fird_s16(&fir, fir_x, fir_y, loop_len);
+    dsps_fird_s16_ansi(&fir2, fir_x, fir_y2, loop_len);
+    for (int i = 0 ; i < loop_len ; i++) {
+        ESP_LOGD(TAG, "Data[%i] = %i vs %i, diff = %i", i, fir_y[i], fir_y2[i], fir_y[i] - fir_y2[i]);
+    }
+
+    dsps_fird_s16_aexx_free(&fir);      // both filters are done: release them before the buffers they were given
+    dsps_fird_s16_aexx_free(&fir2);
+    free(delay_line);
+    free(s_coeffs);
+    free(fir_x);
+    free(fir_y2);                       // only needed for the debug comparison above
+
+    // FIR Output conversion to float
+    const unsigned int ignored_fir_samples = (FIR_BUFF_LEN / 2) - 1;
+    float *fir_output = (float *)memalign(16, len * sizeof(float));
+    for (int i = 0; i < N_FFT; i++) {
+        fir_output[i] = (float)(fir_y[ignored_fir_samples + i] / (float)Q15_MAX);
+    }
+
+    free(fir_y);
+
+    // Signal windowing
+    float *window = (float *)memalign(16, N_FFT * sizeof(float));
+    dsps_wind_blackman_f32(window, N_FFT);
+
+    // Prepare FFT input, real and imaginary part
+    const int32_t fft_data_len = (N_IN_SAMPLES / DECIMATION) * 2;
+    float *fft_data = (float *)memalign(16, fft_data_len * sizeof(float));
+    for (int i = 0 ; i < N_FFT ; i++) {
+        fft_data[i * 2 + 0] = fir_output[i] * window[i];
+        fft_data[i * 2 + 1] = 0;
+    }
+    free(fir_output);
+    free(window);
+
+    // Initialize FFT
+    esp_err_t ret = dsps_fft2r_init_fc32(NULL, N_FFT * 2);
+    TEST_ESP_OK(ret);
+
+    // Do the FFT
+    dsps_fft2r_fc32(fft_data, N_FFT);
+    dsps_bit_rev_fc32(fft_data, N_FFT);
+    dsps_cplx2reC_fc32(fft_data, N_FFT);
+
+    // Convert the FFT spectrum from amplitude to watts, find the max value and its position
+    float max_val = -1000000;
+    int max_pos = 0;
+    for (int i = 0 ; i < N_FFT / 2 ; i++) {
+        fft_data[i] = (fft_data[i * 2 + 0] * fft_data[i * 2 + 0] + fft_data[i * 2 + 1] * fft_data[i * 2 + 1]) / (N_FFT * 3);
+        if (fft_data[i] > max_val) {
+            max_val = fft_data[i];
+            max_pos = i;
+        }
+    }
+
+    // Calculate the power of the signal and noise of the spectrum and convert the spectrum to dB
+    float signal_pow = 0, noise_pow = 0;
+    for (int i = 0 ; i < N_FFT / 2 ; i++) {
+        if ((i >= max_pos - LEAKAGE_BINS) && (i <= max_pos + LEAKAGE_BINS)) {
+            signal_pow += fft_data[i];
+        } else {
+            noise_pow += fft_data[i];
+        }
+
+        fft_data[i] = 10 * log10f(0.0000000000001 + fft_data[i]);
+    }
+
+    // Convert the signal power and noise power from watts to dB and calculate SNR
+    const float snr = 10 * log10f(signal_pow / noise_pow);
+    noise_pow = 10 * log10f(noise_pow);
+    signal_pow = 10 * log10f(signal_pow);
+
+    ESP_LOGI(TAG, "\nSignal Power: \t%f\nNoise Power: \t%f\nSNR: \t\t%f", signal_pow, noise_pow, snr);
+    dsps_view(fft_data, N_FFT / 2, 64, 16,  -140, 40, '|');
+    free(fft_data);
+
+    const float min_exec_snr = 50;
+    const float max_exec_snr = 120;
+    TEST_ASSERT_EXEC_IN_RANGE(min_exec_snr, max_exec_snr, snr);
+}
--- a/managed_components/espressif__esp-dsp/modules/fir/test/test_dsps_fird_s16_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/fir/test/test_dsps_fird_s16_ansi.c
@@ -0,0 +1,292 @@
+/*
+ * SPDX-FileCopyrightText: 2022 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <malloc.h>
+#include <stdint.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+#include "esp_err.h"
+#include "esp_dsp.h"
+
+#include "dsps_tone_gen.h"
+#include "dsps_d_gen.h"
+#include "dsps_fir.h"
+#include "dsp_tests.h"
+#include "dsps_wind.h"
+#include "dsps_view.h"
+#include "dsps_fft2r.h"
+
+// Test configuration
+#define COEFFS 64               // number of FIR coefficients
+#define N_IN_SAMPLES 1024       // number of input samples
+#define DECIMATION 2            // decimation ratio
+#define Q15_MAX INT16_MAX       // scale factor used for the float <-> q15 conversions
+#define LEAKAGE_BINS 10         // FFT bins on each side of the spectral peak counted as signal
+#define FIR_BUFF_LEN 16         // extra samples appended to the input buffer of the SNR test
+
+static const char *TAG = "dsps_fird_s16_ansi";
+
+// The storage-class specifier comes first: writing `static` after `const` is an obsolescent form in C
+static const int32_t len = N_IN_SAMPLES;
+static const int32_t N_FFT = (N_IN_SAMPLES / DECIMATION);          // samples left after decimation
+static const int16_t decim = DECIMATION;
+static const int16_t fir_len = COEFFS;
+static const int32_t fir_buffer = (N_IN_SAMPLES + FIR_BUFF_LEN);   // input length of the SNR test
+
+
+// Handles the status returned by the FIR init functions.
+// For ESP_OK nothing happens. For any other status the FIR free routine is called on `fir`
+// and the running test is failed with a message describing the error.
+static void error_msg_handler(fir_s16_t *fir, esp_err_t status)
+{
+    // Nothing to report for a successful initialization
+    if (status == ESP_OK) {
+        return;
+    }
+
+    // Call the free routine first, the assertions below fail the test
+    dsps_fird_s16_aexx_free(fir);
+
+    if (status == ESP_ERR_DSP_INVALID_LENGTH) {
+        TEST_ASSERT_MESSAGE(false, "Number of the coefficients must be higher than 1");
+    } else if (status == ESP_ERR_DSP_ARRAY_NOT_ALIGNED) {
+        TEST_ASSERT_MESSAGE(false, "Delay line or (and) coefficients arrays not aligned");
+    } else if (status == ESP_ERR_DSP_PARAM_OUTOFRANGE) {
+        TEST_ASSERT_MESSAGE(false, "Start position or (and) Decimation ratio or (and) Shift out of range");
+    } else {
+        TEST_ASSERT_MESSAGE(false, "Unspecified error");
+    }
+}
+
+
+// Functional test: a filter whose only non-zero coefficient is coeffs[0] = 0x4000 is fed
+// a constant input of 0x4000. The test expects len / decim output samples, each equal to 0x2000.
+TEST_CASE("dsps_fird_s16_ansi functionality", "[dsps]")
+{
+
+    int16_t *x = (int16_t *)memalign(16, len * sizeof(int16_t));
+    int16_t *y = (int16_t *)memalign(16, len * sizeof(int16_t));
+
+    int16_t *coeffs = (int16_t *)memalign(16, fir_len * sizeof(int16_t));
+    int16_t *delay = (int16_t *)memalign(16, fir_len * sizeof(int16_t));
+
+    // Fail the test instead of dereferencing NULL when an allocation did not succeed
+    if ((x == NULL) || (y == NULL) || (coeffs == NULL) || (delay == NULL)) {
+        free(x);
+        free(y);
+        free(coeffs);
+        free(delay);
+        TEST_FAIL_MESSAGE("Memory allocation failed");
+    }
+
+    const int16_t start_pos = 0;
+    const int16_t shift = 0;
+    const int16_t dec = decim;
+    const int32_t output_len = (int32_t)(len / dec);
+
+    fir_s16_t fir1;
+    for (int i = 0 ; i < fir_len ; i++) {
+        coeffs[i] = 0;
+    }
+    coeffs[0] = 0x4000;
+
+    for (int i = 0 ; i < len ; i++) {
+        x[i] = 0x4000;
+    }
+
+    esp_err_t status = dsps_fird_init_s16(&fir1, coeffs, delay, fir_len, dec, start_pos, shift);
+    error_msg_handler(&fir1, status);
+
+    const int32_t total = dsps_fird_s16_ansi(&fir1, x, y, output_len);
+
+    ESP_LOGI(TAG, "%"PRId32" input samples, decimation %"PRId16",total result = %"PRId32"\n", len, dec, total);
+    // Unity takes (expected, actual): keep this order so failure messages read correctly
+    TEST_ASSERT_EQUAL(len / decim, total);
+    for (int i = 0 ; i < total ; i++) {
+        ESP_LOGD(TAG, "data[%i] = %d\n", i, y[i]);
+    }
+
+    for (int i = 0 ; i < total ; i++) {
+        TEST_ASSERT_EQUAL((0x2000), y[i]);
+    }
+
+    dsps_fird_s16_aexx_free(&fir1);
+    free(x);
+    free(y);
+    free(coeffs);
+    free(delay);
+}
+
+
+// Benchmark: measures the CPU cycles per input sample spent in dsps_fird_s16_ansi for four
+// decimation ratios (the ratio starts at 1 and is doubled after every measurement) and checks
+// that each measurement lies within the expected range.
+TEST_CASE("dsps_fird_s16_ansi benchmark", "[dsps]")
+{
+
+    int16_t *x = (int16_t *)memalign(16, len * sizeof(int16_t));
+    int16_t *y = (int16_t *)memalign(16, len * sizeof(int16_t));
+
+    int16_t *coeffs = (int16_t *)memalign(16, fir_len * sizeof(int16_t));
+    int16_t *delay = (int16_t *)memalign(16, fir_len * sizeof(int16_t));
+
+    // Fail the test instead of dereferencing NULL when an allocation did not succeed
+    if ((x == NULL) || (y == NULL) || (coeffs == NULL) || (delay == NULL)) {
+        free(x);
+        free(y);
+        free(coeffs);
+        free(delay);
+        TEST_FAIL_MESSAGE("Memory allocation failed");
+    }
+
+    const int repeat_count = 4;
+    const int16_t dec = 1;
+    const int16_t start_pos = 0;
+    const int16_t shift = 0;
+    int32_t output_len = 0;
+
+    fir_s16_t fir1;
+    for (int i = 0 ; i < fir_len ; i++) {
+        coeffs[i] = i;
+    }
+
+    // Unit impulse as the input signal
+    for (int i = 0 ; i < len ; i++) {
+        x[i] = 0;
+    }
+    x[0] = 1;
+
+    esp_err_t status = dsps_fird_init_s16(&fir1, coeffs, delay, fir_len, dec, start_pos, shift);
+    error_msg_handler(&fir1, status);
+
+    // Decimations 1, 2, 4, 8
+    for (int step = 0 ; step < 4 ; step++) {
+
+        output_len = (int32_t)(len / fir1.decim);
+        const unsigned int start_b = dsp_get_cpu_cycle_count();
+        // Separate index name: the repeat loop must not shadow the decimation loop index
+        for (int rep = 0 ; rep < repeat_count ; rep++) {
+            dsps_fird_s16_ansi(&fir1, x, y, output_len);
+        }
+        const unsigned int end_b = dsp_get_cpu_cycle_count();
+
+        const float total_b = end_b - start_b;
+        const float cycles = total_b / (len * repeat_count);
+
+        ESP_LOGI(TAG, "total cycles %f per sample for %"PRId16" coefficients, decimation %"PRId16", %f per decim tap \n",
+                 cycles, fir_len, fir1.decim, cycles / (float)fir_len * fir1.decim);
+        float min_exec = 10;
+        float max_exec = 1500;
+        TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);
+        // The decimation ratio is changed directly in the filter structure for the next pass
+        fir1.decim *= 2;
+    }
+
+    dsps_fird_s16_aexx_free(&fir1);
+    free(x);
+    free(y);
+    free(coeffs);
+    free(delay);
+}
+
+
+// SNR test: a sine wave is filtered with the fixed point FIRD filter, the filtered signal is
+// windowed and transformed with the FFT, and the signal-to-noise ratio calculated from the
+// spectrum has to lie within the expected range.
+TEST_CASE("dsps_fird_s16_ansi noise_snr", "[dsps]")
+{
+
+    // In the SNR-noise test we generate a sine wave signal, filter the signal using a fixed point FIRD filter
+    // and do the FFT of the filtered signal. Afterwards, the noise and the SNR are calculated from the FFT spectrum
+
+    // FIR Coeffs
+    int16_t *s_coeffs = (int16_t *)memalign(16, fir_len * sizeof(int16_t));         // fixed point coefficients
+    int16_t *delay_line = (int16_t *)memalign(16, fir_len * sizeof(int16_t));       // fixed point delay line
+    float *f_coeffs = (float *)memalign(16, fir_len * sizeof(float));               // floating point coefficients
+    TEST_ASSERT_NOT_NULL(s_coeffs);
+    TEST_ASSERT_NOT_NULL(delay_line);
+    TEST_ASSERT_NOT_NULL(f_coeffs);
+
+    // Coefficients windowing
+    dsps_wind_hann_f32(f_coeffs, fir_len);
+    const float fir_order = (float)fir_len - 1;
+    const float ft = 0.25;                                                          // frequency used in the sin(x)/x kernel
+    for (int i = 0; i < fir_len; i++) {
+        f_coeffs[i] *= sinf((2 * M_PI * ft * (i - fir_order / 2))) / (M_PI * (i - fir_order / 2));
+    }
+
+    // FIR coefficients conversion to q15
+    for (int i = 0; i < fir_len; i++) {
+        s_coeffs[i] = f_coeffs[i] * Q15_MAX;
+    }
+
+    free(f_coeffs);
+
+    // Signal generation
+    const float amplitude = 0.9;
+    const float frequency = 0.05;
+    const float phase = 0;
+    float *f_in_signal = (float *)memalign(16, fir_buffer * sizeof(float));
+    TEST_ASSERT_NOT_NULL(f_in_signal);
+    dsps_tone_gen_f32(f_in_signal, fir_buffer, amplitude, frequency, phase);
+
+    // Input signal conversion to q15
+    int16_t *fir_x = (int16_t *)memalign(16, fir_buffer * sizeof(int16_t));
+    int16_t *fir_y = (int16_t *)memalign(16, fir_buffer * sizeof(int16_t));
+    TEST_ASSERT_NOT_NULL(fir_x);
+    TEST_ASSERT_NOT_NULL(fir_y);
+    for (int i = 0; i < fir_buffer; i++) {
+        fir_x[i] = f_in_signal[i] * (int16_t)Q15_MAX;
+    }
+
+    free(f_in_signal);
+
+    // FIR
+    const int16_t start_pos = 0;
+    const int16_t shift = 0;
+    const int32_t output_len = (int32_t)(fir_buffer / decim);
+    fir_s16_t fir1;
+    esp_err_t status = dsps_fird_init_s16(&fir1, s_coeffs, delay_line, fir_len, decim, start_pos, shift);
+    error_msg_handler(&fir1, status);
+    dsps_fird_s16_ansi(&fir1, fir_x, fir_y, output_len);
+
+    // The filter is not used past this point: release it here, before its buffers are freed,
+    // so the cleanup is not skipped when a later assertion fails the test
+    dsps_fird_s16_aexx_free(&fir1);
+
+    free(delay_line);
+    free(s_coeffs);
+    free(fir_x);
+
+    // FIR Output conversion to float
+    const unsigned int ignored_fir_samples = (FIR_BUFF_LEN / 2) - 1;
+    // Only N_FFT samples are converted, so N_FFT floats are enough
+    float *fir_output = (float *)memalign(16, N_FFT * sizeof(float));
+    TEST_ASSERT_NOT_NULL(fir_output);
+    for (int i = 0; i < N_FFT; i++) {
+        fir_output[i] = (float)(fir_y[ignored_fir_samples + i] / (float)Q15_MAX);
+    }
+
+    free(fir_y);
+
+    // Signal windowing
+    float *window = (float *)memalign(16, N_FFT * sizeof(float));
+    TEST_ASSERT_NOT_NULL(window);
+    dsps_wind_blackman_f32(window, N_FFT);
+
+    // Prepare FFT input, real and imaginary part
+    const int32_t fft_data_len = (N_IN_SAMPLES / DECIMATION) * 2;
+    float *fft_data = (float *)memalign(16, fft_data_len * sizeof(float));
+    TEST_ASSERT_NOT_NULL(fft_data);
+    for (int i = 0 ; i < N_FFT ; i++) {
+        fft_data[i * 2 + 0] = fir_output[i] * window[i];
+        fft_data[i * 2 + 1] = 0;
+    }
+    free(fir_output);
+    free(window);
+
+    // Initialize FFT
+    esp_err_t ret = dsps_fft2r_init_fc32(NULL, N_FFT * 2);
+    TEST_ESP_OK(ret);
+
+    // Do the FFT
+    dsps_fft2r_fc32(fft_data, N_FFT);
+    dsps_bit_rev_fc32(fft_data, N_FFT);
+    dsps_cplx2reC_fc32(fft_data, N_FFT);
+
+    // No FFT call follows: release what dsps_fft2r_init_fc32() set up
+    dsps_fft2r_deinit_fc32();
+
+    // Convert the FFT spectrum from amplitude to watts, find the max value and its position
+    float max_val = -1000000;
+    int max_pos = 0;
+    for (int i = 0 ; i < N_FFT / 2 ; i++) {
+        fft_data[i] = (fft_data[i * 2 + 0] * fft_data[i * 2 + 0] + fft_data[i * 2 + 1] * fft_data[i * 2 + 1]) / (N_FFT * 3);
+        if (fft_data[i] > max_val) {
+            max_val = fft_data[i];
+            max_pos = i;
+        }
+    }
+
+    // Calculate the power of the signal and noise of the spectrum and convert the spectrum to dB
+    // Bins within LEAKAGE_BINS of the peak count as signal, all other bins count as noise
+    float signal_pow = 0, noise_pow = 0;
+    for (int i = 0 ; i < N_FFT / 2 ; i++) {
+        if ((i >= max_pos - LEAKAGE_BINS) && (i <= max_pos + LEAKAGE_BINS)) {
+            signal_pow += fft_data[i];
+        } else {
+            noise_pow += fft_data[i];
+        }
+
+        // The small offset keeps the argument of log10f() above zero
+        fft_data[i] = 10 * log10f(0.0000000000001 + fft_data[i]);
+    }
+
+    // Convert the signal power and noise power to dB, calculate SNR
+    const float snr = 10 * log10f(signal_pow / noise_pow);
+    noise_pow = 10 * log10f(noise_pow);
+    signal_pow = 10 * log10f(signal_pow);
+
+    ESP_LOGI(TAG, "\nSignal Power: \t%f\nNoise Power: \t%f\nSNR: \t\t%f", signal_pow, noise_pow, snr);
+    dsps_view(fft_data, N_FFT / 2, 64, 16,  -140, 40, '|');
+    free(fft_data);
+
+    const float min_exec_snr = 50;
+    const float max_exec_snr = 120;
+    TEST_ASSERT_EXEC_IN_RANGE(min_exec_snr, max_exec_snr, snr);
+
+}
--- a/managed_components/espressif__esp-dsp/modules/fir/test_sim/main.c
+++ b/managed_components/espressif__esp-dsp/modules/fir/test_sim/main.c
@@ -0,0 +1,12 @@
+
+void test_fir();
+
+int main(void)
+{
+    printf("main starts!\n");
+//    xt_iss_profile_enable();
+    test_fir();
+//    xt_iss_profile_disable();
+
+    printf("Test done\n");
+}
--- a/managed_components/espressif__esp-dsp/modules/fir/test_sim/test_fir.c
+++ b/managed_components/espressif__esp-dsp/modules/fir/test_sim/test_fir.c
@@ -0,0 +1,52 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "dsp_common.h"
+
+#include "dsps_fir.h"
+
+// Input signal; y receives the dsps_fir_f32_aes3 output, y_compare the dsps_fir_f32_ansi output
+float x[1024];
+float y[1024];
+float y_compare[1024];
+
+// Coefficients shared by both filters; delay is passed to the init of fir1, delay_compare to fir2
+float coeffs[256];
+float delay[256];
+float delay_compare[256];
+
+
+// Runs the same input through dsps_fir_f32_aes3 and dsps_fir_f32_ansi between the
+// xt_iss_profile_enable() / xt_iss_profile_disable() calls and prints a message when done.
+// NOTE(review): y and y_compare are never compared, and the two filters are initialized with
+// different lengths (fir_len / 4 and fir_len), so "Test Pass!" only shows that both calls returned.
+// NOTE(review): no declaration of xt_iss_profile_enable() / xt_iss_profile_disable() is visible
+// in this file - confirm which header provides them.
+void test_fir(void)
+{
+    const int len = sizeof(x) / sizeof(x[0]);
+    const int fir_len = sizeof(coeffs) / sizeof(coeffs[0]);
+
+    fir_f32_t fir1;
+    fir_f32_t fir2;
+
+    // Ramp coefficients: 0, 1, 2, ...
+    for (int i = 0 ; i < fir_len ; i++) {
+        coeffs[i] = i;
+    }
+
+    // Ramp input with the first sample set to 1
+    for (int i = 0 ; i < len ; i++) {
+        x[i] = i;
+    }
+    x[0] = 1;
+
+    dsps_fir_init_f32(&fir1, coeffs, delay, fir_len / 4);
+    dsps_fir_init_f32(&fir2, coeffs, delay_compare, fir_len);
+
+    xt_iss_profile_enable();
+    dsps_fir_f32_aes3(&fir1, x, y, len);
+    dsps_fir_f32_ansi(&fir2, x, y_compare, len);
+    xt_iss_profile_disable();
+
+    printf("Test Pass!\n");
+}