add some code

2025-09-05 13:25:11 +08:00
parent 9ff0a99e7a
commit 3cf1229a85
8911 changed files with 2535396 additions and 0 deletions
@@ -0,0 +1,94 @@
+/*
+ * SPDX-FileCopyrightText: 2022 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+.macro fir_s16_ae32_mul x1, x2, count, ID
+// This macro calculates fixed point dot product for ((count + 1)*4) int16 samples
+// using the MAC16 registers m0..m3 and the acchi || acclo accumulator.
+// x1 - input array1 register (samples), the register is advanced by the ldinc loads
+// x2 - input array2 register (coefficients) the array is inverted and is being decremented
+// count - counter register (for example a7)
+// count - (samples_count / 4) - 1
+//         with count == 0 the loop body is skipped and only the 4 products of the
+//         pre-load / finalize instructions are accumulated
+// ID - suffix appended to the labels, must be unique for every expansion of the macro
+// acc += x1[i + 0]*x2[N - i - 1] + x1[i + 1]*x2[N - i - 2] + x1[i + 2]*x2[N - i - 3] + x1[i + 3]*x2[N - i - 4]; i: 0..count
+// acchi, and acclo have to be initialized before
+// Result - acchi || acclo
+// Modifies: 
+// m0, m1, m2, m3
+// acchi || acclo - must be loaded before (for example 0x3fff to acclo). 
+// NOTE(review): x1 and x2 are left pointing at the last loaded words, and loopnez presumably
+// updates the zero-overhead loop registers - confirm against the Xtensa ISA reference.
+
+		/*
+		 * Data schedule. Each line represents instruction and columns represent
+		 * register contents. Last column (MUL) shows the multiplication which
+		 * takes place. Values loaded in the given cycle are shown in square brackets.
+		 *
+		 *  m0     m1         m2              m3          MUL
+		 * -----------------  pre-load  --------------------------
+		 *[x0 x1]								                  (no MULs in the first 3 instructions)
+		 * x0 x1        [y(N-1) y(N-2)]
+		 * x0 x1 [x2 x3] y(N-1) y(N-2)
+		 * x0 x1  x2 x3  y(N-1) y(N-2) [y(N-3) y(N-4)] x0*y(N-1)
+		 * --------------------   loop  ------------------------	 (the following 4 instructions are
+		 *[x4 x5] x2 x3  y(N-1) y(N-2)  y(N-3) y(N-4)  x1*y(N-2)     repeated as much as needed)
+		 * x4 x5  x2 x3 [y(N-5) y(N-6)] y(N-3) y(N-4)  x2*y(N-3)
+		 * x4 x5 [x6 x7] y(N-5) y(N-6)  y(N-3) y(N-4)  x3*y(N-4)
+		 * x4 x5  x6 x7  y(N-5) y(N-6) [y(N-7) y(N-8)] x4*y(N-5)
+		 * -------------------  finalize  ----------------------
+		 * x4 x5  x6 x7  y(N-5) y(N-6)  y(N-7) y(N-8)  x5*y(N-6)	(nothing is loaded)
+		 * x4 x5  x6 x7  y(N-5) y(N-6)  y(N-7) y(N-8)  x6*y(N-7)
+		 * x4 x5  x6 x7  y(N-5) y(N-6)  y(N-7) y(N-8)  x7*y(N-8)
+		 */
+
+		// pre-load: fill m0, m2, m1 without multiplying
+		ldinc m0, \x1
+		lddec m2, \x2
+		ldinc m1, \x1
+	
+		// first product, combined with the load of m3
+		mula.dd.lh.lddec m3, \x2, m0, m2
+		// steady state: 4 products and 4 loads per iteration, skipped when count is zero
+		loopnez \count, .loop_end_\ID
+		.loop_\ID:
+			mula.dd.hl.ldinc m0, \x1, m0, m2
+			mula.dd.lh.lddec m2, \x2, m1, m3
+			mula.dd.hl.ldinc m1, \x1, m1, m3
+			mula.dd.lh.lddec m3, \x2, m0, m2
+		.loop_end_\ID:
+	
+		// finalize: the 3 remaining products of the already loaded registers
+		mula.dd.hl m0, m2
+		mula.dd.lh m1, m3
+		mula.dd.hl m1, m3
+
+.endm // fir_s16_ae32_mul
+
+.macro fir_s16_ae32_full x1, x2, count, full_count, ID
+// This macro calculates fixed point dot product for full_count int16 samples:
+// the blocks of 4 samples are processed by fir_s16_ae32_mul, the remaining
+// 2 and/or 1 samples (bits 1 and 0 of full_count) are processed afterwards.
+// x1 - input array1 register (for example a2)
+// x2 - input array2 register (for example a3)
+// count - counter register (for example a7)
+// count -   samples_count / 4 - 1
+// full_count - samples_count
+// ID - suffix appended to the labels, must be unique for every expansion of the macro
+// acc += x1[i + 0]*x2[N - i - 1] + x1[i + 1]*x2[N - i - 2] + x1[i + 2]*x2[N - i - 3] + x1[i + 3]*x2[N - i - 4]; i: 0..count
+// acchi, and acclo have to be initialized before
+// Result - acchi || acclo
+// Modifies: 
+// m0, m1, m2, m3
+// acchi || acclo - must be loaded before (for example 0x3fff to acclo). 
+
+		// the main mac16 multiplication loop is skipped for cases with less than 4 samples
+		blti \full_count, 4, .less_than_4_operands_\ID
+        	fir_s16_ae32_mul \x1, \x2, \count, \ID
+
+		.less_than_4_operands_\ID:
+
+        // bit 1 of full_count set: two more samples, one load per array and two products
+        bbci  \full_count, 1, .mod2chk_\ID
+		    ldinc m0, \x1
+		    lddec m2, \x2
+		    mula.dd.hl m0, m2
+		    mula.dd.lh m0, m2
+	    .mod2chk_\ID:
+
+		// bit 0 of full_count set: one more sample, only one half of each loaded word is multiplied
+		bbci  \full_count, 0, .mod1chk_\ID
+		    ldinc m0, \x1
+		    lddec m2, \x2
+		    mula.dd.lh m0, m2
+	    .mod1chk_\ID:
+
+.endm // fir_s16_ae32_full
@@ -0,0 +1,157 @@
+/*
+ * SPDX-FileCopyrightText: 2022-2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "dsps_fir.h"
+#include "malloc.h"
+#include <string.h>
+#include "dsp_tests.h"
+
+#define ROUNDING_VALUE  0x7fff
+
+/*
+ * Initialize the structure of the decimating int16 FIR filter.
+ *
+ * fir        - filter structure to be filled in
+ * coeffs     - array with the filter coefficients
+ * delay      - delay line array (may be NULL on the esp32s3 optimized path, a buffer is allocated then)
+ * coeffs_len - number of coefficients, must be higher than 1
+ * decim      - decimation factor
+ * start_pos  - initial decimation position, must be lower than decim
+ * shift      - output shift, must be within a range from -40 to 40
+ *
+ * Returns ESP_OK on success, ESP_ERR_DSP_INVALID_LENGTH, ESP_ERR_DSP_PARAM_OUTOFRANGE or
+ * ESP_ERR_DSP_ARRAY_NOT_ALIGNED for invalid arguments and ESP_ERR_NO_MEM when one of the
+ * internal buffers can not be allocated.
+ *
+ * The optimized builds allocate memory which is tracked in fir->free_status:
+ * 0x0001 - delay line, 0x0002 - coefficients and delay line, 0x0004 - rounding buffer.
+ * dsps_fird_s16_aexx_free() must be called to release it, also after an error that is
+ * returned once the rounding buffer has been attached to the structure.
+ */
+esp_err_t dsps_fird_init_s16(fir_s16_t *fir, int16_t *coeffs, int16_t *delay, int16_t coeffs_len, int16_t decim, int16_t start_pos, int16_t shift)
+{
+    fir->coeffs = coeffs;
+    fir->delay = delay;
+    fir->coeffs_len = coeffs_len;
+    fir->pos = 0;
+    fir->decim = decim;
+    fir->d_pos = start_pos;
+    fir->shift = shift;
+    fir->rounding_val = (int16_t)(ROUNDING_VALUE);
+    fir->free_status = 0;
+
+    if (fir->coeffs_len < 2) {                                          // number of coefficients must be higher than 1
+        return ESP_ERR_DSP_INVALID_LENGTH;
+    }
+
+    if ((fir->shift > 40) || (fir->shift < -40)) {                      // shift amount must be within a range from -40 to 40
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+
+    if (fir->d_pos >= fir->decim) {                                     // start position must be lower than decimation
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+
+#if CONFIG_DSP_OPTIMIZED
+
+    // Rounding value buffer primary for a purpose of ee.ld.accx.ip, but used for both the esp32 and esp32s3
+    // dsps_fird_s16_aexx_free() must be called to free the memory after the FIR function is finished
+    int32_t *aexx_rounding_buff = (int32_t *)memalign(16, 2 * sizeof(int32_t));
+    if (aexx_rounding_buff == NULL) {                                   // nothing is owned by the structure yet
+        return ESP_ERR_NO_MEM;
+    }
+
+    long long rounding = (long long)(fir->rounding_val);
+
+    if (fir->shift >= 0) {
+        rounding = (rounding >> fir->shift);
+    } else {
+        rounding = (rounding << (-fir->shift));
+    }
+#if dsps_fird_s16_arp4_enabled
+    fir->pos = start_pos;
+
+    // The delay line is always re-allocated with 8 guard samples in front of and behind the data
+    int16_t *new_delay_buff = (int16_t *)memalign(16, (coeffs_len + 8 * 2) * sizeof(int16_t));
+    if (new_delay_buff == NULL) {
+        free(aexx_rounding_buff);                                       // not attached to the structure yet
+        return ESP_ERR_NO_MEM;
+    }
+    for (int i = 0 ; i < (coeffs_len + 8 * 2) ; i++) {
+        new_delay_buff[i] = 0;
+    }
+    fir->delay = &new_delay_buff[8];
+    fir->free_status |= 0x0001;
+
+#endif // dsps_fird_s16_arp4_enabled
+
+
+    aexx_rounding_buff[0] = (int32_t)(rounding);                        // 32 lower bits (acclo) type reassignment to 32-bit
+    aexx_rounding_buff[1] = (int32_t)((rounding >> 32) & 0xFF);         // 8 higher bits (acchi) shift by 32 and apply the mask
+    fir->rounding_buff = aexx_rounding_buff;
+    fir->free_status |= 0x0004;
+
+#if dsps_fird_s16_aes3_enabled
+
+    if (fir->delay == NULL) {                                   // New delay buffer is allocated if the current delay line is NULL
+        int16_t *new_delay_buff = (int16_t *)memalign(16, coeffs_len * sizeof(int16_t));
+        if (new_delay_buff == NULL) {
+            return ESP_ERR_NO_MEM;                              // rounding buffer stays registered in free_status
+        }
+        fir->delay = new_delay_buff;
+        fir->free_status |= 0x0001;
+    } else {
+        if ((int)fir->delay & 0xf) {                            // Delay line array must be aligned
+            return ESP_ERR_DSP_ARRAY_NOT_ALIGNED;
+        }
+    }
+
+    if ((int)fir->coeffs & 0xf) {                               // Coefficients array must be aligned
+        return ESP_ERR_DSP_ARRAY_NOT_ALIGNED;
+    }
+
+    // If the number of coefficients is not divisible by 8, a new delay line and a new coefficients array are allocated
+    // the newly allocated arrays are divisible by 8. Coefficients are copied from the original fir structure to
+    // the new coeffs array and the remaining space is filled with zeroes
+    // dsps_fird_s16_aexx_free must be called to free the memory after the FIR function is finished
+    if (fir->coeffs_len % 8) {                                           // Number of coefficients must be divisible by 8
+        int16_t zero_coeffs = (8 - (fir->coeffs_len % 8));
+        int16_t new_coeffs_len = fir->coeffs_len + zero_coeffs;
+        int16_t *aes3_delay_buff = (int16_t *)memalign(16, new_coeffs_len * sizeof(int16_t));
+        int16_t *aes3_coeffs_buff = (int16_t *)memalign(16, new_coeffs_len * sizeof(int16_t));
+
+        if ((aes3_delay_buff == NULL) || (aes3_coeffs_buff == NULL)) {
+            free(aes3_delay_buff);                                       // free(NULL) is a no-op
+            free(aes3_coeffs_buff);
+            return ESP_ERR_NO_MEM;
+        }
+
+        for (int i = 0; i < fir->coeffs_len; i++) {                      // copy fir->coeffs to aes3_coeffs_buff
+            aes3_coeffs_buff[i] = fir->coeffs[i];
+        }
+
+        for (int i = fir->coeffs_len; i < new_coeffs_len; i++) {                  // add zeroes to the end
+            aes3_coeffs_buff[i] = 0;
+        }
+
+        if (fir->free_status & 0x0001) {                                 // delay line allocated above is being replaced,
+            free(fir->delay);                                            // release it, otherwise it would leak
+        }
+
+        fir->delay = aes3_delay_buff;
+        fir->coeffs = aes3_coeffs_buff;
+        fir->coeffs_len = new_coeffs_len;
+        fir->free_status |= 0x0002;
+    }
+
+#endif      // dsps_fird_s16_aes3_enabled
+#endif      // CONFIG_DSP_OPTIMIZED
+
+    for (int i = 0; i < fir->coeffs_len; i++) {                                  // Initialize the delay line to zero
+        fir->delay[i] = 0;
+    }
+
+    return ESP_OK;
+}
+
+esp_err_t dsps_fird_s16_aexx_free(fir_s16_t *fir)
+{
+    // Ownership flags kept in free_status:
+    // 0x0001 or 0x0002 - the delay line is owned, 0x0002 - the coefficients are owned,
+    // 0x0004 - the rounding buffer is owned
+    const int owns_delay    = ((fir->free_status & 0x0003) != 0);
+    const int owns_coeffs   = ((fir->free_status & 0x0002) != 0);
+    const int owns_rounding = ((fir->free_status & 0x0004) != 0);
+
+    if (fir->free_status != 0) {
+
+        if (owns_coeffs) {
+            free(fir->coeffs);
+        }
+
+        if (owns_delay) {
+#if dsps_fird_s16_arp4_enabled
+            fir->delay -= 8;                // step back to the start of the allocated block
+#endif
+            free(fir->delay);
+        }
+
+        if (owns_rounding) {
+            free(fir->rounding_buff);
+        }
+
+        fir->free_status = 0;               // nothing is owned any more
+    }
+
+    return ESP_OK;
+}
+
+
+esp_err_t dsps_16_array_rev(int16_t *arr, int16_t len)
+{
+    // Reverse the array in place: swap the outermost pair and walk both indexes
+    // towards the middle. Nothing is touched for len <= 1.
+    int head = 0;
+    int tail = (int)len - 1;
+
+    while (head < tail) {
+        const int16_t swapped = arr[head];
+
+        arr[head] = arr[tail];
+        arr[tail] = swapped;
+        head++;
+        tail--;
+    }
+
+    return ESP_OK;
+}
@@ -0,0 +1,181 @@
+/*
+ * SPDX-FileCopyrightText: 2022 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "dsps_fir_platform.h"
+#if (dsps_fird_s16_ae32_enabled == 1)
+
+#include "dsps_fir_s16_m_ae32.S"
+ 
+// This is FIR filter for ESP32 processor.
+	.text
+	.align  4
+	.global dsps_fird_s16_ae32
+	.type   dsps_fird_s16_ae32,@function
+// The function implements the following C code:
+//int32_t dsps_fird_s16_ansi(fir_s16_t *fir, const int16_t *input, int16_t *output, int32_t len)
+
+
+dsps_fird_s16_ae32: 
+// Input params					Variables
+//
+// fir      - a2				N			- a6
+// input    - a3				pos			- a7
+// output   - a4				rounding_lo - a8
+// len      - a5				d_pos		- a9
+//								&coeffs[N]	- a10
+//								delay		- a11
+//								decim		- a12
+//								rounding_hi - a13	
+//								final_shift - a14 (shift)
+//
+// Offsets of the fir structure fields as they are read below:
+// 0 - coeffs, 4 - delay, 8 - N (coeffs_len), 10 - pos, 12 - decim, 14 - d_pos, 16 - shift, 20 - rounding_buff
+//
+// Stack frame: a1 + 0 - saved coefficient pointer, a1 + 4 - saved len (the return value)
+//
+// NOTE(review): a7 (pos) and d_pos are only loaded from the structure and never stored back
+// in this function, while the ANSI C version updates fir->pos and clears fir->d_pos -
+// confirm that the filter state is not expected to survive between two calls.
+
+	entry    a1,    32				
+
+	l16si    a7,    a2,    10					// a7  - pos
+	l16si	 a6,    a2,    8					// a6  - N
+	l32i	 a10,   a2,    0					// a10 - coeffs
+	addx2	 a10,   a6,    a10					// a10 - coeffs + 2 * N bytes, i.e. &coeffs[N]
+	addi     a10,   a10,  -4					// a10 - 4 bytes back; the 4 are added again before the macro is used
+	s32i	 a10,   a1,    0					// save pointer to a1
+	l32i     a11,   a2,    4         			// a11 - delay line
+	l16si    a12,   a2,    12        			// a12 - decimation		
+	l16si    a9,    a2,    14        			// a9  - d_pos			
+	l16si    a14,   a2,    16					// a14 - shift		    
+		
+	// prepare rounding value		
+	l32i     a15,   a2,    20                   // get address of rounding array to a15
+	l32i 	 a8,    a15,    0					// a8 =  lower 32 bits of the rounding value (acclo)
+	l32i     a13,   a15,    4					// a13 = higher 8 bits of the rounding value (acchi), offset 4 (32 bits)
+		
+	// prepare final_shift value					
+	addi 	 a14,   a14,  -15					// shift - 15
+	abs		 a15, 	a14
+	blti	 a15,    32,   _shift_lower_than_32_init		// check if lower than 32
+
+												// greater than 32 could only be negative shift ((-40 to +40) - 15) -> -55 to +25 
+	addi	 a14, 	a14,   32					// if greater than 32, add 32 (SRC is not defined for SAR greater than 32)
+	_shift_lower_than_32_init:
+
+	bltz	 a14,   _shift_negative_init		// branch if lower than zero (not including zero)
+	beqz	 a14,	_shift_negative_init		// branch if equal to zero (add zero to the previous statement)
+	ssl		 a14								// if positive, set SAR register to left shift value (SAR = 32 - shift)
+	
+	j _end_of_shift_init
+
+	_shift_negative_init:						// negative shift
+	abs		 a14,   a14							// absolute value
+	ssr		 a14								// SAR = -shift
+	// final_shift is saved to SAR register, SAR is not being changed during the execution
+
+	_end_of_shift_init:	
+	l16si    a14,   a2,    16					// a14 - load shift value
+	addi 	 a14,   a14,  -15					// shift - 15, a14 is only used to select the shift direction from here on
+	
+	s32i     a5,    a1,    4             		// save len to a1, used as the return value
+
+
+ 	// first delay line load (decim - d_pos) when d_pos is not 0
+	beqz	a9,     _fird_loop_len
+	sub		a15,    a12, a9						// a15 = decim - d_pos
+
+	loopnez a15,  ._loop_d_pos
+		 
+		blt    a7,   a6,   reset_fir_pos_d_pos	// skip the wrap-around while fir->pos < fir->N
+			movi.n   a7,   0					// fir->pos = 0
+			l32i     a11,  a2,   4      		// reset delay line to the beginning
+		reset_fir_pos_d_pos:		
+
+		l16si	 a15,  a3,   0					// load 16 bits from input (a3) to a15
+		addi 	 a7,   a7,   1					// fir->pos++
+		s16i	 a15,  a11,  0					// save 16 bits from a15 to delay line (a11)
+		addi	 a3,   a3,   2					// increment input pointer
+		addi	 a11, a11,   2					// increment delay line pointer
+	._loop_d_pos:		
+
+	j .fill_delay_line							// skip the first iteration of the delay line filling routine
+
+	// outer loop
+	_fird_loop_len:
+
+		// copy decim input samples to the delay line
+		loopnez a12, .fill_delay_line
+
+			blt a7, a6, reset_fir_pos			// skip the wrap-around while fir->pos < fir->N
+				movi.n   a7,   0				// fir->pos = 0
+				l32i	 a11,  a2,  4       	// reset delay line to the beginning
+			reset_fir_pos:		
+
+			l16si	 a15,  a3,    0				// load 16 bits from input (a3) to a15
+			addi 	 a7,   a7,    1				// fir->pos++
+			s16i	 a15,  a11,   0				// save 16 bits from a15 to delay line (a11)
+			addi	 a3,   a3,    2				// increment input pointer
+			addi	 a11,  a11,   2				// increment delay line pointer
+		.fill_delay_line:
+
+		// prepare MAC unit
+		wsr	    a8,   acclo						// acclo = a8
+		wsr		a13,  acchi						// acchi = a13
+
+		// first part of the dot product: delay[pos .. N - 1] with the coefficients read backwards from the end
+		addi    a11,  a11,  -4 					// preset delay line pointer, samples (array is being incremented)
+		sub     a9,   a6,    a7   				// a9 = full_count = fir->N - fir->pos
+
+		// (Count / 4) - 1		
+		srli    a15,  a9,    2					// a15 = count = full_count /4
+		addi    a10,  a10,   4 					// preset coeffs pointer, samples (array is being decremented)
+		addi    a15,  a15,  -1					// count - 1
+
+		// x1, x2, count, full_count, ID
+		fir_s16_ae32_full a11, a10, a15, a9, __LINE__
+
+		// second part of the dot product: delay[0 .. pos - 1] with the remaining coefficients
+		l32i	a10,  a2,    0         			// load coeffs
+		l32i 	a11,  a2,    4					// reset delay line to the beginning
+		addx2	a10,  a7,    a10				// a10 = &coeffs[pos], coefficients below it are read backwards
+		
+		srli 	a15,  a7,    2					// a15 = count = full_count (fir->pos) / 4
+		addi    a11,  a11,  -4 					// preset delay line pointer, samples (array is being incremented)
+		addi    a15,  a15,  -1					// count - 1
+
+		// x1, x2, count, full_count, ID
+		fir_s16_ae32_full a11, a10, a15, a7, __LINE__
+
+		// SAR already set from the beginning to final_shift value
+		abs		a15,  a14						// absolute value of shift
+		l32i	a10,  a1, 	 0					// reset coefficient pointer
+		blti    a15,  32,   _shift_lower_than_32
+		// shift of 32 or more: only the acchi bits can reach the output
+		rsr 	a9,   acchi						// get only higher 8 bits of the acc register
+		movi.n	a15,  0xFF						// higher 8 bits mask
+		and		a9,   a9,  a15					// apply mask
+		srl		a15,  a9	
+		j 		_shift_set
+
+		_shift_lower_than_32:
+		rsr 	a9,   acchi						// get higher 8 bits of the acc register
+		movi.n	a11,  0xFF						// higher 8 bits mask
+		rsr 	a15,  acclo						// get lower 32 bits of the acc register
+		and		a9,   a9,  a11					// apply mask
+
+
+		bltz	a14,  _shift_negative 			// branch if lower than zero (if negative)
+		beqz	a14,  _shift_negative
+		src		a15,  a15,  a9					// funnel shift left
+		j 		_shift_set
+
+		_shift_negative:						// negative shift
+		src		a15,  a9,  a15					// funnel shift right
+
+		_shift_set:
+		
+		l32i    a11,  a2,    4					// Load initial position of the delay line
+		s16i	a15,  a4, 	 0					// save the shifted value to the output array (a4)
+		addi 	a5,   a5,   -1					// len--
+		addi	a4,   a4, 	 2					// increase pointer of the output array		
+		addx2	a11,  a7,    a11				// p_delay[fir->pos] - (two times the fir->pos)	
+
+		// counter				
+		bnez    a5,   _fird_loop_len			// next output sample while a5 != 0
+
+	l32i.n     a2,  a1,  4                     	// load return value to a2
+	retw.n
+
+#endif // dsps_fird_s16_ae32_enabled
@@ -0,0 +1,54 @@
+/*
+ * SPDX-FileCopyrightText: 2022-2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "dsps_fir.h"
+
+/*
+ * Decimating int16 FIR filter, reference C implementation.
+ *
+ * fir    - filter structure prepared by dsps_fird_init_s16(); pos, d_pos and the delay line are updated
+ * input  - input samples, (decim - d_pos) samples are consumed for the first output sample
+ *          and decim samples for every following one
+ * output - output samples
+ * len    - number of output samples to produce
+ *
+ * Returns the number of samples written to the output array.
+ */
+int32_t dsps_fird_s16_ansi(fir_s16_t *fir, const int16_t *input, int16_t *output, int32_t len)
+{
+    int32_t result = 0;
+    int32_t input_pos = 0;
+    long long rounding = 0;
+    const int32_t final_shift = fir->shift - 15;
+
+    rounding = (long long)(fir->rounding_val);
+
+    if (fir->shift >= 0) {
+        rounding = (rounding >> fir->shift) & 0xFFFFFFFFFF;         // 40-bit mask
+    } else {
+        rounding = (rounding << (-fir->shift)) & 0xFFFFFFFFFF;      // 40-bit mask
+    }
+
+    // len is already a length of the *output array, calculated as (length of the input array / decimation)
+    for (int i = 0; i < len; i++) {
+
+        // Move the next input samples to the circular delay line
+        for (int j = 0; j < fir->decim - fir->d_pos; j++) {
+
+            if (fir->pos >= fir->coeffs_len) {
+                fir->pos = 0;
+            }
+            fir->delay[fir->pos++] = input[input_pos++];
+        }
+        fir->d_pos = 0;                                             // the start offset applies to the first output only
+
+        long long acc = rounding;
+        int16_t coeff_pos = fir->coeffs_len - 1;
+
+        // Coefficients are walked backwards; the delay line is read from pos to the end and then from its beginning
+        for (int n = fir->pos; n < fir->coeffs_len ; n++) {
+            acc += (int32_t)fir->coeffs[coeff_pos--] * (int32_t)fir->delay[n];
+        }
+        for (int n = 0; n < fir->pos ; n++) {
+            acc += (int32_t)fir->coeffs[coeff_pos--] * (int32_t)fir->delay[n];
+        }
+
+        if (final_shift > 0) {
+            // Left shift of a negative signed value is undefined behaviour in C, shift the
+            // unsigned representation instead; the 16 bits kept by the cast stay the same
+            output[result++] = (int16_t)((unsigned long long)acc << final_shift);
+        } else {
+            output[result++] = (int16_t)(acc >> (-final_shift));
+        }
+
+    }
+    return result;
+}
@@ -0,0 +1,150 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dsps_fir_platform.h"
+#if (dsps_fird_s16_arp4_enabled == 1)
+
+// This is FIR filter for esp32p4 processor.
+    .text
+    .align  4
+    .global dsps_fird_s16_arp4
+    .global dsps_fird_s16_ansi
+    .type   dsps_fird_s16_arp4,@function
+// The function implements the following C code:
+// int32_t dsps_fird_s16_arp4(fir_s16_t *fir, const int16_t *input, int16_t *output, int32_t len);
+
+dsps_fird_s16_arp4:
+// Input params (see the C prototype above):
+// a0 - fir, a1 - input, a2 - output, a3 - len
+// Returns len in a0.
+// Offsets of the fir structure fields as they are read below:
+// 0 - coeffs, 4 - delay, 8 - coeffs_len, 10 - pos, 12 - decim, 16 - shift, 20 - rounding_buff
+
+    // In case of filter length different than 8*K the ANSI C version is used (tail call)
+    lh  t2, 8(a0)   // t2 - coeffs_len
+    andi t2, t2, 7
+    beqz    t2, .dsps_fird_s16_arp4_body
+    j   dsps_fird_s16_ansi
+
+.dsps_fird_s16_arp4_body:
+    // save the s0..s11 registers
+    add sp,sp,-48
+    sw  s0, 0(sp)
+    sw  s1, 4(sp)
+    sw  s2, 8(sp)
+    sw  s3, 12(sp)
+    sw  s4, 16(sp)
+    sw  s5, 20(sp)
+    sw  s6, 24(sp)
+    sw  s7, 28(sp)
+    sw  s8, 32(sp)
+    sw  s9, 36(sp)
+    sw  s10, 40(sp)
+    sw  s11, 44(sp)
+
+    // Keep len for the return value: a3 is used as the loop counter below and
+    // .fast_exit returns a6, which was never written before this fix
+    mv  a6, a3
+
+    // Enable unaligned data access
+    esp.movx.r.cfg t6
+    or t6, t6, 2
+    esp.movx.w.cfg t6
+
+    lw  t1, 4(a0)       // t1 - delay_line
+    lh  t2, 8(a0)       // t2 - coeffs_len
+    lh  t3, 10(a0)      // t3 - pos
+    lh  t6, 16(a0)      // t6 - shift
+    add t6, t6, -15
+    neg t6, t6          // t6 = 15 - shift, shift amount for esp.srs.s.xacc
+    lw  t5, 20(a0)      // t5 - rounding_buff
+    lw  s2, 4(a0)       // s2 - delay_line* current position
+    add s2, s2, t3      // s2 = delay_line + pos*2
+    add s2, s2, t3      //
+    add s4, t2, t2      // s4 = coeff_len*2
+    add s0, t1, s4      // s0 - &delay[coeffs_len]
+
+    lh  a4,  0(t1)      // NOTE(review): a4 is not read afterwards in this function
+.loop_len:
+         lh  t4, 12(a0)          // t4 - decim
+        .loop_decim_copy:
+            lh   s1, 0(a1)      // load input data
+            add  a1, a1, 2
+
+            sh   s1, 0(s2)            
+            add  s2, s2, 2     // increment of the delay line pointer after the store
+            bgt  s0, s2, .skeep_reset
+                lw  s2, 4(a0)       // s2 - delay_line, wrap around at &delay[coeffs_len]
+            .skeep_reset:
+            add  t4, t4, -1
+        bgtz t4, .loop_decim_copy
+
+        // s5 - count1 = length - pos
+        // s6 = count1 >> 3 :  
+        sub  t3, s2, t1
+        srli t3, t3, 1          // t3 = (pos*2)>>1
+        sub  s5, t2, t3
+        srli s6, s5, 3          // s6 = (coeff_len - pos)>>3
+
+        srli s7, t3, 3          // s7 = pos>>3
+        and  s8, t3, 0x07       // s8 = pos&0x07
+
+        esp.ld.xacc.ip           t5, 0                                          // load rounding value to accx
+
+        lw  s10, 0(a0)          // s10 - coeffs
+        esp.vld.128.ip          q0, s10, 16 //q0 - coeffs
+        mv      s9, s2          // s9 - pointer to delay line
+        esp.vld.128.ip          q1, s9, 16  // q1 - delay line data
+
+        beqz s6, .skip_main_loop1
+        esp.lp.setup    0, s6, .main_loop1
+            esp.vmulas.s16.xacc.ld.ip     q0, s10, 16, q0, q1   // q0 - coeffs, q1 - data
+        .main_loop1:     esp.vld.128.ip  q1, s9, 16              // Load delay line    
+.skip_main_loop1: nop
+
+
+        add     s9, s9, -16
+        sub     s9, s9, s4      // step back by the delay line length (coeff_len*2 bytes)
+        beqz s8, .skip_rest_add
+            esp.vld.128.ip          q2, s9, 16
+            esp.vadd.s16            q1, q2, q1
+            esp.vmulas.s16.xacc.ld.ip     q0, s10, 16, q0, q1   // q0 - coeffs, q1 - data
+        .skip_rest_add: 
+        esp.vld.128.ip          q1, s9, 16
+        
+        beqz s7, .skip_main_loop3
+        esp.lp.setup    1, s7, .main_loop3
+            esp.vmulas.s16.xacc.ld.ip     q0, s10, 16, q0, q1 // q0 - coeffs, q1 - data
+            esp.vld.128.ip  q1, s9, 16
+        .main_loop3: nop
+.skip_main_loop3: nop
+
+        // Shift and Store result
+        esp.srs.s.xacc       s11, t6   // shift accx register by final_shift amount (t6), save the result to s11
+        sh  s11, 0(a2)      // store result to output buffer 
+        add a2, a2, 2
+
+        add  a3, a3, -1
+    bgtz a3, .loop_len
+    sh   t3, 10(a0)         // store the updated pos back to the fir structure
+
+.fast_exit:
+    mv  a0, a6              // return value - len
+
+    // restore the s0..s11 registers
+    lw  s0, 0(sp)
+    lw  s1, 4(sp)
+    lw  s2, 8(sp)
+    lw  s3, 12(sp)
+    lw  s4, 16(sp)
+    lw  s5, 20(sp)
+    lw  s6, 24(sp)
+    lw  s7, 28(sp)
+    lw  s8, 32(sp)
+    lw  s9, 36(sp)
+    lw  s10, 40(sp)
+    lw  s11, 44(sp)
+
+    add sp,sp,48
+    ret
+
+#endif //