add some code

2025-09-05 13:25:11 +08:00
parent 9ff0a99e7a
commit 3cf1229a85
8911 changed files with 2535396 additions and 0 deletions
--- a/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fir_f32_ae32.S
+++ b/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fir_f32_ae32.S
@@ -0,0 +1,95 @@
+// Copyright 2018-2023 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dsps_fir_platform.h"
+#if (dsps_fir_f32_ae32_enabled == 1)
+
+#include "dsps_dotprod_f32_m_ae32.S"
+
+// FIR filter (float32) for the ESP32 processor.
+	.text
+	.align  4
+	.global dsps_fir_f32_ae32
+	.type   dsps_fir_f32_ae32,@function
+// The function implements the following C code:
+//esp_err_t dsps_fir_f32_ae32(fir_f32_t* fir, const float* input, float* output, int len);
+//
+// For every input sample the routine stores x[i] into the circular delay
+// line at index pos, advances pos (wrapping to 0 when it reaches N) and then
+// accumulates coeffs[0..N-1] against delay[pos..N-1] followed by
+// delay[0..pos-1]. The updated pos is written back to the fir structure.
+//
+// fir structure fields used (byte offsets): 0 - coeffs, 4 - delay line,
+// 8 - N, 12 - pos.
+// Returns ESP_OK (0) in a2. len <= 0 is treated as "nothing to do".
+
+dsps_fir_f32_ae32: 
+// fir      - a2
+// input    - a3
+// output   - a4
+// len      - a5
+
+	entry	a1, 16
+	// The sample loop below tests its counter only after the first pass, so
+	// without this guard len <= 0 would wrap the counter and keep reading and
+	// writing far beyond the caller's buffers.
+	bgei    a5, 1, fir_f32_ae32_len_ok
+	movi.n	a2, 0 // return status ESP_OK, filter state untouched
+	retw.n
+fir_f32_ae32_len_ok:
+	// Array increment for floating point data should be 4
+	l32i    a7,  a2, 12 // a7  - pos
+	movi    a10, 4
+	mull    a13, a7, a10// a13 - a7*4 (byte offset of delay[pos])
+	l32i    a10, a2, 0  // a10 - coeffs
+	l32i    a6,  a2, 8  // a6  - N
+
+	movi.n a9, 0        // a9  - integer 0, used to clear the accumulator
+	movi.n a8, 4        // a8  - coefficient pointer step (bytes)
+	movi.n a12, 4       // a12 - delay line pointer step (bytes)
+
+//  a13 - delay index (in bytes)
+fir_loop_len:
+		// Store to delay line
+		l32i    a11, a2, 4      // a11 - delay line
+		lsi     f0, a3, 0       // f0 = x[i]
+		addi    a3, a3, 4       // x++
+		ssx     f0, a11, a13    // delay[pos] = f0;
+		addi    a13, a13, 4     // a13 += 4 (next float)
+		addi    a7, a7, 1       // a7++ (pos++)
+		// verify delay line: wrap pos back to 0 when it reaches N
+		blt     a7, a6, do_not_reset_a13
+			movi    a13, 0
+			movi    a7,  0
+	do_not_reset_a13:
+		// Calc amount for delay line before end
+		mov     a15, a10        // a15 - coeffs
+		wfr	    f2, a9 // f2 = 0;
+		sub   a14, a6, a7   // a14 = N-pos
+
+		// a11 = &delay[pos]
+		add     a11, a11, a13
+
+		loopnez  a14, first_fir_loop // pos...N-1
+			lsxp     f1, a15, a8     // f1 = *(coeffs++)
+			lsxp     f0, a11, a12    // load delay f0 = *(delay++)
+			madd.s  f2, f0, f1       // f2 += f0*f1
+first_fir_loop:
+		l32i    a11, a2, 4           // a11 - delay line (restart at delay[0])
+		loopnez  a7, second_fir_loop // 0..pos-1
+			lsxp     f1, a15, a8     // f1 = *(coeffs++)
+			lsxp     f0, a11, a12    // load delay f0 = *(delay++)
+			madd.s  f2, f0, f1      // f2 += f0*f1
+second_fir_loop:
+
+		// and after end
+		// Store result
+		ssi     f2, a4, 0
+		addi    a4, a4, 4 // y++ - increment output pointer
+		// Check loop 
+		addi   a5, a5, -1
+	bnez    a5, fir_loop_len
+	// store state
+
+	s32i    a7,  a2, 12 // pos = a7
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+#endif // dsps_fir_f32_ae32_enabled
--- a/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fir_f32_aes3.S
+++ b/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fir_f32_aes3.S
@@ -0,0 +1,233 @@
+// Copyright 2018-2023 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dsps_fir_platform.h"
+#if (dsps_fir_f32_aes3_enabled == 1)
+
+// FIR filter (float32) for the Esp32s3 processor.
+    .text
+    .align  4
+    .global dsps_fir_f32_aes3
+    .type   dsps_fir_f32_aes3,@function
+// The function implements the following C code:
+//esp_err_t dsps_fir_f32_aes3(fir_f32_t* fir, const float* input, float* output, int len);
+//
+// Each input sample is written to the circular delay line, then the dot
+// product of the coefficients with the delay line (starting at the new
+// write position and wrapping at N) is computed four floats at a time with
+// 128-bit loads. Because the 128-bit loads start from the write pointer
+// rounded down to 16 bytes, one of four code paths (.offset_0 .. .offset_3)
+// is chosen from the byte offset of the write pointer inside its 16-byte
+// group.
+//
+// fir structure fields used (byte offsets): 0 - coeffs, 4 - delay line,
+// 8 - N, 12 - pos.
+// Returns ESP_OK (0) in a2. len <= 0 is treated as "nothing to do".
+
+dsps_fir_f32_aes3:
+// fir      - a2
+// input    - a3
+// output   - a4
+// len      - a5
+
+// a2 - fir structure
+// a3 - input
+// a4 - output
+// a5 - length
+
+// a6 - fir length
+// a7 - position in delay line
+// a8 - temp
+// a9 - const 0
+// a10 - coeffs ptr
+// a11 - delay line ptr
+// a12 - const -16 (mask that rounds an address down to 16 bytes)
+// a13 - const 15 (mask that extracts the offset inside 16 bytes)
+// a14 - temp for loops
+// a15 - delay line rounded to 16
+
+    entry	a1, 16
+    // The sample loop below tests its counter only after the first pass, so
+    // without this guard len <= 0 would wrap the counter and keep reading and
+    // writing far beyond the caller's buffers.
+    bgei    a5, 1, .fir_f32_aes3_len_ok
+    movi.n	a2, 0 // return status ESP_OK, filter state untouched
+    retw.n
+.fir_f32_aes3_len_ok:
+    // Array increment for floating point data should be 4
+    l32i    a7,  a2, 12 // a7  - pos
+
+    l32i    a6,  a2, 8  // a6  - N - amount of coefficients
+    l32i    a10, a2, 0  // a10 - coeffs
+    l32i    a11, a2, 4  // a11 - delay line
+    addx4	a11, a7, a11 // a11 = a11 + a7*4	
+
+    movi.n	a9, 0
+
+    movi.n	a12, -16
+    movi.n	a13, 15
+// Main loop for input samples
+.fir_loop_len:
+        // Store to delay line
+        lsip	f15,  a3, 4		// a3  += 4, f15 = input[n]
+        ssip	f15, a11, 4		// *a11 = f15, a11 += 4
+        addi    a7,  a7,  1     // a7++ - position in delay line
+
+        // Wrap the write pointer and pos when the end of the delay line is reached
+        blt     a7, a6, .do_not_reset_a11
+            l32i    a11, a2, 4	// Load delay line
+            movi    a7,  0
+    .do_not_reset_a11:
+        // Load rounded delay line address
+        and     a15, a11, a12
+
+        l32i    a10, a2, 0  // a10 - coeffs
+
+        // Clear f4..f7 for multiplications (four partial sums)
+        const.s f4, 0
+        const.s f5, 0
+        const.s f6, 0
+        const.s f7, 0
+
+        // Dispatch on the byte offset (0, 4, 8 or 12) of a11 inside its 16-byte group.
+        // Any other offset falls through to .offset_0.
+        and		a8, a11, a13		// a8 = a11 & 15
+        beqz   	a8, .offset_0
+        addi   	a8, a8, -4
+        beqz   	a8, .offset_1
+        addi   	a8, a8, -4
+        beqz   	a8, .offset_2
+        addi   	a8, a8, -4
+        beqz   	a8, .offset_3
+
+// a10 - coeffs
+// a11 - delay line
+.offset_0:
+        sub   a14, a6, a7   // a14 = N-pos
+        srli  a14, a14, 2   // a14 = (N-pos)/4 groups of four floats
+        loopnez  a14, .first_fir_loop_0 // pos...N-1
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+            madd.s  f4, f0, f8
+            madd.s  f5, f1, f9
+            madd.s  f6, f2, f10
+            madd.s  f7, f3, f11
+        .first_fir_loop_0:
+        
+        l32i    a15, a2, 4  // a15 - delay line [0]	
+        srli  a14, a7, 2    // a14 = pos/4 groups of four floats
+        loopnez  a14, .second_fir_loop_0 // 0..pos
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+            madd.s  f4, f0, f8
+            madd.s  f5, f1, f9
+            madd.s  f6, f2, f10
+            madd.s  f7, f3, f11
+        .second_fir_loop_0:
+        j    .store_fir_result;
+
+.offset_1:
+        sub   a14, a6, a7   // a14 = N-pos
+        addi  a14, a14, 3
+        srli  a14, a14, 2   // a14 = (N-pos+3)/4, rounded up
+        EE.LDF.128.IP f11, f10, f9, f12, a15, 16 // Load data from delay line
+        // f12 - delay[N-1], store for the last operation
+        // f9..f11 - delay[0..2]
+        loopnez  a14, .first_fir_loop_1 // pos...N-1
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            madd.s  f4, f0, f9
+            madd.s  f5, f1, f10
+            madd.s  f6, f2, f11
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+            madd.s  f7, f3, f8
+        .first_fir_loop_1:
+        
+        l32i    a15, a2, 4  // a15 - delay line [0]
+        EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line		
+        srli  a14, a7, 2
+        loopnez  a14, .second_fir_loop_1 // 0..pos
+            madd.s  f4, f3, f8
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            madd.s  f5, f0, f9
+            madd.s  f6, f1, f10
+            madd.s  f7, f2, f11
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+        .second_fir_loop_1:
+
+        madd.s  f4, f3, f12
+        j    .store_fir_result;
+
+.offset_2:
+        sub   a14, a6, a7   // a14 = N-pos
+        addi  a14, a14, 3
+        srli  a14, a14, 2   // a14 = (N-pos+3)/4, rounded up
+        EE.LDF.128.IP f11, f10, f13, f12, a15, 16 // Load data from delay line
+        // f12, f13 - delay[N-1], delay[N-2], store for the last operation
+        // f10..f11 - delay[0..1]
+        loopnez  a14, .first_fir_loop_2 // pos...N-1
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            madd.s  f4, f0, f10
+            madd.s  f5, f1, f11
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+            madd.s  f6, f2, f8
+            madd.s  f7, f3, f9
+        .first_fir_loop_2:
+        
+        l32i    a15, a2, 4  // a15 - delay line [0]
+        EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line		
+        srli  a14, a7, 2
+        loopnez  a14, .second_fir_loop_2 // 0..pos
+            madd.s  f4, f2, f8
+            madd.s  f5, f3, f9
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            madd.s  f6, f0, f10
+            madd.s  f7, f1, f11
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+        .second_fir_loop_2:
+
+        madd.s  f4, f2, f12
+        madd.s  f5, f3, f13
+        j    .store_fir_result;
+
+.offset_3:
+        sub   a14, a6, a7   // a14 = N-pos
+        addi  a14, a14, 3
+        srli  a14, a14, 2   // a14 = (N-pos+3)/4, rounded up
+        EE.LDF.128.IP f11, f14, f13, f12, a15, 16 // Load data from delay line
+        // f12, f13, f14 - delay[N-1], delay[N-2], delay[N-3], store for the last operation
+        // f11 - delay[0]
+        loopnez  a14, .first_fir_loop_3 // pos...N-1
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            madd.s  f4, f0, f11
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+            madd.s  f5, f1, f8
+            madd.s  f6, f2, f9
+            madd.s  f7, f3, f10
+        .first_fir_loop_3:
+        
+        l32i    a15, a2, 4  // a15 - delay line [0]
+        EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line		
+        srli  a14, a7, 2
+        loopnez  a14, .second_fir_loop_3 // 0..pos
+            madd.s  f4, f1, f8
+            madd.s  f5, f2, f9
+            madd.s  f6, f3, f10
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            madd.s  f7, f0, f11
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+        .second_fir_loop_3:
+
+        madd.s  f4, f1, f12
+        madd.s  f5, f2, f13
+        madd.s  f4, f3, f14
+
+.store_fir_result:
+
+    // Fold the four partial sums into f4
+    add.s   f4, f4, f5
+    add.s   f6, f6, f7
+    add.s   f4, f4, f6
+
+    // Store result
+    ssip     f4, a4, 4  // y++ - save result and increment output pointer
+    // Check loop length
+    addi   a5, a5, -1
+    bnez    a5, .fir_loop_len
+    // store state
+
+    s32i    a7,  a2, 12 // pos = a7
+    movi.n	a2, 0 // return status ESP_OK
+    retw.n
+
+#endif // dsps_fir_f32_aes3_enabled
--- a/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fir_f32_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fir_f32_ansi.c
@@ -0,0 +1,36 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dsps_fir.h"
+
+// Reference (portable C) implementation of the float FIR filter.
+//
+// Every input sample is written into the circular delay line at fir->pos,
+// fir->pos is advanced (wrapping to 0 at fir->N), and the output sample is
+// the sum of coeffs[k] * delay[...] taken over delay[pos..N-1] followed by
+// delay[0..pos-1]. Always returns ESP_OK; len <= 0 produces no output.
+esp_err_t dsps_fir_f32_ansi(fir_f32_t *fir, const float *input, float *output, int len)
+{
+    int sample = 0;
+
+    while (sample < len) {
+        // Push the newest sample and step the write position, wrapping at N.
+        fir->delay[fir->pos] = input[sample];
+        fir->pos = (fir->pos + 1 < fir->N) ? fir->pos + 1 : 0;
+
+        const float *tap = fir->coeffs;
+        float sum = 0;
+        int idx = fir->pos;
+
+        // Tail of the delay line: delay[pos] .. delay[N-1]
+        while (idx < fir->N) {
+            sum += *tap++ * fir->delay[idx];
+            idx++;
+        }
+        // Head of the delay line: delay[0] .. delay[pos-1]
+        idx = 0;
+        while (idx < fir->pos) {
+            sum += *tap++ * fir->delay[idx];
+            idx++;
+        }
+
+        output[sample] = sum;
+        sample++;
+    }
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fir_init_f32.c
+++ b/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fir_init_f32.c
@@ -0,0 +1,67 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dsps_fir.h"
+#include "malloc.h"
+
+
+// Initialise a float FIR filter structure.
+//
+// fir        - structure to fill in
+// coeffs     - filter coefficients (stored by reference, not copied)
+// delay      - delay line storage; coeffs_len + 4 floats are cleared here.
+//              Pass NULL to have the storage allocated by this function
+//              (release it later with dsps_fir_f32_free()).
+// coeffs_len - number of coefficients
+//
+// Returns ESP_OK on success and ESP_ERR_NO_MEM when the delay line could not
+// be allocated. On ESP32-S3 only, also returns ESP_ERR_DSP_INVALID_LENGTH
+// when coeffs_len is not a multiple of 4 and ESP_ERR_DSP_ARRAY_NOT_ALIGNED
+// when coeffs or delay is not 16-byte aligned; the structure fields are
+// already filled in when these two errors are reported.
+esp_err_t dsps_fir_init_f32(fir_f32_t *fir, float *coeffs, float *delay, int coeffs_len)
+{
+    // Allocate delay line in case if it's NULL
+    if (delay == NULL) {
+#ifdef CONFIG_IDF_TARGET_ESP32S3
+        delay = (float *)memalign(16, (coeffs_len + 4) * sizeof(float));
+#else
+        delay = (float *)malloc((coeffs_len + 4) * sizeof(float));
+#endif // CONFIG_IDF_TARGET_ESP32S3
+        if (delay == NULL) {
+            // Allocation failed: leave the structure in a state that is safe
+            // to pass to dsps_fir_f32_free() and report the error instead of
+            // writing through a NULL pointer below.
+            fir->delay = NULL;
+            fir->use_delay = 0;
+            return ESP_ERR_NO_MEM;
+        }
+        fir->use_delay = 1;
+    } else {
+        fir->use_delay = 0;
+    }
+    // Clear the whole delay line, including the 4 extra floats. This covers
+    // delay[0..coeffs_len-1], so no second clearing pass is needed.
+    for (int i = 0; i < (coeffs_len + 4); i++) {
+        delay[i] = 0;
+    }
+    fir->coeffs = coeffs;
+    fir->delay = delay;
+    fir->N = coeffs_len;
+    fir->pos = 0;
+
+#ifdef CONFIG_IDF_TARGET_ESP32S3
+    // The amount of coefficients should be a multiple of 4
+    if (fir->N % 4 != 0) {
+        return ESP_ERR_DSP_INVALID_LENGTH;
+    }
+    // The coeffs array should be aligned to 16
+    if (((uint32_t)coeffs) & 0x0f) {
+        return ESP_ERR_DSP_ARRAY_NOT_ALIGNED;
+    }
+    // The delay array should be aligned to 16
+    if (((uint32_t)delay) & 0x0f) {
+        return ESP_ERR_DSP_ARRAY_NOT_ALIGNED;
+    }
+#endif // CONFIG_IDF_TARGET_ESP32S3
+
+    return ESP_OK;
+}
+
+// Release the delay line when it is owned by the filter structure
+// (fir->use_delay != 0) and clear the ownership flag. Always returns ESP_OK.
+esp_err_t dsps_fir_f32_free(fir_f32_t *fir)
+{
+    if (fir->use_delay == 0) {
+        // Delay line is not owned by the structure: nothing to release.
+        return ESP_OK;
+    }
+    fir->use_delay = 0;
+    free(fir->delay);
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fird_f32_ae32.S
+++ b/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fird_f32_ae32.S
@@ -0,0 +1,98 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dsps_fir_platform.h"
+#if (dsps_fird_f32_ae32_enabled == 1)
+
+#include "dsps_dotprod_f32_m_ae32.S"
+
+// Decimating FIR filter (float32) for the ESP32 processor.
+	.text
+	.align  4
+	.global dsps_fird_f32_ae32
+	.type   dsps_fird_f32_ae32,@function
+// The function implements the following C code:
+//int dsps_fird_f32_ae32(fir_f32_t* fir, const float* input, float* output, int len);
+//
+// For each of the len output samples, "decimation" input samples are pushed
+// into the circular delay line, then coeffs[0..N-1] are accumulated against
+// delay[pos..N-1] followed by delay[0..pos-1]. The updated pos is written
+// back to the fir structure.
+//
+// fir structure fields used (byte offsets): 0 - coeffs, 4 - delay line,
+// 8 - N, 12 - pos, 16 - decimation.
+// The value returned in a2 is the number of output samples written, not an
+// esp_err_t status. len <= 0 returns 0 without touching the filter state.
+
+dsps_fird_f32_ae32: 
+// fir      - a2
+// input    - a3
+// output   - a4
+// len      - a5
+
+	entry	a1, 16
+	// The output loop below tests its counter only after the first pass, so
+	// without this guard len <= 0 would wrap the counter and keep reading and
+	// writing far beyond the caller's buffers.
+	bgei    a5, 1, fird_f32_ae32_len_ok
+	movi.n	a2, 0 // no output samples produced
+	retw.n
+fird_f32_ae32_len_ok:
+	// Array increment for floating point data should be 4
+	l32i    a7,  a2, 12 // a7  - pos
+	movi    a10, 4
+	mull    a13, a7, a10// a13 - a7*4 (byte offset of delay[pos])
+	l32i    a10, a2, 0  // a10 - coeffs
+	l32i    a11, a2, 4  // a11 - delay line
+	l32i    a6,  a2, 8  // a6  - N
+	l32i    a12, a2, 16  // a12  - decimation
+	movi    a8, 0         // result = 0;
+
+//  a13 - delay index (in bytes)
+fird_loop_len:
+		// Store to delay line
+		
+        loopnez  a12, .fird_load_data // K loops
+			lsip    f0, a3, 4       // f0 = x[i++]
+			ssx     f0, a11, a13    // delay[pos] = f0;
+			addi    a13, a13, 4     // a13 += 4 (next float)
+			addi    a7, a7, 1       // a7++
+			// verify delay line: wrap pos back to 0 when it reaches N
+			blt     a7, a6, do_not_reset_a13
+				movi    a13, 0
+				movi    a7,  0
+			do_not_reset_a13:
+        	const.s f2, 0           // f2 = 0, accumulator for this output sample
+		.fird_load_data:
+
+		addi    a8, a8, 1           // result++
+
+		// Calc amount for delay line before end
+		mov     a15, a10        // a15 - coeffs
+		sub   a14, a6, a7   // a14 = N-pos
+		loopnez  a14, first_fird_loop // pos...N-1
+			lsip    f1, a15, 4		// a15++
+			lsx     f0, a11, a13    // load delay f0 = delay[pos]
+			addi    a13, a13, 4     // a13++, pos++
+			madd.s  f2, f0, f1      // f2 += f0*f1
+first_fird_loop:
+		movi a13, 0    // load delay line counter to 0
+		loopnez  a7, second_fird_loop // 0..pos-1
+			lsip    f1, a15, 4		// a15++
+			lsx     f0, a11, a13    // load delay f0 = delay[pos]
+			addi    a13, a13, 4     // a13++, pos++
+			madd.s  f2, f0, f1      // f2 += f0*f1
+second_fird_loop:
+
+		// and after end
+		// Store result
+		ssi     f2, a4, 0
+		addi    a4, a4, 4 // y++ - increment output pointer
+next_itt_fir32:        
+		// Check loop 
+		addi   a5, a5, -1
+	bnez    a5, fird_loop_len
+	// store state
+
+	s32i    a7,  a2, 12 // pos = a7
+	
+	mov 	a2, a8 // return the number of output samples
+	retw.n
+
+#endif // dsps_fird_f32_ae32_enabled
--- a/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fird_f32_aes3.S
+++ b/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fird_f32_aes3.S
@@ -0,0 +1,239 @@
+// Copyright 2018-2023 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dsps_fir_platform.h"
+#if (dsps_fird_f32_aes3_enabled == 1)
+
+// Decimating FIR filter (float32) for the Esp32s3 processor.
+    .text
+    .align  4
+    .global dsps_fird_f32_aes3
+    .type   dsps_fird_f32_aes3,@function
+// The function implements the following C code:
+//int dsps_fird_f32_aes3(fir_f32_t* fir, const float* input, float* output, int len);
+//
+// For each of the len output samples, "decimation" input samples are written
+// to the circular delay line, then the dot product of the coefficients with
+// the delay line (starting at the new write position and wrapping at N) is
+// computed four floats at a time with 128-bit loads. Because the 128-bit
+// loads start from the write pointer rounded down to 16 bytes, one of four
+// code paths (.offset_0 .. .offset_3) is chosen from the byte offset of the
+// write pointer inside its 16-byte group.
+//
+// fir structure fields used (byte offsets): 0 - coeffs, 4 - delay line,
+// 8 - N, 12 - pos, 16 - decimation.
+// The value returned in a2 is the number of output samples written (len),
+// not an esp_err_t status. len <= 0 returns 0 without touching the state.
+
+dsps_fird_f32_aes3:
+// fir      - a2
+// input    - a3
+// output   - a4
+// len      - a5
+
+// a2 - fir structure
+// a3 - input
+// a4 - output
+// a5 - length
+
+// a6 - fir length
+// a7 - position in delay line
+// a8 - temp
+// a9 - copy of len, returned as the number of output samples
+// a10 - coeffs ptr
+// a11 - delay line ptr
+// a12 - const -16 (mask that rounds an address down to 16 bytes)
+// a13 - const 15 (mask that extracts the offset inside 16 bytes)
+// a14 - temp for loops
+// a15 - delay line rounded to 16
+
+    entry	a1, 16
+    // The output loop below tests its counter only after the first pass, so
+    // without this guard len <= 0 would wrap the counter and keep reading and
+    // writing far beyond the caller's buffers.
+    bgei    a5, 1, .fird_f32_aes3_len_ok
+    movi.n	a2, 0 // no output samples produced
+    retw.n
+.fird_f32_aes3_len_ok:
+    // Array increment for floating point data should be 4
+    l32i    a7,  a2, 12 // a7  - pos
+
+    l32i    a6,  a2, 8  // a6  - N - amount of coefficients
+    l32i    a10, a2, 0  // a10 - coeffs
+    l32i    a11, a2, 4  // a11 - delay line
+    addx4	a11, a7, a11 // a11 = a11 + a7*4	
+
+    mov.n    a9, a5     // remember len for the return value
+
+    movi.n	a12, -16
+    movi.n	a13, 15
+// Main loop for input samples
+.fird_loop_len:
+        // Store K values from input to delay line:
+
+        l32i    a14,  a2, 16   // a14  - decimation
+        loopnez  a14, .fird_load_data // K loops
+            // Store to delay line
+            lsip	f15,  a3, 4		// a3  += 4, f15 = input[n]
+            ssip	f15, a11, 4		// *a11 = f15, a11 += 4
+            addi    a7,  a7,  1     // a7++ - position in delay line
+
+            // Wrap the write pointer and pos at the end of the delay line
+            blt     a7, a6, .do_not_reset_a11
+                l32i    a11, a2, 4	// Load delay line
+                movi    a7,  0
+            .do_not_reset_a11:
+            and     a15, a11, a12   // a15 = write pointer rounded down to 16
+        .fird_load_data:
+        //
+        // Process data
+        //
+        // Load rounded delay line address
+
+        l32i    a10, a2, 0  // a10 - coeffs
+
+        // Clear f4..f7 for multiplications (four partial sums)
+        const.s f4, 0
+        const.s f5, 0
+        const.s f6, 0
+        const.s f7, 0
+
+        // Dispatch on the byte offset (0, 4, 8 or 12) of a11 inside its 16-byte group.
+        // Any other offset falls through to .offset_0.
+        and		a8, a11, a13		// a8 = a11 & 15
+        beqz   	a8, .offset_0
+        addi   	a8, a8, -4
+        beqz   	a8, .offset_1
+        addi   	a8, a8, -4
+        beqz   	a8, .offset_2
+        addi   	a8, a8, -4
+        beqz   	a8, .offset_3
+
+// a10 - coeffs
+// a11 - delay line
+.offset_0:
+        sub   a14, a6, a7   // a14 = N-pos
+        srli  a14, a14, 2   // a14 = (N-pos)/4 groups of four floats
+        loopnez  a14, .first_fir_loop_0 // pos...N-1
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+            madd.s  f4, f0, f8
+            madd.s  f5, f1, f9
+            madd.s  f6, f2, f10
+            madd.s  f7, f3, f11
+        .first_fir_loop_0:
+        
+        l32i    a15, a2, 4  // a15 - delay line [0]	
+        srli  a14, a7, 2    // a14 = pos/4 groups of four floats
+        loopnez  a14, .second_fir_loop_0 // 0..pos
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+            madd.s  f4, f0, f8
+            madd.s  f5, f1, f9
+            madd.s  f6, f2, f10
+            madd.s  f7, f3, f11
+        .second_fir_loop_0:
+        j    .store_fir_result;
+
+.offset_1:
+        sub   a14, a6, a7   // a14 = N-pos
+        addi  a14, a14, 3
+        srli  a14, a14, 2   // a14 = (N-pos+3)/4, rounded up
+        EE.LDF.128.IP f11, f10, f9, f12, a15, 16 // Load data from delay line
+        // f12 - delay[N-1], store for the last operation
+        // f9..f11 - delay[0..2]
+        loopnez  a14, .first_fir_loop_1 // pos...N-1
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            madd.s  f4, f0, f9
+            madd.s  f5, f1, f10
+            madd.s  f6, f2, f11
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+            madd.s  f7, f3, f8
+        .first_fir_loop_1:
+        
+        l32i    a15, a2, 4  // a15 - delay line [0]
+        EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line		
+        srli  a14, a7, 2
+        loopnez  a14, .second_fir_loop_1 // 0..pos
+            madd.s  f4, f3, f8
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            madd.s  f5, f0, f9
+            madd.s  f6, f1, f10
+            madd.s  f7, f2, f11
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+        .second_fir_loop_1:
+
+        madd.s  f4, f3, f12
+        j    .store_fir_result;
+
+.offset_2:
+        sub   a14, a6, a7   // a14 = N-pos
+        addi  a14, a14, 3
+        srli  a14, a14, 2   // a14 = (N-pos+3)/4, rounded up
+        EE.LDF.128.IP f11, f10, f13, f12, a15, 16 // Load data from delay line
+        // f12, f13 - delay[N-1], delay[N-2], store for the last operation
+        // f10..f11 - delay[0..1]
+        loopnez  a14, .first_fir_loop_2 // pos...N-1
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            madd.s  f4, f0, f10
+            madd.s  f5, f1, f11
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+            madd.s  f6, f2, f8
+            madd.s  f7, f3, f9
+        .first_fir_loop_2:
+        
+        l32i    a15, a2, 4  // a15 - delay line [0]
+        EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line		
+        srli  a14, a7, 2
+        loopnez  a14, .second_fir_loop_2 // 0..pos
+            madd.s  f4, f2, f8
+            madd.s  f5, f3, f9
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            madd.s  f6, f0, f10
+            madd.s  f7, f1, f11
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+        .second_fir_loop_2:
+
+        madd.s  f4, f2, f12
+        madd.s  f5, f3, f13
+        j    .store_fir_result;
+
+.offset_3:
+        sub   a14, a6, a7   // a14 = N-pos
+        addi  a14, a14, 3
+        srli  a14, a14, 2   // a14 = (N-pos+3)/4, rounded up
+        EE.LDF.128.IP f11, f14, f13, f12, a15, 16 // Load data from delay line
+        // f12, f13, f14 - delay[N-1], delay[N-2], delay[N-3], store for the last operation
+        // f11 - delay[0]
+        loopnez  a14, .first_fir_loop_3 // pos...N-1
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            madd.s  f4, f0, f11
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+            madd.s  f5, f1, f8
+            madd.s  f6, f2, f9
+            madd.s  f7, f3, f10
+        .first_fir_loop_3:
+        
+        l32i    a15, a2, 4  // a15 - delay line [0]
+        EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line		
+        srli  a14, a7, 2
+        loopnez  a14, .second_fir_loop_3 // 0..pos
+            madd.s  f4, f1, f8
+            madd.s  f5, f2, f9
+            madd.s  f6, f3, f10
+            EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
+            madd.s  f7, f0, f11
+            EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
+        .second_fir_loop_3:
+
+        madd.s  f4, f1, f12
+        madd.s  f5, f2, f13
+        madd.s  f4, f3, f14
+
+.store_fir_result:
+
+    // Fold the four partial sums into f4
+    add.s   f4, f4, f5
+    add.s   f6, f6, f7
+    add.s   f4, f4, f6
+
+    // Store result
+    ssip     f4, a4, 4  // y++ - save result and increment output pointer
+    // Check loop length
+    addi   a5, a5, -1
+    bnez    a5, .fird_loop_len
+    // store state
+
+    s32i    a7,  a2, 12 // pos = a7
+    mov.n    a2,  a9    // return the number of output samples
+    retw.n
+
+#endif // dsps_fird_f32_aes3_enabled
--- a/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fird_f32_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fird_f32_ansi.c
@@ -0,0 +1,38 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dsps_fir.h"
+
+// Reference (portable C) implementation of the decimating float FIR filter.
+//
+// For each of the len output samples, fir->decim input samples are pushed
+// into the circular delay line (fir->pos wraps to 0 at fir->N); the output
+// is then the sum of coeffs[k] * delay[...] taken over delay[pos..N-1]
+// followed by delay[0..pos-1].
+// Returns the number of output samples written (0 when len <= 0).
+int dsps_fird_f32_ansi(fir_f32_t *fir, const float *input, float *output, int len)
+{
+    int produced;
+
+    for (produced = 0; produced < len; produced++) {
+        // Feed fir->decim new samples into the delay line.
+        int fed = 0;
+        while (fed < fir->decim) {
+            fir->delay[fir->pos] = *input;
+            input++;
+            fir->pos++;
+            if (fir->pos >= fir->N) {
+                fir->pos = 0;
+            }
+            fed++;
+        }
+
+        const float *tap = fir->coeffs;
+        float sum = 0;
+        int idx = fir->pos;
+
+        // Tail of the delay line: delay[pos] .. delay[N-1]
+        while (idx < fir->N) {
+            sum += *tap++ * fir->delay[idx];
+            idx++;
+        }
+        // Head of the delay line: delay[0] .. delay[pos-1]
+        idx = 0;
+        while (idx < fir->pos) {
+            sum += *tap++ * fir->delay[idx];
+            idx++;
+        }
+
+        output[produced] = sum;
+    }
+    return produced;
+}
--- a/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fird_f32_arp4.S
+++ b/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fird_f32_arp4.S
@@ -0,0 +1,99 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dsps_fir_platform.h"
+#if (dsps_fird_f32_arp4_enabled == 1)
+
+// Decimating FIR filter (float32) for the esp32p4 processor.
+    .text
+    .align  4
+    .global dsps_fird_f32_arp4
+    .type   dsps_fird_f32_arp4,@function
+// The function implements the following C code (a0 = fir, a1 = input, a2 = output, a3 = len; the value left in a0 on return is the initial len, see the end of the routine):
+//esp_err_t dsps_fird_f32_arp4(fir_f32_t* fir, const float* input, float* output, int len);
+// NOTE(review): the output loop tests a3 only after decrementing it, so len <= 0 is not handled here - presumably callers guarantee len >= 1; confirm.
+dsps_fird_f32_arp4:
+    add sp,sp,-16   // reserve 16 bytes of stack (nothing is stored in it below)
+    
+    mv  a6, a3      // a6 - copy of len, used as the return value
+    lw  t1, 4(a0)   // t1 - delay
+    lw  a4, 4(a0)   // a4 - delay (start of the delay line, kept for the pos calculations)
+    lw  t2, 8(a0)   // t2 - N :FIR filter coefficients amount
+    lw  t3, 12(a0)  // t3 - pos
+    lw  t4, 16(a0)  // t4 - decim
+    slli    t3, t3, 2   // t3 = pos*4 (bytes)
+    add     t1, t1, t3  // t1 = &delay[pos], the write pointer
+    slli    t6, t2, 2   // t6 = N*4 (bytes)
+    add     t3, a4, t6  // t3 = &delay[N], one past the end of the delay line
+
+    nop
+.fird_loop_len:         // one pass per output sample
+//    p.lw      a1, 4(a1)
+//fmv.w.x   fa5,zero
+    flw         fa0, 0(a1)  // f0 = x[i],  first load
+    esp.lp.setup    0, t4, .fird_load_data      // hardware loop over t4 (decim) input samples; the label marks the last executed instruction
+        add         a1, a1, 4                   // i++
+        fsw         fa0, 0(t1)                  // delay[pos]
+        add         t1, t1, 4                   // advance the write pointer; the branch below skips the wrap while t1 < t3 (end of delay line)
+        blt         t1, t3, .do_not_reset_pos # if t0 < t1 then target
+            lw  t1, 4(a0)   // wrap: t1 = start of the delay line
+    .do_not_reset_pos:
+    .fird_load_data:    flw         fa0, 0(a1)                  // f0 = x[i]
+
+    lw      t0, 0(a0)   // t0 - coeffs (t0 is not read below; a5 is used as the coefficient pointer)
+    sub     t5, t3, t1  // (last_pos - pos)*4
+    srli    t5, t5, 2   // N-pos
+    sub     t6, t1, a4  // pos*4 (bytes from the start of the delay line)
+    srli    t6, t6, 2   // pos
+
+    fmv.w.x fa2,zero    // fa2 = 0, accumulator for this output sample
+    
+    lw  a5, 0(a0)   // a5 - coeffs 
+    esp.lp.setup    0, t5, .first_fird_loop     // hardware loop over delay[pos..N-1]
+        flw     fa1, 0(a5)                      // fa1 = *coeffs
+        flw     fa0, 0(t1)                      // fa0 = *delay
+        addi    a5, a5, 4                       // coeffs++
+        fmadd.s   fa2, fa1, fa0, fa2            // fa2 += fa1*fa0
+.first_fird_loop:  addi      t1, t1, 4          // delay++
+
+
+    lw  t1, 4(a0)   // t1 - delay (restart at delay[0])
+
+    beqz    t6, .skeep_loop                     // pos == 0: no second part to accumulate
+    esp.lp.setup    0, t6, .second_fird_loop    // hardware loop over delay[0..pos-1]
+        flw     fa1, 0(a5)                      // fa1 = *coeffs
+        flw     fa0, 0(t1)                      // fa0 = *delay
+        addi    a5, a5, 4                       // coeffs++
+        fmadd.s   fa2, fa1, fa0, fa2            // fa2 += fa1*fa0
+.second_fird_loop:   addi      t1, t1, 4        // delay++; t1 ends at &delay[pos] again
+
+.skeep_loop:
+    // Store result
+
+    fsw     fa2, 0(a2)
+    addi    a2, a2, 4   // y++ - increment output pointer
+
+    addi    a3, a3, -1
+    BNEZ    a3, .fird_loop_len// Loop while a3 != 0
+
+    sub     t6,  t1, a4 // pos*4 (bytes from the start of the delay line)
+    srli    t6, t6, 2   // pos
+
+    sw  t6, 12(a0)  // store state: pos = t6
+
+    mv  a0, a6      // return the initial len (one output is written per loop pass)
+    add sp,sp,16
+    ret
+
+#endif // dsps_fird_f32_arp4_enabled
--- a/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fird_init_f32.c
+++ b/managed_components/espressif__esp-dsp/modules/fir/float/dsps_fird_init_f32.c
@@ -0,0 +1,46 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dsps_fir.h"
+
+
+// Initialise a decimating float FIR filter structure.
+//
+// fir    - structure to fill in
+// coeffs - filter coefficients (stored by reference, not copied)
+// delay  - caller-provided delay line; the first N floats are cleared here
+// N      - number of coefficients
+// decim  - decimation factor stored in fir->decim
+//
+// Returns ESP_OK on success. On ESP32-S3 only, returns
+// ESP_ERR_DSP_INVALID_LENGTH when N is not a multiple of 4 and
+// ESP_ERR_DSP_ARRAY_NOT_ALIGNED when coeffs or delay is not 16-byte aligned;
+// the structure fields are already filled in when these errors are reported.
+esp_err_t dsps_fird_init_f32(fir_f32_t *fir, float *coeffs, float *delay, int N, int decim)
+{
+    fir->coeffs = coeffs;
+    fir->delay = delay;
+    fir->N = N;
+    fir->pos = 0;
+    fir->decim = decim;
+    // The delay line always belongs to the caller here. Clear the ownership
+    // flag so that dsps_fir_f32_free() never frees the caller's buffer based
+    // on an uninitialised value.
+    fir->use_delay = 0;
+
+#ifdef CONFIG_IDF_TARGET_ESP32S3
+    // The amount of coefficients must be a multiple of 4;
+    // if it is not, pad the coefficients with zeros up to a multiple of 4
+    if (fir->N % 4 != 0) {
+        return ESP_ERR_DSP_INVALID_LENGTH;
+    }
+    // The coeffs array should be aligned to 16
+    if (((uint32_t)coeffs) & 0x0f) {
+        return ESP_ERR_DSP_ARRAY_NOT_ALIGNED;
+    }
+    // The delay array should be aligned to 16
+    if (((uint32_t)delay) & 0x0f) {
+        return ESP_ERR_DSP_ARRAY_NOT_ALIGNED;
+    }
+#endif // CONFIG_IDF_TARGET_ESP32S3
+
+    // Start from an empty (all-zero) delay line
+    for (int i = 0 ; i < N; i++) {
+        fir->delay[i] = 0;
+    }
+    return ESP_OK;
+}