add some code

2025-09-05 13:25:11 +08:00
parent 9ff0a99e7a
commit 3cf1229a85
8911 changed files with 2535396 additions and 0 deletions
@@ -0,0 +1,75 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspm_mult_platform.h"
+#if (dspm_mult_3x3x1_f32_ae32_enabled == 1)
+
+// This is a matrix multiplication function for the ESP32 processor.
+	.text
+	.align  4
+	.global dspm_mult_3x3x1_f32_ae32
+	.type   dspm_mult_3x3x1_f32_ae32,@function
+// The function implements the following C code:
+// esp_err_t dspm_mult_3x3x1_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
+// {
+	// for (int i=0 ; i< m ; i++)
+	// {
+	//     for (int j=0 ; j< k ; j++)
+	//     {
+	//         C[i*k + j] = A[i*n]*B[j];
+	//         for (int s=1; s< n ; s++)
+	//         {
+	//             C[i*k + j] += A[i*n + s]*B[s*k + j];
+	//         }
+	//     }
+	// }
+//     return ESP_OK;
+// }
+
+dspm_mult_3x3x1_f32_ae32: // C(3x1) = A(3x3) * B(3x1) on single-precision floats; always returns 0 in a2
+// a2 - A: 3x3 input matrix, walked one row (12 bytes) per loop pass
+// a3 - B: 3-element input vector, loaded once into f13..f15
+// a4 - C: 3-element output vector, one float stored per loop pass
+
+// a5 - integer 0, moved into f0 to clear the accumulator before each row
+// a6 - 3, trip count of the zero-overhead loop (one pass per row of A)
+	entry	a1, 16
+
+	movi a5, 0
+	movi a6, 3
+	
+	lsi	    f13,a3, 0 // f13 = B[0]
+	lsi	    f14,a3, 4 // f14 = B[1]
+	lsi	    f15,a3, 8 // f15 = B[2]
+
+//    addi	    a2, a2, -12 // (disabled) To compensate first increment
+	loopnez     a6, loop_mac_3x3x1_end_m_ae32
+		wfr	    f0, a5 // f0 = 0: restart the accumulator for this row
+		lsi	    f2, a2, 0 // f2 = A[i][0]
+		madd.s	f0, f2, f13 // f0 += A[i][0] * B[0]
+		lsi	    f3, a2, 4 // f3 = A[i][1]
+		madd.s	f0, f3, f14 // f0 += A[i][1] * B[1]
+		lsi	    f4, a2, 8 // f4 = A[i][2]
+		madd.s	f0, f4, f15 // f0 += A[i][2] * B[2]
+		
+		addi	a2, a2, 12         // advance A to the next row
+		ssi	    f0, a4, 0 // C[i] = f0
+		addi    a4, a4, 4 // advance C to the next element
+
+	loop_mac_3x3x1_end_m_ae32:
+
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+#endif //
@@ -0,0 +1,85 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspm_mult_platform.h"
+#if (dspm_mult_3x3x3_f32_ae32_enabled == 1)
+
+// This is a matrix multiplication function for the ESP32 processor.
+	.text
+	.align  4
+	.global dspm_mult_3x3x3_f32_ae32
+	.type   dspm_mult_3x3x3_f32_ae32,@function
+// The function implements the following C code:
+// esp_err_t dspm_mult_3x3x1_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
+// {
+	// for (int i=0 ; i< m ; i++)
+	// {
+	//     for (int j=0 ; j< k ; j++)
+	//     {
+	//         C[i*k + j] = A[i*n]*B[j];
+	//         for (int s=1; s< n ; s++)
+	//         {
+	//             C[i*k + j] += A[i*n + s]*B[s*k + j];
+	//         }
+	//     }
+	// }
+//     return ESP_OK;
+// }
+
+dspm_mult_3x3x3_f32_ae32: // C(3x3) = A(3x3) * B(3x3) on single-precision floats; always returns 0 in a2
+// a2 - A: 3x3 input matrix (left operand), never modified here
+// a3 - B: 3x3 input matrix; advanced by one column (4 bytes) per outer pass
+// a4 - C: 3x3 output matrix; advanced by one column (4 bytes) per outer pass
+
+// a5 - integer 0, moved into f0 to clear the accumulator
+// a6 - 3 - trip count of the inner zero-overhead loop (rows of A / rows of C)
+// a7 - 3 - countdown of the outer loop (columns of B / columns of C)
+	entry	a1, 16
+
+	movi a5, 0
+	movi a6, 3
+	movi a7, 3 // outer loop count
+	
+m_loop_3x3x3:
+		mov a12, a2 // a12 = running pointer to the current row of A
+		mov a14, a4 // a14 = running pointer into the current column of C
+
+		lsi	    f12, a3, 0  // f12 = B[0][j] (j = current column)
+		lsi	    f13, a3, 12 // f13 = B[1][j]
+		lsi	    f14, a3, 24 // f14 = B[2][j]
+
+		loopnez     a6, loop_mac_3x3x3_end_m_ae32
+			wfr	    f0, a5 // f0 = 0: restart the accumulator for this row
+			
+			lsi	    f2, a12, 0 // f2 = A[i][0]
+			madd.s	f0, f2, f12 // f0 += A[i][0] * B[0][j]
+			lsi	    f3, a12, 4 // f3 = A[i][1]
+			madd.s	f0, f3, f13 // f0 += A[i][1] * B[1][j]
+			lsi	    f4, a12, 8 // f4 = A[i][2]
+			madd.s	f0, f4, f14 // f0 += A[i][2] * B[2][j]
+			
+			addi	a12, a12, 12 // next row of A
+			ssi	    f0, a14, 0 // C[i][j] = f0
+			addi    a14, a14, 12 // same column, next row of C
+		loop_mac_3x3x3_end_m_ae32:
+
+		addi a3,  a3,  4  // B: move to the next column
+		addi a4, a4, 4 // C: move to the next column
+		addi a7, a7, -1 // one column done
+	bnez    a7, m_loop_3x3x3
+
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+#endif //
@@ -0,0 +1,77 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspm_mult_platform.h"
+#if (dspm_mult_4x4x1_f32_ae32_enabled == 1)
+
+// This is a matrix multiplication function for the ESP32 processor.
+	.text
+	.align  4
+	.global dspm_mult_4x4x1_f32_ae32
+	.type   dspm_mult_4x4x1_f32_ae32,@function
+// The function implements the following C code:
+// esp_err_t dspm_mult_3x3x1_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
+// {
+	// for (int i=0 ; i< m ; i++)
+	// {
+	//     for (int j=0 ; j< k ; j++)
+	//     {
+	//         C[i*k + j] = A[i*n]*B[j];
+	//         for (int s=1; s< n ; s++)
+	//         {
+	//             C[i*k + j] += A[i*n + s]*B[s*k + j];
+	//         }
+	//     }
+	// }
+//     return ESP_OK;
+// }
+
+dspm_mult_4x4x1_f32_ae32: // C(4x1) = A(4x4) * B(4x1) on single-precision floats; always returns 0 in a2
+// a2 - A: 4x4 input matrix, walked one row (16 bytes) per loop pass
+// a3 - B: 4-element input vector, loaded once into f12..f15
+// a4 - C: 4-element output vector, one float stored per loop pass
+
+// a5 - integer 0, moved into f0 to clear the accumulator before each row
+// a6 - 4, trip count of the zero-overhead loop (one pass per row of A)
+	entry	a1, 16
+
+	movi a5, 0
+	movi a6, 4
+	
+	lsi	    f12,a3, 0  // f12 = B[0]
+	lsi	    f13,a3, 4  // f13 = B[1]
+	lsi	    f14,a3, 8  // f14 = B[2]
+	lsi	    f15,a3, 12 // f15 = B[3]
+
+	loopnez     a6, loop_mac_4x4x1_end_m_ae32
+		wfr	    f0, a5 // f0 = 0: restart the accumulator for this row
+		lsi	    f2, a2, 0 // f2 = A[i][0]
+		madd.s	f0, f2, f12 // f0 += A[i][0] * B[0]
+		lsi	    f3, a2, 4 // f3 = A[i][1]
+		madd.s	f0, f3, f13 // f0 += A[i][1] * B[1]
+		lsi	    f4, a2, 8 // f4 = A[i][2]
+		madd.s	f0, f4, f14 // f0 += A[i][2] * B[2]
+		lsi	    f5, a2, 12 // f5 = A[i][3]
+		madd.s	f0, f5, f15 // f0 += A[i][3] * B[3]
+		
+		addi	a2, a2, 16         // advance A to the next row
+		ssi	    f0, a4, 0 // C[i] = f0
+		addi    a4, a4, 4 // advance C to the next element
+
+	loop_mac_4x4x1_end_m_ae32:
+
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+#endif //
@@ -0,0 +1,88 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspm_mult_platform.h"
+#if (dspm_mult_4x4x4_f32_ae32_enabled == 1)
+
+// This is a matrix multiplication function for the ESP32 processor.
+	.text
+	.align  4
+	.global dspm_mult_4x4x4_f32_ae32
+	.type   dspm_mult_4x4x4_f32_ae32,@function
+// The function implements the following C code:
+// esp_err_t dspm_mult_3x3x1_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
+// {
+	// for (int i=0 ; i< m ; i++)
+	// {
+	//     for (int j=0 ; j< k ; j++)
+	//     {
+	//         C[i*k + j] = A[i*n]*B[j];
+	//         for (int s=1; s< n ; s++)
+	//         {
+	//             C[i*k + j] += A[i*n + s]*B[s*k + j];
+	//         }
+	//     }
+	// }
+//     return ESP_OK;
+// }
+
+dspm_mult_4x4x4_f32_ae32: // C(4x4) = A(4x4) * B(4x4) on single-precision floats; always returns 0 in a2
+// a2 - A: 4x4 input matrix (left operand), never modified here
+// a3 - B: 4x4 input matrix; advanced by one column (4 bytes) per outer pass
+// a4 - C: 4x4 output matrix; advanced by one column (4 bytes) per outer pass
+
+// a5 - integer 0, moved into f0 to clear the accumulator
+// a6 - 4 - trip count of the inner zero-overhead loop (rows of A / rows of C)
+// a7 - 4 - countdown of the outer loop (columns of B / columns of C)
+	entry	a1, 16
+
+	movi a5, 0
+	movi a6, 4
+	movi a7, 4 // outer loop count
+	
+m_loop_4x4x4:
+		mov a12, a2 // a12 = running pointer to the current row of A
+		mov a14, a4 // a14 = running pointer into the current column of C
+
+		lsi	    f12, a3, 0  // f12 = B[0][j] (j = current column)
+		lsi	    f13, a3, 16 // f13 = B[1][j]
+		lsi	    f14, a3, 32 // f14 = B[2][j]
+		lsi	    f15, a3, 48 // f15 = B[3][j]
+
+		loopnez     a6, loop_mac_4x4x4_end_m_ae32
+			wfr	    f0, a5 // f0 = 0: restart the accumulator for this row
+			
+			lsi	    f2, a12, 0 // f2 = A[i][0]
+			madd.s	f0, f2, f12 // f0 += A[i][0] * B[0][j]
+			lsi	    f3, a12, 4 // f3 = A[i][1]
+			madd.s	f0, f3, f13 // f0 += A[i][1] * B[1][j]
+			lsi	    f4, a12, 8 // f4 = A[i][2]
+			madd.s	f0, f4, f14 // f0 += A[i][2] * B[2][j]
+			lsi	    f5, a12, 12 // f5 = A[i][3]
+			madd.s	f0, f5, f15 // f0 += A[i][3] * B[3][j]
+			
+			addi	a12, a12, 16 // next row of A
+			ssi	    f0, a14, 0 // C[i][j] = f0
+			addi    a14, a14, 16 // same column, next row of C
+		loop_mac_4x4x4_end_m_ae32:
+
+		addi a3,  a3,  4  // B: move to the next column
+		addi a4, a4, 4 // C: move to the next column
+		addi a7, a7, -1 // one column done
+	bnez    a7, m_loop_4x4x4
+
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+#endif //
@@ -0,0 +1,88 @@
+/*
+ * SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "dspm_mult_platform.h"
+#if (dspm_mult_f32_ae32_enabled == 1)
+
+#include "dsps_dotprode_f32_m_ae32.S"
+
+ // This is matrix multiplication function for ESP32 processor.
+    .text
+    .align  4
+    .global  dspm_mult_ex_f32_ae32
+    .global .dspm_mult_ex_f32_ae32_body
+    .type    dspm_mult_ex_f32_ae32,@function
+// The function implements the following C code:
+//esp_err_t dspm_mult_ex_f32_ae32(const float *A, const float *B, float *C, int m, int n, int k, int A_padd, int B_padd, int C_padd);
+
+dspm_mult_ex_f32_ae32: // Padded matrix multiply C(m,k) = A(m,n) * B(n,k) on floats; always returns 0 in a2
+
+// A         - a2
+// B         - a3
+// C         - a4
+// m         - a5
+// n         - a6
+// k         - a7
+// A_padding - a14 (loaded from the stack, then turned into A_step = A_padding + n, in elements)
+// B_padding - a15 (loaded from the stack, then turned into B_step * 4 = (B_padding + k) * 4, in bytes)
+// C_padding - a8  (loaded from the stack, kept in elements)
+
+// a10 = 4, byte step between consecutive floats of a row of A
+// a9  - counter loop1: 0..m
+// a11 - counter loop2: 0..k
+// a12 - A pointer handed to the dot-product macro
+// a13 - B pointer handed to the dot-product macro
+// a4  - C, running output pointer
+
+    entry   a1, 16
+    // Exported entry into the body: dspm_mult_ex_f32_aes3 jumps here after executing its own "entry"
+.dspm_mult_ex_f32_ae32_body:
+
+    l32i.n  a14, a1, 16     // A_padding (7th argument, first word above the 16-byte frame)
+    l32i.n  a15, a1, 20     // B_padding (8th argument)
+    l32i.n  a8,  a1, 24     // C_padding (9th argument)
+
+    add     a14, a14, a6    // A_step = A_padding + A_cols (n)
+    add     a15, a15, a7    // B_step = B_padding + B_cols (k)
+    slli    a15, a15, 2     // Pointer increment for B (B_step * 4)
+
+    movi.n  a10, 4          // Increment = 4
+    movi.n  a9, 0           // counter loop1
+    const.s f3, 0           // Initial state of the accumulator, f3 = 0
+
+.mult_ex_loop1:
+    movi.n  a11, 0 // reset counter for loop2
+    .mult_ex_loop2:
+        // One output element: dot product of row i of A with column j of B
+        // a2 - start of row i of A
+        // a3 - B
+        // a6 - n
+        // a10 - step == 4 bytes
+        
+        mov     a12, a2             // a12 = start of row i of A
+        addx4   a13, a11, a3        // a13 = B + j * 4 (top of column j)
+        mov.s   f1, f3              // reset f1
+
+        // Calculating dotproduct...
+        // The macro comes from dsps_dotprode_f32_m_ae32.S (not visible here); presumably it accumulates into f1 - TODO confirm its clobber list
+        dotprode_f32_ae32    a12, a13, a6,   a10,  a15;
+
+        addi.n  a11, a11, 1         // Increment loop2 counter
+        ssip    f1,  a4,  4         // Store result from f1 to memory at a4 and increment a4
+
+        // check loop 2
+        blt   a11, a7, .mult_ex_loop2
+
+    // check loop 1
+    addx4   a2, a14, a2      // A += (A_step << 2)
+    addx4   a4, a8,  a4      // output += (C_padding << 2); a4 already moved k floats in loop2
+    addi.n  a9, a9, 1        // Increment loop1 counter
+    blt     a9, a5, .mult_ex_loop1
+
+    movi.n  a2, 0   // return status ESP_OK
+    retw.n
+
+#endif //dspm_mult_f32_ae32_enabled
@@ -0,0 +1,166 @@
+/*
+ * SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "dspm_mult_platform.h"
+
+#if (dspm_mult_f32_aes3_enabled == 1)
+
+// This is matrix multiplication function for ESP32S3 processor.
+    .text
+    .align  4
+    .global  dspm_mult_ex_f32_aes3
+    .global .dspm_mult_ex_f32_ae32_body
+    .type    dspm_mult_ex_f32_aes3,@function
+// The function implements the following C code:
+//esp_err_t dspm_mult_ex_f32_ansi(const float* A, const float* B, float* C, int A_rows, int A_cols, int B_cols, int A_padding, int B_padding, int C_padding)
+//{
+//    const int A_step = A_cols + A_padding;
+//    const int B_step = B_cols + B_padding;
+//    const int C_step = B_cols + C_padding;
+//
+//    for (int i = 0; i < A_rows; i++) {
+//        for (int j = 0; j < B_cols; j++) {
+//            C[i * C_step + j] = A[i * A_step] * B[j];
+//            for (int s = 1; s < A_cols; s++) {
+//                C[i * C_step + j] += A[i * A_step + s] * B[s * B_step + j];
+//            }
+//        }
+//    }
+//    return ESP_OK;
+//}
+
+// A - a2
+// B - a3
+// C - a4
+// m - a5
+// n - a6
+// k - a7
+// A_padd = a8
+// B_padd = a9
+// C_padd = a15
+
+dspm_mult_ex_f32_aes3: // ESP32-S3 padded matrix multiply: 128-bit path when sizes/pointers allow, else the generic ESP32 body
+
+    entry   a1, 16
+    l32i.n  a8, a1, 16     // A_padding
+    l32i.n  a9, a1, 20     // B_padding
+    l32i.n  a15,  a1, 24   // C_padding
+
+    // Check if we can use S3 memory model
+    // Check matrix dimensions and paddings: all of them must be divisible by 4
+    or      a12, a5, a6         // a12 = m OR n
+    or      a14, a8, a9         // a14 = A_padd OR B_padd
+    or      a12, a12, a7        // a12 = m OR n OR k
+    or      a14, a14, a15       // a14 = A_padd OR B_padd OR C_padd
+    or      a12, a12, a14       // a12 = m OR n OR k OR A_padd OR B_padd OR C_padd
+    movi.n  a11, 3              // a11 = mask of the two low bits
+    and     a12, a12, a11       // a12 = a12 AND 3 (non-zero if any value is not a multiple of 4)
+
+    // Check alignment of A B C matrices data pointers (16 bytes)
+    movi.n  a11, 15             // a11 = mask of the four low bits
+    or      a10, a3,  a2        // a10 = A pointer OR B pointer
+    or      a10, a10, a4        // a10 = A pointer OR B pointer OR C pointer
+    and     a10, a10, a11       // a10 = a10 AND 15 (non-zero if any pointer is not 16-byte aligned)
+    or      a12, a12, a10       // a12 = mat_dim OR alignment
+    beqz    a12, .s3_mmult_ex   // if zero, jump to s3_mult
+    // Fall back to the generic ESP32 body (it reloads the paddings from the stack itself)
+    J      .dspm_mult_ex_f32_ae32_body
+
+.s3_mmult_ex:
+// f0, f1, f2, f3 - multiplication result
+// f4, f5, f6, f7 - input for matrix B
+// f8, f9, f10,f11- input for matrix A
+    movi.n      a14, 0          // B pointer increment for y loop
+
+    add         a15, a15, a7    // a15 = k + C_padding
+    slli        a10, a15, 2     // a10 = (K + C_padding) * 4 - byte step between rows of C
+
+    mov         a15, a9         // a15 = B_padd
+    slli        a15, a15, 2     // a15 = B_padd * 4 - start value of the column-block counter
+
+    add         a7, a7, a9      // a7 = k + B_padding
+    slli        a12, a7, 2      // a12 = (K + B_padding) * 4 - byte step between rows of B, also the end value for a15
+    srli        a11, a6, 2      // a11 = n / 4
+    addi.n      a11, a11, -1    // a11 = inner loop count: n/4 - 1 (the first group of 4 is handled before the loop)
+// NOTE(review): n == 0 passes the divisibility test above and would make a11 = -1 - presumably callers guarantee n >= 4; confirm
+    slli        a6, a8, 2       // a6 = A_padding *4 = A_pointer step
+    mov         a13, a3         // backup B pointer
+    mov         a7, a4          // backup C pointer
+
+.loop_x_mult_ex:
+    movi.n      a9,  0          // reset loop1 counter
+    mov         a8,  a2         // move A matrix back to the beginning
+    .loop_y_mult_ex:
+
+        add  a13, a3, a14       // a13 = B + byte offset of the current 4-column block (a14)
+        EE.LDF.128.IP f11, f10, f9, f8, a8, 16  // Load A values: X11, X12, X13, X14
+        EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y11, Y12, Y13, Y14
+        mul.s   f0, f4, f8      // f0 = X11*Y11
+        mul.s   f1, f5, f8      // f1 = X11*Y12
+        mul.s   f2, f6, f8      // f2 = X11*Y13
+        mul.s   f3, f7, f8      // f3 = X11*Y14
+
+        EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y21, Y22, Y23, Y24
+        madd.s  f0, f4, f9      // f0 = X11*Y11 + X12*Y21
+        madd.s  f1, f5, f9      // f1 = X11*Y12 + X12*Y22
+        madd.s  f2, f6, f9      // f2 = X11*Y13 + X12*Y23
+        madd.s  f3, f7, f9      // f3 = X11*Y14 + X12*Y24
+
+        EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y31, Y32, Y33, Y34
+        madd.s  f0, f4, f10     // f0 = X11*Y11 + X12*Y21 + X13*Y31
+        madd.s  f1, f5, f10     // f1 = X11*Y12 + X12*Y22 + X13*Y32
+        madd.s  f2, f6, f10     // f2 = X11*Y13 + X12*Y23 + X13*Y33
+        madd.s  f3, f7, f10     // f3 = X11*Y14 + X12*Y24 + X13*Y34
+
+        EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y41, Y42, Y43, Y44
+        madd.s  f0, f4, f11     // f0 = X11*Y11 + X12*Y21 + X13*Y31 + X14*Y41
+        madd.s  f1, f5, f11     // f1 = X11*Y12 + X12*Y22 + X13*Y32 + X14*Y42
+        madd.s  f2, f6, f11     // f2 = X11*Y13 + X12*Y23 + X13*Y33 + X14*Y43
+        madd.s  f3, f7, f11     // f3 = X11*Y14 + X12*Y24 + X13*Y34 + X14*Y44
+
+        loopnez a11, .iner_loop_mult_ex
+            EE.LDF.128.IP f11, f10, f9, f8, a8, 16  // Load A values: X15, X16, X17, X18
+
+            EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y51, Y52, Y53, Y54
+            madd.s  f0, f4, f8      // f0 += X15*Y51
+            madd.s  f1, f5, f8      // f1 += X15*Y52
+            madd.s  f2, f6, f8      // f2 += X15*Y53
+            madd.s  f3, f7, f8      // f3 += X15*Y54
+
+            EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y61, Y62, Y63, Y64
+            madd.s  f0, f4, f9      // f0 += X16*Y61
+            madd.s  f1, f5, f9      // f1 += X16*Y62
+            madd.s  f2, f6, f9      // f2 += X16*Y63
+            madd.s  f3, f7, f9      // f3 += X16*Y64
+
+            EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y71, Y72, Y73, Y74
+            madd.s  f0, f4, f10     // f0 += X17*Y71
+            madd.s  f1, f5, f10     // f1 += X17*Y72
+            madd.s  f2, f6, f10     // f2 += X17*Y73
+            madd.s  f3, f7, f10     // f3 += X17*Y74
+
+            EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y81, Y82, Y83, Y84
+            madd.s  f0, f4, f11     // f0 += X18*Y81
+            madd.s  f1, f5, f11     // f1 += X18*Y82
+            madd.s  f2, f6, f11     // f2 += X18*Y83
+            madd.s  f3, f7, f11     // f3 += X18*Y84
+        .iner_loop_mult_ex:
+        EE.STF.128.XP f3, f2, f1, f0, a4, a10 // Store 4 results, then step a4 by a10 to the next row of C
+
+        addi.n  a9,  a9, 1          // Increment loop1 counter
+        add     a8,  a8, a6         // skip the padding at the end of this row of A (A_padding * 4 bytes)
+    blt   a9, a5, .loop_y_mult_ex
+
+    addi.n  a7,  a7,  16            // Increase C pointer by 16 (next block of 4 columns)
+    mov     a4,  a7
+    addi.n  a14, a14, 16            // Increase B pointer by 16
+    addi.n  a15, a15, 16            // Increment loop2 counter by 16
+
+blt   a15, a12, .loop_x_mult_ex
+    movi.n  a2, 0 // return status ESP_OK
+    retw.n
+
+#endif //dspm_mult_f32_aes3_enabled
@@ -0,0 +1,57 @@
+/*
+ * SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "dspm_mult.h"
+
+// Matrix A(m,n): m - number of rows, n - number of columns
+// C(m,k) = A(m,n)*B(n,k)
+// c(i * c_step + j) = sum(a(i * a_step + s) * b(s * b_step + j)), s = 0..n-1
+esp_err_t dspm_mult_ex_f32_ansi(const float *A, const float *B, float *C, int A_rows, int A_cols, int B_cols, int A_padding, int B_padding, int C_padding)
+{
+    // Reject missing buffers.
+    if ((A == NULL) || (B == NULL) || (C == NULL)) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+    // Every dimension must be at least 1.
+    if ((A_rows < 1) || (A_cols < 1) || (B_cols < 1)) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+    // Paddings are counted in elements and may not be negative.
+    if ((A_padding < 0) || (B_padding < 0) || (C_padding < 0)) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+
+    // Distance, in elements, between the starts of two consecutive rows.
+    const int a_stride = A_cols + A_padding;
+    const int b_stride = B_cols + B_padding;
+    const int c_stride = B_cols + C_padding;
+
+    for (int row = 0; row < A_rows; ++row) {
+        const float *a_row = A + row * a_stride;
+        float *c_row = C + row * c_stride;
+
+        for (int col = 0; col < B_cols; ++col) {
+            const float *b_col = B + col;
+            float *dst = c_row + col;
+
+            // The first product initialises the output cell, the rest accumulate into it.
+            *dst = a_row[0] * b_col[0];
+            int inner = 1;
+            while (inner < A_cols) {
+                *dst += a_row[inner] * b_col[inner * b_stride];
+                ++inner;
+            }
+        }
+    }
+    return ESP_OK;
+}
@@ -0,0 +1,115 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspm_mult_platform.h"
+#if (dspm_mult_f32_arp4_enabled == 1)
+
+// This is a matrix multiplication function for the RISC-V "arp4" target (presumably ESP32-P4).
+    .text
+    .align  4
+    .global dspm_mult_ex_f32_arp4
+    .global .dspm_mult_ex_f32_arp4_body
+    .type   dspm_mult_ex_f32_arp4,@function
+// The function implements the following C code:
+// esp_err_t dspm_mult_f32_ansi(const float *A, const float *B, float *C, int m, int n, int k, int A_padd, int B_padd, int C_padd)
+// {
+    // for (int i=0 ; i< m ; i++)
+    // {
+    //     for (int j=0 ; j< k ; j++)
+    //     {
+    //         C[i*k + j] = A[i*n]*B[j];
+    //         for (int s=1; s< n ; s++)
+    //         {
+    //             C[i*k + j] += A[i*n + s]*B[s*k + j];
+    //         }
+    //     }
+    // }
+//     return ESP_OK;
+// }
+
+dspm_mult_ex_f32_arp4: // Padded matrix multiply C(m,k) = A(m,n) * B(n,k) on floats (RISC-V); always returns 0 in a0
+// A - a0 (a2 in the Xtensa version)
+// B - a1 (a3 in the Xtensa version)
+// C - a2 (a4 in the Xtensa version)
+// m - a3 (a5 in the Xtensa version)
+// n - a4 (a6 in the Xtensa version)
+// k - a5 (a7 in the Xtensa version)
+
+// a6 - A_padding on entry; reused for C_padding * 4 after the setup code
+// t0 - not used in this routine
+// a7 - B_padding on entry; reused as counter loop1: 0..m
+// t1 - counter loop2: 0..k
+// t2 - running pointer into the current row of A
+// t3 - running pointer into the current column of B
+// t4 - A row step in bytes: (n + A_padding) * 4
+// t5 - B row step in bytes: (k + B_padding) * 4
+
+    add sp,sp,-16 // reserve 16 bytes of stack (nothing is stored there in this routine)
+    // Exported body label; execution simply falls through into it
+.dspm_mult_ex_f32_arp4_body:
+
+    mv      t5, a7        // t5 = B_padding
+
+    add     t4, a6, a4    // A_step = A_padding + A_cols (n)
+    add     t5, t5, a5    // B_step = B_padding + B_cols (k)
+    slli    t5, t5, 2     // Pointer increment for B (B_step * 4)
+    slli    t4, t4, 2     // A_step << 2
+    lw      a6,  16(sp)   // C_padding: the stack argument, at 16(sp) after the adjustment above
+    slli    a6, a6, 2     // C_padding << 2
+    
+    li  a7, 0  // counter loop1
+
+.dpf_loop1:    
+    li  t1, 0 // reset counter for loop2
+.dpf_loop2:
+
+        // One output element: dot product of row i of A with column j of B
+        // a0 - start of row i of A
+        // a1 - B
+        // a4 - n (trip count of the hardware loop)
+        // t5 - byte step between rows of B
+        // fa2 - accumulator
+        mv      t2, a0 // t2 = start of row i of A
+
+        slli     t3, t1, 2 // loop count to pointer value
+        add      t3, a1, t3 // t3 = B + j * 4 (top of column j)
+
+        fmv.w.x fa2,zero // reset fa2
+        // Calculating dotproduct...
+        esp.lp.setup    0, a4, .matrix_mul_loop // hardware loop, a4 (n) passes; presumably the labelled instruction is the last one of the body - confirm
+            flw     fa0, 0(t2) // fa0 = A[i][s]
+            add     t2, t2, 4 // next element of the row of A
+            flw     fa1, 0(t3) // fa1 = B[s][j]
+            fmadd.s   fa2, fa1, fa0, fa2 // fa2 += fa1 * fa0
+        .matrix_mul_loop: add       t3, t3, t5 // next row of B, same column
+
+        fsw     fa2, 0(a2) // store the finished element of C
+        addi    a2, a2, 4 // increment a2 for next time
+        // check loop 2
+        addi  t1, t1, 1 // Increment loop2 counter
+        blt   t1, a5, .dpf_loop2
+
+    // check loop 1
+    add   a0, a0, t4      // A += (A_step << 2)
+    add   a2, a2, a6      // output += (C_padding << 2); a2 already moved k floats in loop2
+
+    add   a7, a7, 1 // Increment loop1 counter
+    blt   a7, a3, .dpf_loop1
+
+    // Exit
+    li  a0, 0       // return status ESP_OK
+    add sp,sp,16 // release the 16 bytes reserved on entry
+    ret
+
+#endif //dspm_mult_ex_f32_arp4_enabled
@@ -0,0 +1,104 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspm_mult_platform.h"
+#if (dspm_mult_f32_ae32_enabled == 1)
+
+#include "dsps_dotprode_f32_m_ae32.S"
+
+// This is a matrix multiplication function for the ESP32 processor.
+	.text
+	.align  4
+	.global dspm_mult_f32_ae32
+	.global .dspm_mult_f32_ae32_body
+	.type   dspm_mult_f32_ae32,@function
+// The function implements the following C code:
+// esp_err_t dspm_mult_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
+// {
+	// for (int i=0 ; i< m ; i++)
+	// {
+	//     for (int j=0 ; j< k ; j++)
+	//     {
+	//         C[i*k + j] = A[i*n]*B[j];
+	//         for (int s=1; s< n ; s++)
+	//         {
+	//             C[i*k + j] += A[i*n + s]*B[s*k + j];
+	//         }
+	//     }
+	// }
+//     return ESP_OK;
+// }
+
+dspm_mult_f32_ae32: // Dense matrix multiply C(m,k) = A(m,n) * B(n,k) on floats; always returns 0 in a2
+// A - a2
+// B - a3
+// C - a4
+// m - a5
+// n - a6
+// k - a7
+
+// a8  = n*4, byte step between rows of A
+// a10 = 4, byte step between consecutive floats of a row of A
+// a9  - counter loop1: 0..m
+// a11 - counter loop2: 0..k
+// a12 - A pointer handed to the dot-product macro
+// a13 - B pointer handed to the dot-product macro
+// a4  - C, running output pointer
+
+	entry	a1, 16
+	// Exported entry into the body: dspm_mult_f32_aes3 jumps here after executing its own "entry"
+.dspm_mult_f32_ae32_body:
+	slli    a8, a6, 2 // Pointer increment for A
+	slli    a15,a7, 2 // Pointer increment for B
+
+	movi.n	a14, 0 // Initial state of accumulator f1
+	movi.n	a10, 4 // Increment = 4
+	movi.n	a9, 0  // counter loop1
+
+.dpf_loop1:    
+	movi.n	a11, 0 // reset counter for loop2
+.dpf_loop2:
+
+	// One output element: dot product of row i of A with column j of B
+	// a2 - start of row i of A
+	// a3 - B
+	// a6 - n
+	// a10 - step == 4 bytes
+	// a15 - step k*4 (byte step between rows of B)
+	mov      a12, a2 // a12 = start of row i of A
+
+	slli     a13, a11, 2 // loop count to pointer value
+	add.n    a13, a3, a13 // a13 = B + j * 4 (top of column j)
+
+	wfr	    f1, a14 // reset f1
+	// Calculating dotproduct... (macro from dsps_dotprode_f32_m_ae32.S, not visible here; presumably accumulates into f1 - TODO confirm)
+	dotprode_f32_ae32 a12, a13, a6, a10, a15;
+
+	ssi	    f1, a4, 0 // Store result from f1 to memory at a4
+	addi    a4, a4, 4 // increment a4 for next time
+
+	// check loop 2
+	addi  a11, a11, 1 // Increment loop2 counter
+	blt   a11, a7, .dpf_loop2
+
+	// check loop 1
+	add.n   a2, a2, a8 // Increment A, A = A[i*n]
+
+	addi  a9, a9, 1 // Increment loop1 counter
+	blt   a9, a5, .dpf_loop1
+
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+#endif //dspm_mult_f32_ae32_enabled
@@ -0,0 +1,150 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspm_mult_platform.h"
+
+#if (dspm_mult_f32_aes3_enabled == 1)
+
+
+// This is a matrix multiplication function for the ESP32-S3 processor.
+	.text
+	.align  4
+	.global dspm_mult_f32_aes3
+	.global .dspm_mult_f32_ae32_body
+	.type   dspm_mult_f32_aes3,@function
+// The function implements the following C code:
+// esp_err_t dspm_mult_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
+// {
+	// for (int i=0 ; i< m ; i++)
+	// {
+	//     for (int j=0 ; j< k ; j++)
+	//     {
+	//         C[i*k + j] = A[i*n]*B[j];
+	//         for (int s=1; s< n ; s++)
+	//         {
+	//             C[i*k + j] += A[i*n + s]*B[s*k + j];
+	//         }
+	//     }
+	// }
+//     return ESP_OK;
+// }
+
+dspm_mult_f32_aes3: // ESP32-S3 dense matrix multiply: 128-bit path when sizes/pointers allow, else the generic ESP32 body
+	entry	a1, 16
+// A - a2
+// B - a3
+// C - a4
+// m - a5
+// n - a6
+// k - a7
+
+	// Check if we can use S3 memory model: m, n, k multiples of 4 and A, B, C 16-byte aligned
+	or a12, a5, a6 // a12 = m OR n
+	or a12, a7, a12 // a12 = m OR n OR k
+	movi.n	a11, 3 // mask of the two low bits
+	and a12, a12, a11 // non-zero if any dimension is not a multiple of 4
+	movi.n   a11, 15 // mask of the four low bits
+	or       a10, a3, a2 // a10 = A pointer OR B pointer
+	or       a10, a10, a4 // a10 = A pointer OR B pointer OR C pointer
+	and		 a10, a10, a11 // non-zero if any pointer is not 16-byte aligned
+	or		 a12, a12, a10 // a12 = dimension check OR alignment check
+	beqz  a12, .s3_mmult // everything fits: take the 128-bit path
+	// Fall back to the generic ESP32 body
+	J 	.dspm_mult_f32_ae32_body
+
+.s3_mmult:
+// f0, f1, f2, f3 - multiplication result
+// f4, f5, f6, f7 - input for matrix B
+// f8, f9, f10,f11- input for matrix A
+	movi.n	a14, 0 // a14 = byte offset of the current 4-column block inside a row of B
+
+	slli     	a12, a7, 2		// a12 = K*4 - byte step between rows of B, also the end value for a15
+	slli     	a10, a7, 2		// a10 = K*4 - byte step between rows of C
+	srli	    a11, a6, 2		// a11 = n / 4
+	addi.n		a11, a11, -1 // inner loop count: n/4 - 1 (the first group of 4 is handled before the loop)
+
+	movi.n		a15, 0 // a15 = column-block counter in bytes: 0, 16, ... up to K*4
+	mov	 a13, a3 // backup B pointer
+	mov  a7, a4 // backup C pointer (a7 no longer holds k)
+
+.loop_x_aes3:
+	movi.n		a9, 0 // reset row counter
+	mov      	a8,  a2		// a8 = start of matrix A
+	.loop_y_aes3:
+		add	 a13, a3, a14		// a13 = B + byte offset of the current 4-column block (a14)
+		EE.LDF.128.IP f11, f10, f9, f8, a8, 16  // Load A values: X11, X12, X13, X14
+		EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y11, Y12, Y13, Y14
+		mul.s	f0, f4, f8		// f0 = X11*Y11
+		mul.s	f1, f5, f8		// f1 = X11*Y12
+		mul.s	f2, f6, f8		// f2 = X11*Y13
+		mul.s	f3, f7, f8		// f3 = X11*Y14
+
+		EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y21, Y22, Y23, Y24
+		madd.s	f0, f4, f9		// f0 = X11*Y11 + X12*Y21
+		madd.s	f1, f5, f9		// f1 = X11*Y12 + X12*Y22
+		madd.s	f2, f6, f9		// f2 = X11*Y13 + X12*Y23
+		madd.s	f3, f7, f9		// f3 = X11*Y14 + X12*Y24
+
+		EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y31, Y32, Y33, Y34
+		madd.s	f0, f4, f10		// f0 = X11*Y11 + X12*Y21 + X13*Y31
+		madd.s	f1, f5, f10		// f1 = X11*Y12 + X12*Y22 + X13*Y32
+		madd.s	f2, f6, f10		// f2 = X11*Y13 + X12*Y23 + X13*Y33
+		madd.s	f3, f7, f10		// f3 = X11*Y14 + X12*Y24 + X13*Y34
+
+		EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y41, Y42, Y43, Y44
+		madd.s	f0, f4, f11		// f0 = X11*Y11 + X12*Y21 + X13*Y31 + X14*Y41
+		madd.s	f1, f5, f11		// f1 = X11*Y12 + X12*Y22 + X13*Y32 + X14*Y42
+		madd.s	f2, f6, f11		// f2 = X11*Y13 + X12*Y23 + X13*Y33 + X14*Y43
+		madd.s	f3, f7, f11		// f3 = X11*Y14 + X12*Y24 + X13*Y34 + X14*Y44
+		
+		loopnez a11, .loop_end_m_aes3
+			EE.LDF.128.IP f11, f10, f9, f8, a8, 16  // Load A values: X15, X16, X17, X18
+
+			EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y51, Y52, Y53, Y54
+			madd.s	f0, f4, f8		// f0 += X15*Y51
+			madd.s	f1, f5, f8		// f1 += X15*Y52
+			madd.s	f2, f6, f8		// f2 += X15*Y53
+			madd.s	f3, f7, f8		// f3 += X15*Y54
+
+			EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y61, Y62, Y63, Y64
+			madd.s	f0, f4, f9		// f0 += X16*Y61
+			madd.s	f1, f5, f9		// f1 += X16*Y62
+			madd.s	f2, f6, f9		// f2 += X16*Y63
+			madd.s	f3, f7, f9		// f3 += X16*Y64
+
+			EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y71, Y72, Y73, Y74
+			madd.s	f0, f4, f10		// f0 += X17*Y71
+			madd.s	f1, f5, f10		// f1 += X17*Y72
+			madd.s	f2, f6, f10		// f2 += X17*Y73
+			madd.s	f3, f7, f10		// f3 += X17*Y74
+
+			EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y81, Y82, Y83, Y84
+			madd.s	f0, f4, f11		// f0 += X18*Y81
+			madd.s	f1, f5, f11		// f1 += X18*Y82
+			madd.s	f2, f6, f11		// f2 += X18*Y83
+			madd.s	f3, f7, f11		// f3 += X18*Y84
+		.loop_end_m_aes3:
+		EE.STF.128.XP f3, f2, f1, f0, a4, a10 // Store 4 results, then step a4 by a10 to the next row of C
+
+		addi  a9, a9, 1 // Increment row counter
+	blt   a9, a5, .loop_y_aes3
+	addi.n  a7, a7, 16 // C: move to the next block of 4 columns
+	mov		a4, a7 // restart the output pointer at the top of that block
+	addi.n  a14, a14, 16			// B: shift by 4 columns (16 bytes)
+	addi  a15, a15, 16 // Increment column-block counter by 16 bytes
+blt   a15, a12, .loop_x_aes3
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+#endif //dspm_mult_f32_aes3_enabled
@@ -0,0 +1,33 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "dsps_dotprod.h"
+#include "dspm_mult.h"
+
+// Matrix A(m,n): m - number of rows, n - number of columns
+// C(m,k) = A(m,n)*B(n,k)
+// c(i,j) = sum(a(i,s)*b(s,j)) , s=1..n
+esp_err_t dspm_mult_f32_ansi(const float *A, const float *B, float *C, int m, int n, int k)
+{
+    // Reference multiply for dense row-major matrices: C(m,k) = A(m,n) * B(n,k).
+    // No argument validation is performed here.
+    for (int row = 0; row < m; ++row) {
+        for (int col = 0; col < k; ++col) {
+            const float *a_row = A + row * n;   // start of row `row` of A
+            const float *b_col = B + col;       // top of column `col` of B
+            float *dst = C + row * k + col;     // output cell C[row][col]
+
+            // The first product initialises the cell, the rest accumulate into it.
+            *dst = a_row[0] * b_col[0];
+            int inner = 1;
+            while (inner < n) {
+                *dst += a_row[inner] * b_col[inner * k];
+                ++inner;
+            }
+        }
+    }
+    return ESP_OK;
+}
@@ -0,0 +1,109 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspm_mult_platform.h"
+#if (dspm_mult_f32_arp4_enabled == 1)
+
+// This is a matrix multiplication function for the RISC-V "arp4" target (presumably ESP32-P4).
+    .text
+    .align  4
+    .global dspm_mult_f32_arp4
+    .global .dspm_mult_f32_arp4_body
+    .type   dspm_mult_f32_arp4,@function
+// The function implements the following C code:
+// esp_err_t dspm_mult_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
+// {
+    // for (int i=0 ; i< m ; i++)
+    // {
+    //     for (int j=0 ; j< k ; j++)
+    //     {
+    //         C[i*k + j] = A[i*n]*B[j];
+    //         for (int s=1; s< n ; s++)
+    //         {
+    //             C[i*k + j] += A[i*n + s]*B[s*k + j];
+    //         }
+    //     }
+    // }
+//     return ESP_OK;
+// }
+
+dspm_mult_f32_arp4: 
+// Dense matrix multiply C(m,k) = A(m,n) * B(n,k) on single-precision floats,
+// matrices stored row by row as in the C reference shown above.
+// Always returns ESP_OK (0) in a0.
+//
+// Arguments (RISC-V register, Xtensa register of the ae32 version in brackets):
+// A - a0 [a2]
+// B - a1 [a3]
+// C - a2 [a4], used as the running output pointer
+// m - a3 [a5]
+// n - a4 [a6]
+// k - a5 [a7]
+//
+// Scratch registers:
+// a6 - n*4, byte step between rows of A
+// t0 - 4, byte step between consecutive floats of a row of A
+// a7 - counter loop1: 0..m
+// t1 - counter loop2: 0..k
+// t2 - running pointer into the current row of A
+// t3 - running pointer into the current column of B
+// t5 - k*4, byte step between rows of B
+// fa0, fa1 - operands, fa2 - accumulator
+
+    add sp,sp,-16             // reserve 16 bytes of stack (nothing is stored there in this routine)
+    // Exported body label; execution simply falls through into it
+.dspm_mult_f32_arp4_body:
+    slli    a6, a4, 2         // a6 = n*4: pointer increment for A
+    slli    t5, a5, 2         // t5 = k*4: pointer increment for B
+
+    li      t0, 4             // Increment = 4
+    li      a7, 0             // counter loop1
+
+.dpf_loop1:
+    li      t1, 0             // reset counter for loop2
+.dpf_loop2:
+
+        // One output element: dot product of row i of A with column j of B
+        mv      t2, a0        // t2 = start of row i of A
+
+        slli    t3, t1, 2     // loop count to pointer value
+        add     t3, a1, t3    // t3 = B + j*4 (top of column j)
+
+        fmv.w.x fa2, zero     // reset the accumulator
+        // Hardware loop, a4 (n) passes; presumably the labelled instruction is
+        // the last one of the loop body - confirm against the hardware-loop docs
+        esp.lp.setup    0, a4, .matrix_mul_loop
+            flw     fa0, 0(t2)            // fa0 = A[i][s]
+            add     t2, t2, t0            // next element of the row of A
+            flw     fa1, 0(t3)            // fa1 = B[s][j]
+            fmadd.s   fa2, fa1, fa0, fa2  // fa2 += fa1 * fa0
+        .matrix_mul_loop: add       t3, t3, t5  // next row of B, same column
+
+        fsw     fa2, 0(a2)    // store the finished element of C
+        addi    a2, a2, 4     // increment a2 for next time
+        // check loop 2
+        addi    t1, t1, 1     // Increment loop2 counter
+        blt     t1, a5, .dpf_loop2
+
+    // check loop 1
+    add     a0, a0, a6        // Increment A, A = A[i*n]
+
+    add     a7, a7, 1         // Increment loop1 counter
+    blt     a7, a3, .dpf_loop1
+
+    // Exit
+    // a6 holds n*4 at this point, so it must not be used as the status value.
+    li      a0, 0             // return status ESP_OK
+    add sp,sp,16              // release the 16 bytes reserved on entry
+    ret
+
+#endif //dspm_mult_f32_arp4_enabled