add some code

2025-09-05 13:25:11 +08:00
parent 9ff0a99e7a
commit 3cf1229a85
8911 changed files with 2535396 additions and 0 deletions
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/fixed/dspm_mult_s16_ae32.S
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/fixed/dspm_mult_s16_ae32.S
@@ -0,0 +1,174 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspm_mult_platform.h"
+#if (dspm_mult_s16_ae32_enabled == 1)
+
+#include "dsps_dotprod_s16_m_ae32.S"
+#include "dspm_mult_s16_m_ae32_vector.S"
+//esp_err_t dspm_mult_s16_ae32(const int16_t* A, const int16_t* B, int16_t* C, int m, int n, int k, int shift);
+
+// This is the matrix multiplication function for the ESP32 processor.
+	.text
+	.align  4
+	.global dspm_mult_s16_ae32
+	.global .dspm_mult_s16_ae32_body
+	.type   dspm_mult_s16_ae32,@function
+
+dspm_mult_s16_ae32: 
+// int16 fixed-point matrix multiply: C(m,k) = A(m,n) * B(n,k).
+// Every output element is accumulated in the MAC16 accumulator, which is
+// preloaded with a rounding value and shifted right by (15 - shift) on
+// read-back. Only the low 16 bits of the shifted value are stored (s16i);
+// no saturation is performed here.
+// Returns ESP_OK (0) in a2.
+//
+// A - a2
+// B - a3
+// C - a4
+// m - a5 - any > 0
+// n - a6 - 1,2,3, any
+// k - a7 - 1, any
+// shift - stack (a8) 
+
+// a14 - n*2 - size of one row of A in bytes (pointer increment per row)
+// a15 - 0x7fff >> shift - rounding value written to acclo before each dot product
+	entry	a1, 80
+// ======     common body; dspm_mult_s16_aes3 also jumps to this label   ============
+.dspm_mult_s16_ae32_body:
+	l32i.n	a8, a1, 80 // Load shift to the a8 register
+	
+
+	// Prepare and load round value
+	ssr a8 // SAR = shift
+	movi a15, 0x7fff
+	srl a15, a15 // a15 = 0x7fff >> shift
+
+	neg  a8, a8 
+	addi a8, a8, 15 // a8 = 15 - shift
+	ssr a8 // SAR = 15 - shift; consumed by every "src" below
+	movi a8, 0  // Clear a8 
+
+	slli    a14, a6, 1 // a14 = n * 2: byte step between rows of A
+	movi.n	a10, 2 // Increment = 2
+	movi.n	a9, 0  // initial counter loop1 (row counter of the k == 1 path)
+
+	movi     a12, 1
+	beq      a7, a12, vector_mult // k == 1: matrix-by-vector path
+	// We have normal path with k > 1
+	// a2, a3, a4 - A,B,C
+	// a5 - m
+	// a6 - n
+	// a7 - k
+	// a8 - temp
+	// a9 - temp
+	// a10- k counter
+	// a11- m counter
+	// a12- pointer into the current row of A
+	// a13- pointer into the current column of B
+	// a14 - pointer increment for n
+	// a15 - round value
+
+	// bbsi branches when bit 0 of n is set, i.e. when n is ODD.
+	// NOTE(review): the label name "even_N_samples" is therefore misleading:
+	// it is the one-element-per-iteration loop used for odd n, while the
+	// fall-through code below consumes two elements per iteration (even n).
+	bbsi  a6, 0, even_N_samples
+//  ----------------  n is even: two elements of A per inner iteration
+	srli    a6, a6, 1 // counter a6 = a6/2. We have to do it only once
+	slli    a7, a7, 1 // counter a7 = a7*2. We have to do it only once
+	
+	// loop for M
+m_loop_mmult:
+	movi    a10, 0  // reset k loop counter
+	mov     a13, a3 // set pointer to the first column
+// loop for K
+k_loop_mmult:
+
+		// ldinc is used as "advance by 4, then load", hence the -4 bias
+		addi     a12, a2, -4 // every loop the same start position
+
+		movi    a8, 0
+		wsr     a8, acchi
+		wsr     a15, acclo // initialize acc with shifted round value
+
+		loopnez a6, .loop_end_mmult // loop for N (n/2 iterations)
+		.loop_mmult:
+			ldinc       m3, a12         // m3 = two consecutive int16 of the A row
+			l16si       a8, a13, 0      // a8 = B[s][j]
+			add         a13, a13, a7    // next row of B (k*2 bytes down)
+			mula.ad.ll  a8, m3          // acc += low16(a8) * low16(m3)
+			l16si       a8, a13, 0      // a8 = B[s+1][j]
+			add         a13, a13, a7            
+			mula.ad.lh  a8, m3          // acc += low16(a8) * high16(m3)
+		.loop_end_mmult:
+
+		rsr     a8, acchi
+		rsr     a9, acclo
+		src     a8, a8, a9        // a8 = (acchi:acclo) >> (15 - shift)
+		s16i	a8, a4, 0         // store the low 16 bits as C[i][j]
+		addi    a4, a4, 2
+		// check and increment for K
+		
+		addi    a10, a10, 2     // a10 = byte offset of the next column
+		add     a13, a3, a10 // we shift column
+		bne     a10, a7, k_loop_mmult // a7 holds k*2, so this runs k times
+
+		// Check and increment for M
+		add     a2, a2, a14 // move to the next row
+		addi    a5, a5, -1
+		bnez.n  a5, m_loop_mmult
+
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+even_N_samples:
+//  ----------------  n is odd (see the note at the bbsi above): one element per iteration
+	slli    a7, a7, 1 // counter a7 = a7*2. We have to do it only once
+	
+	// loop for M
+m_loop_mmult_even:
+	movi    a10, 0  // reset k loop counter
+	mov     a13, a3 // set pointer to the first column
+// loop for K
+k_loop_mmult_even:
+
+		mov     a12, a2     // every loop the same start position
+
+		movi    a8, 0
+		wsr     a8,  acchi
+		wsr     a15, acclo // initialize acc with shifted round value
+
+		loopnez a6, .loop_end_mmult_even // loop for N (n iterations)
+		.loop_mmult_even:
+			l16si       a9, a12, 0      // a9 = A[i][s]
+			l16si       a8, a13, 0      // a8 = B[s][j]
+			addi        a12, a12, 2     // next element in the row of A
+			add         a13, a13, a7    // next row of B
+			mula.aa.ll  a8, a9          // acc += low16(a8) * low16(a9)
+		.loop_end_mmult_even:
+
+		rsr     a8, acchi
+		rsr     a9, acclo
+		src     a8, a8, a9        // a8 = (acchi:acclo) >> (15 - shift)
+		s16i	a8, a4, 0         // store the low 16 bits as C[i][j]
+		addi    a4, a4, 2
+		// check and increment for K
+		
+		addi    a10, a10, 2     // a10 = byte offset of the next column
+		add     a13, a3, a10 // we shift column
+		bne     a10, a7, k_loop_mmult_even // a7 holds k*2, so this runs k times
+
+		// Check and increment for M
+		add     a2, a2, a14 // move to the next row
+		addi    a5, a5, -1
+		bnez.n  a5, m_loop_mmult_even
+
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+// The path for k == 1 (B is a column vector); reached from the beq above
+vector_mult:
+	dspm_mult_s16_m_ae32_vector;
+
+#endif // dspm_mult_s16_ae32_enabled
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/fixed/dspm_mult_s16_aes3.S
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/fixed/dspm_mult_s16_aes3.S
@@ -0,0 +1,142 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspm_mult_platform.h"
+#if (dspm_mult_s16_aes3_enabled == 1)
+#include "dsps_dotprod_s16_m_ae32.S"
+#include "dspm_mult_s16_m_ae32_vector.S"
+
+//esp_err_t dspm_mult_s16_aes3(const int16_t* A, const int16_t* B, int16_t* C, int m, int n, int k, int shift);
+
+// This is the matrix multiplication function for the ESP32-S3 processor.
+	.text
+	.align	4
+	.literal_position
+	.literal	.LC0_1_38, 32767
+	.literal	.LC1_1_39, 16383
+
+	.global  dspm_mult_s16_aes3
+	.global .dspm_mult_s16_ae32_body
+	.type    dspm_mult_s16_aes3,@function
+
+dspm_mult_s16_aes3: 
+// int16 matrix multiply C(m,k) = A(m,n) * B(n,k) that produces eight
+// output columns at a time with the ee.* vector instructions.
+// In: a2 = A, a3 = B, a4 = C, a5 = m, a6 = n, a7 = k, shift at [a1 + 80].
+// If k is not a multiple of 8 the work is handed to the ESP32 routine by
+// jumping to .dspm_mult_s16_ae32_body. Returns 0 (ESP_OK) in a2.
+// NOTE(review): the 128-bit loads/stores presumably need B and C to be
+// 16-byte aligned; nothing here checks that - confirm against the callers.
+//
+// Stack frame use (offsets from a1):
+//    0..15 - rounding value replicated into eight int16 lanes
+//    32    - spilled B pointer
+//    36    - n*2, byte size of one row of A
+//    40    - k/8, number of 8-column blocks per row
+//    44    - row counter
+
+	entry	a1,80                   	#  
+
+	movi.n	a10, 7
+	and a10, a10, a7                // a10 = k & 7
+	beqz  a10, .dspm_mult_s16_aes3_body
+	// k is not a multiple of 8: continue inside the ESP32 function
+	// (jumps past its "entry", relying on the frame set up above)
+	J 	.dspm_mult_s16_ae32_body
+
+.dspm_mult_s16_aes3_body:
+	mov.n	a10,a4                  	# [0]  
+	mov.n	a11,a5                  	# [1]  
+	l32i	a5,a1,80                 	# [2]  id:77 shift+0x0
+	s32i.n	a3,a1,32               	# [3]  gra_spill_temp_0
+	
+	// From here: a10 = C write pointer, a11 = m, a5 = shift
+	bltz	a5,.Lt_0_6146            	# [4]  
+
+	// shift >= 0: rounding value = 32767 >> shift
+#.LBB3_dspm_mult_s16_aes3:	# 0x13
+	l32r	a9,.LC0_1_38             	# [0]  
+	ssr	a5                        	# [1]  
+	sra	a9,a9                     	# [2]  
+
+	// Replicate the rounding value into eight int16 slots at a1+0..a1+14
+.LBB23_dspm_mult_s16_aes3:	# 0x1c
+	s16i	a9,a1,0                  	# [0]  id:78 round_data_64+0x0
+	s16i	a9,a1,2                  	# [1]  id:78 round_data_64+0x0
+	s16i	a9,a1,4                  	# [2]  id:78 round_data_64+0x0
+	s16i	a9,a1,6                  	# [3]  id:78 round_data_64+0x0
+	s16i	a9,a1,8                  	# [4]  id:78 round_data_64+0x0
+	s16i	a9,a1,10                 	# [5]  id:78 round_data_64+0x0
+	s16i	a9,a1,12                 	# [6]  id:78 round_data_64+0x0
+	s16i	a9,a1,14                 	# [7]  id:78 round_data_64+0x0
+
+	// Nothing to do when m < 1
+	blti	a11,1,.Lt_0_7426         	# [0]  
+
+	// Loop set-up:
+	//   a13 = current row of A          a4  = k*2 (byte step between rows of B)
+	//   a12 = address of rounding data  a14 = B + (k/8)*16 (end of the column blocks)
+	//   a15 = 15 - shift (final shift)  a6  = n (unchanged)
+	mov.n	a13,a2                  	# [0]  
+	slli	a4,a7,1                  	# [1]  
+	mov.n	a12,a1                  	# [2]  
+	l32i.n	a14,a1,32              	# [3]  gra_spill_temp_0
+	movi.n	a15,15                 	# [4]  
+	movi.n	a8,0                   	# [5]  
+	slli	a9,a6,1                  	# [6]  
+	s32i.n	a9,a1,36               	# [7]  gra_spill_temp_1
+	s32i.n	a8,a1,44               	# [8]  gra_spill_temp_3
+	sub	a15,a15,a5                	# [9]  
+	// a8 = k / 8 with rounding toward zero: (k >= 0 ? k : k + 7) >> 3
+	addi.n	a8,a7,7                	# [10]  
+	movgez	a8,a7,a7               	# [11]  
+	srai	a8,a8,3                  	# [12]  
+	s32i.n	a8,a1,40               	# [13]  gra_spill_temp_2
+	slli	a8,a8,4                  	# [14]  
+	add.n	a14,a14,a8              	# [15]  
+
+	// ---- per-row loop ----
+.Lt_0_7938:	# 0x5d
+	l32i.n	a8,a1,40               	# [0]  gra_spill_temp_2
+	beqz.n	a8,.Lt_0_8194          	# [2]  
+
+	l32i.n	a7,a1,32               	# [0]  gra_spill_temp_0
+	mov.n	a2,a13                  	# [1]  
+
+	// ---- per-8-column block loop: a7 walks B in 16-byte steps ----
+.Lt_0_8706:	# 0x65
+	// Accumulator is seeded from the rounding data; q1 gets the first
+	// element of the A row (broadcast), q0 the first 8 values of B.
+	ee.ldqa.u16.128.ip	a12,0      	# [0]  id:80
+	ee.vldbc.16.ip	q1,a2,2        	# [1]  id:79
+	mov.n	a3,a7                   	# [2]  
+	ee.vld.128.xp	q0,a3,a4        	# [3]  id:81
+	addi	a7,a7,16                 	# [4]  
+	blti	a6,1,.Lt_0_8962          	# [5]  
+
+	// n is consumed two elements per hardware-loop iteration; an odd n
+	// is handled by peeling one multiply-accumulate first.
+	srai	a5,a6,1                  	# [0]  
+	bbci	a6,0,.LBB68_dspm_mult_s16_aes3 	# [1]  
+
+	ee.vmulas.s16.qacc.ldbc.incp	q1,a2,q0,q1 	# [0]  id:82
+	ee.vld.128.xp	q0,a3,a4        	# [1]  id:83
+
+.LBB68_dspm_mult_s16_aes3:	# 0x82
+	loopgtz	a5,.LBB74_dspm_mult_s16_aes3 	# [0]  
+
+	// Each multiply-accumulate also fetches the next A element into q1;
+	// q0/q2 alternate as the B-row register so loads overlap the MACs.
+.LBB64_dspm_mult_s16_aes3:	# 0x85
+	ee.vld.128.xp	q2,a3,a4        	# [0*II+0]  id:83
+	ee.vmulas.s16.qacc.ldbc.incp	q1,a2,q0,q1 	# [0*II+1]  id:82
+	ee.vld.128.xp	q0,a3,a4        	# [0*II+2]  id:83
+	ee.vmulas.s16.qacc.ldbc.incp	q1,a2,q2,q1 	# [0*II+3]  id:82
+
+.LBB74_dspm_mult_s16_aes3:	# 0x91
+
+	// Shift the accumulator by a15 (15 - shift), pack to int16 and store
+	// eight results of C; a2 is rewound to the start of the A row.
+.Lt_0_8962:	# 0x91
+	mov.n	a2,a13                  	# [0]  
+	ee.srcmb.s16.qacc	q0,a15,1    	# [1]  
+	ee.vst.128.ip	q0,a10,16       	# [2]  id:85
+	bne	a7,a14,.Lt_0_8706         	# [3]  
+
+	// ---- end of row: advance A by n*2 bytes and bump the row counter ----
+.Lt_0_8194:	# 0x9c
+	l32i.n	a8,a1,36               	# [0]  gra_spill_temp_1
+	l32i.n	a9,a1,44               	# [1]  gra_spill_temp_3
+	add.n	a13,a13,a8              	# [2]  
+	addi.n	a9,a9,1                	# [3]  
+	s32i.n	a9,a1,44               	# [4]  gra_spill_temp_3
+	bne	a11,a9,.Lt_0_7938         	# [5]  
+
+.Lt_0_7426:	# 0xa9
+	movi.n	a2,0                   	# [0]  
+	retw.n                        	# [1]  
+
+	// shift < 0: rounding value is derived from 16383 instead of 32767
+.Lt_0_6146:	# 0xad
+	l32r	a9,.LC1_1_39             	# [0]  
+	ssr	a5                        	# [1]  
+	sra	a9,a9                     	# [2]  
+	j	.LBB23_dspm_mult_s16_aes3   	# [3] 
+
+
+#endif // dspm_mult_s16_aes3_enabled
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/fixed/dspm_mult_s16_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/fixed/dspm_mult_s16_ansi.c
@@ -0,0 +1,40 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dsps_dotprod.h"
+#include "dspm_mult.h"
+
+// Matrix A(m,n): m - number of rows, n - number of columns
+// C(m,k) = A(m,n)*B(n,k)
+// c(i,j) = sum(a(i,s)*b(s,j)) , s=1..n
+esp_err_t dspm_mult_s16_ansi(const int16_t *A, const int16_t *B, int16_t *C, int m, int n, int k, int shift)
+{
+    // final_shift > 0 selects a left shift of the accumulator; otherwise
+    // the accumulator is shifted right by (15 - shift).
+    const int final_shift = shift - 15;
+
+    for (int row = 0; row < m; row++) {
+        const int a_base = row * n;    // index of A[row][0]
+        const int c_base = row * k;    // index of C[row][0]
+
+        for (int col = 0; col < k; col++) {
+            // Start from the rounding term, then add the n products of
+            // row `row` of A with column `col` of B.
+            long long acc = 0x7fff >> shift;
+            for (int s = 0; s < n; s++) {
+                acc += (int32_t)A[a_base + s] * (int32_t)B[s * k + col];
+            }
+
+            // Scale and truncate to int16 on assignment.
+            C[c_base + col] = (final_shift > 0) ? (acc << final_shift)
+                              : (acc >> (-final_shift));
+        }
+    }
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/fixed/dspm_mult_s16_arp4.S
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/fixed/dspm_mult_s16_arp4.S
@@ -0,0 +1,121 @@
+// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspm_mult_platform.h"
+#if (dspm_mult_s16_arp4_enabled == 1)
+
+// This is the matrix multiplication function for the RISC-V processor core.
+    .text
+    .align  4
+    .global dspm_mult_s16_arp4
+    .global dspm_mult_s16_ansi  
+    .global .dspm_mult_s16_arp4_body
+    .type   dspm_mult_s16_arp4,@function
+// The function implements the following C code:
+// esp_err_t dspm_mult_s16_ansi(const int16_t *A, const int16_t *B, int16_t *C, int m, int n, int k, int shift)
+// {
+//    int final_shift = shift - 15;
+//    for (int i = 0 ; i < m ; i++) {
+//        for (int j = 0 ; j < k ; j++) {
+//            // This code also could be used
+//            //dsps_dotprode_f32_ae32(&A[i*n],&B[j],&C[i*k + j],n,1,n);
+//            long long acc = 0x7fff >> shift;
+//            for (int s = 0; s < n ; s++) {
+//                acc += (int32_t)A[i * n + s] * (int32_t)B[s * k + j];
+//            }
+//            if (final_shift > 0) {
+//                C[i * k + j] = (acc << final_shift);
+//            } else {
+//                C[i * k + j] = (acc >> (-final_shift));
+//            }
+//        }
+//    }
+//     return ESP_OK;
+// }
+
+dspm_mult_s16_arp4: 
+// int16 matrix multiply C(m,k) = A(m,n) * B(n,k) using the esp.* vector
+// instructions, eight output columns per inner pass.
+// When m, n and k are not all multiples of 8 the call is forwarded
+// (tail jump, arguments untouched) to dspm_mult_s16_ansi.
+// Returns ESP_OK (0) in a0.
+//
+// A - a0
+// B - a1
+// C - a2
+// m - a3
+// n - a4
+// k - a5
+// shift - a6
+
+// a7 - counter loop1: 0..m
+// t1 - counter loop2: 0..k (step 8)
+// t0 - hardware loop count: n
+// x25(s9) - matrix step for input2 (k*2 bytes, one row of B)
+// x24(s8) - pointer to current B
+// x29(t4) - pointer to initial B (start of the current 8-column block)
+// x30(t5) - pointer to A
+// x31(t6) = 2 for increment....
+// x26(s10)- final_shift
+
+    // Vector path only if (m | n | k) has its three low bits clear
+    or      t0, a3, a4
+    or      t0, t0, a5
+    andi    t0, t0, 0x7
+    beqz    t0, .dspm_mult_s16_arp4_body
+    j   dspm_mult_s16_ansi
+    //ret
+
+.dspm_mult_s16_arp4_body:
+    // s8..s10 are callee-saved: keep them on the stack while in use
+    add sp,sp,-16
+    sw  s8, 4(sp)
+    sw  s9, 8(sp)
+    sw  s10, 12(sp)
+    mv      t0, a4
+    li      a7, 0  // counter loop1
+    slli    x25, a5, 1 // step = step*2
+    li      x31, 2
+    // final_shift = shift - 15
+    // NOTE(review): the accumulator below starts at zero, whereas the C
+    // reference starts from the rounding term 0x7fff >> shift - confirm
+    // that the difference is intended.
+    add     x26, a6, -15
+
+.dpf_loop1: // loop for m
+    li      t1, 0 // reset counter for loop2
+    mv      x29, a1
+.dpf_loop2: // loop for k
+        mv  x30, a0
+        mv  x24, x29        // load B
+        // Calculating dotproduct...
+        esp.zero.qacc                       // qacc = 0;
+        esp.vldbc.16.xp     q0, x30, x31    // q0 = a[mx..mx]
+        esp.vld.128.xp      q1, x24, x25    // q1 = b[x0..x7],
+        // Hardware loop, t0 (= n) iterations: each pass accumulates
+        // q0 * q1 and fetches the next A element and the next B row.
+        esp.lp.setup    0, t0, .matrix_mul_loop
+            esp.vmulas.s16.qacc.ldbc.incp   q0,x30,     q0,q1
+        .matrix_mul_loop:   esp.vld.128.xp  q1,x24,x25
+            
+        esp.srcmb.s16.qacc  q2, x26, 0          //   q2 = qacc >> shift
+        esp.vst.128.ip      q2, a2, 16          //  save k0..k7
+        add     x29,x29, 16
+
+        // check loop 2
+        addi  t1, t1, 8 // Increment loop2 counter
+        blt   t1, a5, .dpf_loop2
+    // x30 was advanced n+1 times (initial load plus one per MAC); step back
+    // one element so that a0 points at the next row of A.
+    add   x30, x30, -2
+    mv    a0, x30   // 
+
+    // check loop 1
+    add   a7, a7, 1 // Increment loop1 counter
+    blt   a7, a3, .dpf_loop1
+
+    // Exit
+    // Return ESP_OK explicitly. The previous "mv a0, a6" handed the
+    // caller's shift argument back as the status code.
+    li  a0, 0       // return status ESP_OK
+    lw  s10, 12(sp)
+    lw  s9, 8(sp)
+    lw  s8, 4(sp)
+    add sp,sp,16
+    ret
+
+#endif //dspm_mult_s16_arp4_enabled
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/fixed/dspm_mult_s16_m_ae32.S
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/fixed/dspm_mult_s16_m_ae32.S
@@ -0,0 +1,58 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+
+.macro dspm_mult_s16_ae32_MxNxN
+// Macro body for an int16 matrix multiply on ESP32; expects the register
+// assignment below and ends the enclosing function with retw.n.
+// NOTE(review): this looks like an unfinished stub - see the notes on the
+// individual branches. The labels are not macro-local, so it can only be
+// expanded once per file.
+// A - a2
+// B - a3
+// C - a4
+// m - a5
+// n - a6
+// k - a7
+// shift - stack (a8) 
+
+	movi  a10, 4 // load 4 as a constant
+	// NOTE(review): the branch is taken for n < 4, which is the opposite of
+	// what the original remark ("if n >= 4 then acceleration is possible")
+	// suggests - confirm the intended condition.
+	blt   a6, a10, do_dotproduct 
+	// n >= 4 falls through here: nothing is computed, ESP_OK is returned.
+
+
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+do_dotproduct:
+
+		mov  a12, a2
+		mov  a13, a3
+		
+		srli a9, a6, 2  // a9 - count/4 - 1
+		addi a9, a9, -1
+
+		movi.n	a10, 0 // load 0 to the a10 to increment second array
+		// dotprod_s16_ae32_full is defined outside this file; its operand
+		// meaning and clobbers are not visible here.
+		dotprod_s16_ae32_full a12, a13, a9, a10, a6
+
+		/* Get accumulator */
+		// NOTE(review): SAR is loaded from n (a6) here, not from a value
+		// derived from shift - confirm.
+		ssr a6
+		rsr a2, acchi
+		rsr a3, acclo
+		src a2, a2, a3
+		
+		s16i	a2, a4, 0 // store a single int16 result to C
+		movi.n	a2, 0
+
+
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+.endm // dspm_mult_s16_ae32_MxNxN
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/fixed/dspm_mult_s16_m_ae32_vector.S
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/fixed/dspm_mult_s16_m_ae32_vector.S
@@ -0,0 +1,105 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+
+.macro dspm_mult_s16_m_ae32_vector
+// Matrix-by-vector part of the int16 multiply (used for k == 1):
+// C[i] = (round + sum_s A[i][s] * B[s]) >> SAR, one int16 per row of A.
+// The macro returns from the enclosing function (retw.n) with a2 = ESP_OK.
+//
+// State expected from the including function:
+//   a2, a3, a4 - A, B, C
+//   a8  - 0 (written to acchi)
+//   a9  - row counter, starting at 0
+//   a14 - n*2, byte size of one row of A
+//   a15 - rounding value for acclo
+//   SAR - shift amount used by src
+// m - a5 - any > 0
+// n - a6 - 1,2,3, any
+// k - a7 - 1, any (a7 is overwritten below and reused as a loop parameter)
+
+
+	// Define path for n < 4
+	movi a7, 4
+	blt  a6, a7, small_process_loop // jump for n < 4
+
+	srli a7, a6,  2
+	addi a7, a7, -1 // a7 = n/4 - 1, passed to the dot-product macro
+ 
+
+mmultv_loop1:
+	wsr a8, acchi
+	wsr a15, acclo // initialize acc with shifted round value
+
+	// Clear initial state of the result register
+	// a2 - A
+	// a3 - B
+	// a4 - C
+	// a6 - n
+	// a7 - n/4 - 1
+	// a8 - 0
+	// a15- 0x7fff>>shift
+
+		mov      a12, a2 // load A
+		mov      a13, a3 // Load B
+
+		// dotprod_s16_ae32_full is defined outside this file; it is assumed
+		// to leave the dot product of the row and B in the accumulator.
+		dotprod_s16_ae32_full a12, a13, a7, a6
+
+	// check loop 1
+		/* Get accumulator */
+		rsr a12, acchi
+		rsr a13, acclo
+		src a12, a12, a13 // shift (acchi:acclo) right by SAR
+		
+		s16i	a12, a4, 0 // store the low 16 bits as C[i]
+		addi    a4, a4, 2
+
+		add.n   a2, a2, a14 // Increment A, A = A[i*n]
+		addi    a9, a9, 1 // Increment loop1 counter    
+	blt     a9, a5, mmultv_loop1
+
+
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+// n = 1, 2 or 3: no inner loop, the bits of n select the MACs to execute
+small_process_loop:
+
+	wsr a8, acchi
+	wsr a15, acclo // initialize acc with shifted round value
+
+	mov      a12, a2 // load A
+	mov      a13, a3 // Load B
+
+	// ldinc is used as "advance by 4, then load", hence the -4 bias
+	addi  a12, a12, -4 // To arrange first pointer
+	addi  a13, a13, -4 // To arrange first pointer
+
+		// bit 1 of n set (n = 2 or 3): multiply-accumulate one pair of elements
+		bbci  a6, 1, .mod2chk_short
+		ldinc m0, a12
+		ldinc m2, a13
+		mula.dd.hh m0, m2
+		mula.dd.ll m0, m2
+	.mod2chk_short:
+		// bit 0 of n set (n = 1 or 3): one more element from the next word
+		bbci  a6, 0, .mod1chk_short
+		ldinc m0, a12
+		ldinc m2, a13
+		mula.dd.ll m0, m2
+	.mod1chk_short:
+
+	// check loop 1
+		/* Get accumulator */
+		rsr a12, acchi
+		rsr a13, acclo
+		src a12, a12, a13 // shift (acchi:acclo) right by SAR
+		
+		s16i	a12, a4, 0 // store the low 16 bits as C[i]
+		addi     a4, a4, 2
+
+		add.n   a2, a2, a14 // Increment A, A = A[i*n]
+		addi    a9, a9, 1 // Increment loop1 counter    
+	blt     a9, a5, small_process_loop
+
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+
+.endm // dspm_mult_s16_m_ae32_vector
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/float/dspm_mult_3x3x1_f32_ae32.S
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/float/dspm_mult_3x3x1_f32_ae32.S
@@ -0,0 +1,75 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspm_mult_platform.h"
+#if (dspm_mult_3x3x1_f32_ae32_enabled == 1)
+
+// This is the matrix multiplication function for the ESP32 processor.
+	.text
+	.align  4
+	.global dspm_mult_3x3x1_f32_ae32
+	.type   dspm_mult_3x3x1_f32_ae32,@function
+// The function implements the following C code:
+// esp_err_t dspm_mult_3x3x1_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
+// {
+	// for (int i=0 ; i< m ; i++)
+	// {
+	//     for (int j=0 ; j< k ; j++)
+	//     {
+	//         C[i*k + j] = A[i*n]*B[j];
+	//         for (int s=1; s< n ; s++)
+	//         {
+	//             C[i*k + j] += A[i*n + s]*B[s*k + j];
+	//         }
+	//     }
+	// }
+//     return ESP_OK;
+// }
+
+dspm_mult_3x3x1_f32_ae32: 
+// C(3x1) = A(3x3) * B(3x1), single precision.
+// a2 - A, advanced by one row (12 bytes) per iteration
+// a3 - B
+// a4 - C, advanced by one float per iteration
+
+// a5 - integer 0, moved into f0 to clear the accumulator
+// a6 - 3, iteration count of the zero-overhead loop
+// f13..f15 - B[0..2], f2..f4 - current row of A, f0 - accumulator
+	entry	a1, 16
+
+	// The whole vector B stays in registers for all three rows
+	lsi	    f15, a3, 8
+	lsi	    f14, a3, 4
+	lsi	    f13, a3, 0
+
+	movi.n  a6, 3
+	movi.n  a5, 0
+
+	loopnez     a6, loop_mac_3x3x1_end_m_ae32
+		// Fetch the row, then accumulate in the order s = 0, 1, 2
+		lsi	    f2, a2, 0
+		lsi	    f3, a2, 4
+		lsi	    f4, a2, 8
+		wfr	    f0, a5
+		madd.s	f0, f2, f13
+		madd.s	f0, f3, f14
+		madd.s	f0, f4, f15
+
+		// Write C[i] and step both pointers
+		ssi	    f0, a4, 0
+		addi    a4, a4, 4
+		addi	a2, a2, 12
+	loop_mac_3x3x1_end_m_ae32:
+
+	movi.n	a2, 0 // ESP_OK
+	retw.n
+
+#endif //
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/float/dspm_mult_3x3x3_f32_ae32.S
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/float/dspm_mult_3x3x3_f32_ae32.S
@@ -0,0 +1,85 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspm_mult_platform.h"
+#if (dspm_mult_3x3x3_f32_ae32_enabled == 1)
+
+// This is the matrix multiplication function for the ESP32 processor.
+	.text
+	.align  4
+	.global dspm_mult_3x3x3_f32_ae32
+	.type   dspm_mult_3x3x3_f32_ae32,@function
+// The function implements the following C code:
+// esp_err_t dspm_mult_3x3x1_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
+// {
+	// for (int i=0 ; i< m ; i++)
+	// {
+	//     for (int j=0 ; j< k ; j++)
+	//     {
+	//         C[i*k + j] = A[i*n]*B[j];
+	//         for (int s=1; s< n ; s++)
+	//         {
+	//             C[i*k + j] += A[i*n + s]*B[s*k + j];
+	//         }
+	//     }
+	// }
+//     return ESP_OK;
+// }
+
+dspm_mult_3x3x3_f32_ae32: 
+// C(3x3) = A(3x3) * B(3x3), single precision, produced column by column.
+// a2 - A
+// a3 - B, moved one column (4 bytes) to the right per outer pass
+// a4 - C, moved one column (4 bytes) to the right per outer pass
+
+// a5 - integer 0, moved into f0 to clear the accumulator
+// a6 - 3 - inner zero-overhead loop count (rows of A)
+// a7 - 3 - outer loop count (columns of B and C)
+// a12 / a14 - row pointer into A / element pointer into C for the inner loop
+	entry	a1, 16
+
+	movi.n  a7, 3
+	movi.n  a6, 3
+	movi.n  a5, 0
+	
+m_loop_3x3x3:
+		// Current column of B (row stride is 12 bytes) kept in f12..f14
+		lsi	    f12, a3, 0
+		lsi	    f13, a3, 12
+		lsi	    f14, a3, 24
+
+		mov a14, a4 // first element of the output column
+		mov a12, a2 // first row of A
+
+		loopnez     a6, loop_mac_3x3x3_end_m_ae32
+			// Fetch one row of A, then accumulate in the order s = 0, 1, 2
+			lsi	    f2, a12, 0
+			lsi	    f3, a12, 4
+			lsi	    f4, a12, 8
+			wfr	    f0, a5
+			madd.s	f0, f2, f12
+			madd.s	f0, f3, f13
+			madd.s	f0, f4, f14
+
+			// Store C[i][j]; both pointers step one row (12 bytes) down
+			ssi	    f0, a14, 0
+			addi    a14, a14, 12
+			addi	a12, a12, 12
+		loop_mac_3x3x3_end_m_ae32:
+
+		// Next column
+		addi a7, a7, -1
+		addi a4, a4, 4
+		addi a3,  a3,  4
+	bnez    a7, m_loop_3x3x3
+
+	movi.n	a2, 0 // ESP_OK
+	retw.n
+
+#endif //
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/float/dspm_mult_4x4x1_f32_ae32.S
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/float/dspm_mult_4x4x1_f32_ae32.S
@@ -0,0 +1,77 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspm_mult_platform.h"
+#if (dspm_mult_4x4x1_f32_ae32_enabled == 1)
+
+// This is the matrix multiplication function for the ESP32 processor.
+	.text
+	.align  4
+	.global dspm_mult_4x4x1_f32_ae32
+	.type   dspm_mult_4x4x1_f32_ae32,@function
+// The function implements the following C code:
+// esp_err_t dspm_mult_3x3x1_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
+// {
+	// for (int i=0 ; i< m ; i++)
+	// {
+	//     for (int j=0 ; j< k ; j++)
+	//     {
+	//         C[i*k + j] = A[i*n]*B[j];
+	//         for (int s=1; s< n ; s++)
+	//         {
+	//             C[i*k + j] += A[i*n + s]*B[s*k + j];
+	//         }
+	//     }
+	// }
+//     return ESP_OK;
+// }
+
+dspm_mult_4x4x1_f32_ae32: 
+// C(4x1) = A(4x4) * B(4x1), single precision.
+// a2 - A, advanced by one row (16 bytes) per iteration
+// a3 - B
+// a4 - C, advanced by one float per iteration
+
+// a5 - integer 0, moved into f0 to clear the accumulator
+// a6 - 4, iteration count of the zero-overhead loop
+// f12..f15 - B[0..3], f2..f5 - current row of A, f0 - accumulator
+	entry	a1, 16
+
+	// The whole vector B stays in registers for all four rows
+	lsi	    f15, a3, 12
+	lsi	    f14, a3, 8
+	lsi	    f13, a3, 4
+	lsi	    f12, a3, 0
+
+	movi.n  a6, 4
+	movi.n  a5, 0
+
+	loopnez     a6, loop_mac_4x4x1_end_m_ae32
+		// Fetch the row, then accumulate in the order s = 0, 1, 2, 3
+		lsi	    f2, a2, 0
+		lsi	    f3, a2, 4
+		lsi	    f4, a2, 8
+		lsi	    f5, a2, 12
+		wfr	    f0, a5
+		madd.s	f0, f2, f12
+		madd.s	f0, f3, f13
+		madd.s	f0, f4, f14
+		madd.s	f0, f5, f15
+
+		// Write C[i] and step both pointers
+		ssi	    f0, a4, 0
+		addi    a4, a4, 4
+		addi	a2, a2, 16
+	loop_mac_4x4x1_end_m_ae32:
+
+	movi.n	a2, 0 // ESP_OK
+	retw.n
+
+#endif //
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/float/dspm_mult_4x4x4_f32_ae32.S
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/float/dspm_mult_4x4x4_f32_ae32.S
@@ -0,0 +1,88 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspm_mult_platform.h"
+#if (dspm_mult_4x4x4_f32_ae32_enabled == 1)
+
+// This is the matrix multiplication function for the ESP32 processor.
+	.text
+	.align  4
+	.global dspm_mult_4x4x4_f32_ae32
+	.type   dspm_mult_4x4x4_f32_ae32,@function
+// The function implements the following C code:
+// esp_err_t dspm_mult_3x3x1_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
+// {
+	// for (int i=0 ; i< m ; i++)
+	// {
+	//     for (int j=0 ; j< k ; j++)
+	//     {
+	//         C[i*k + j] = A[i*n]*B[j];
+	//         for (int s=1; s< n ; s++)
+	//         {
+	//             C[i*k + j] += A[i*n + s]*B[s*k + j];
+	//         }
+	//     }
+	// }
+//     return ESP_OK;
+// }
+
+dspm_mult_4x4x4_f32_ae32: 
+// C(4x4) = A(4x4) * B(4x4), single precision, produced column by column.
+// a2 - A
+// a3 - B, moved one column (4 bytes) to the right per outer pass
+// a4 - C, moved one column (4 bytes) to the right per outer pass
+
+// a5 - integer 0, moved into f0 to clear the accumulator
+// a6 - 4 - inner zero-overhead loop count (rows of A)
+// a7 - 4 - outer loop count (columns of B and C)
+// a12 / a14 - row pointer into A / element pointer into C for the inner loop
+	entry	a1, 16
+
+	movi.n  a7, 4
+	movi.n  a6, 4
+	movi.n  a5, 0
+	
+m_loop_4x4x4:
+		// Current column of B (row stride is 16 bytes) kept in f12..f15
+		lsi	    f12, a3, 0
+		lsi	    f13, a3, 16
+		lsi	    f14, a3, 32
+		lsi	    f15, a3, 48
+
+		mov a14, a4 // first element of the output column
+		mov a12, a2 // first row of A
+
+		loopnez     a6, loop_mac_4x4x4_end_m_ae32
+			// Fetch one row of A, then accumulate in the order s = 0..3
+			lsi	    f2, a12, 0
+			lsi	    f3, a12, 4
+			lsi	    f4, a12, 8
+			lsi	    f5, a12, 12
+			wfr	    f0, a5
+			madd.s	f0, f2, f12
+			madd.s	f0, f3, f13
+			madd.s	f0, f4, f14
+			madd.s	f0, f5, f15
+
+			// Store C[i][j]; both pointers step one row (16 bytes) down
+			ssi	    f0, a14, 0
+			addi    a14, a14, 16
+			addi	a12, a12, 16
+		loop_mac_4x4x4_end_m_ae32:
+
+		// Next column
+		addi a7, a7, -1
+		addi a4, a4, 4
+		addi a3,  a3,  4
+	bnez    a7, m_loop_4x4x4
+
+	movi.n	a2, 0 // ESP_OK
+	retw.n
+
+#endif //
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/float/dspm_mult_ex_f32_ae32.S
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/float/dspm_mult_ex_f32_ae32.S
@@ -0,0 +1,88 @@
+/*
+ * SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "dspm_mult_platform.h"
+#if (dspm_mult_f32_ae32_enabled == 1)
+
+#include "dsps_dotprode_f32_m_ae32.S"
+
+ // This is matrix multiplication function for ESP32 processor.
+    .text
+    .align  4
+    .global  dspm_mult_ex_f32_ae32
+    .global .dspm_mult_ex_f32_ae32_body
+    .type    dspm_mult_ex_f32_ae32,@function
+// The function implements the following C code:
+//esp_err_t dspm_mult_ex_f32_ae32(const float *A, const float *B, float *C, int m, int n, int k, int A_padd, int B_padd, int C_padd);
+
+dspm_mult_ex_f32_ae32: 
+// Single-precision matrix multiply C = A(m,n) * B(n,k) on sub-matrices:
+// each row of A, B and C may be followed by extra "padding" elements that
+// are skipped. Returns ESP_OK (0) in a2.
+
+// A         - a2
+// B         - a3
+// C         - a4
+// m         - a5
+// n         - a6
+// k         - a7
+// A_padding - a14 (loaded from the stack, then turned into A_step = A_padding + n)
+// B_padding - a15 (loaded from the stack, then turned into the B row step in bytes)
+// C_padding - a8  (loaded from the stack)
+
+// a10 = 4
+// a9  - counter loop1: 0..m
+// a11 - counter loop2: 0..k
+// a12 - A
+// a13 - B
+// a4  - C
+
+    entry   a1, 16
+    // Array increment for floating point data should be 4
+.dspm_mult_ex_f32_ae32_body:
+
+    // Arguments 7..9 are read from the caller's stack area
+    l32i.n  a14, a1, 16     // A_padding
+    l32i.n  a15, a1, 20     // B_padding
+    l32i.n  a8,  a1, 24     // C_padding
+
+    add     a14, a14, a6    // A_step = A_padding + A_cols (n)
+    add     a15, a15, a7    // B_step = B_padding + B_cols (k)
+    slli    a15, a15, 2     // Pointer increment for B (B_step * 4)
+
+    movi.n  a10, 4          // Increment = 4
+    movi.n  a9, 0           // counter loop1
+    const.s f3, 0           // Initial state of accumulator, f3 = 0
+
+.mult_ex_loop1:
+    movi.n  a11, 0 // reset counter for loop2
+    .mult_ex_loop2:
+        // Clear initial state of the result register
+        // a2 - A
+        // a3 - B
+        // a6 - n
+        // a10 - step == 4 bytes
+        
+        mov     a12, a2             // load A
+        addx4   a13, a11, a3        // a13 = B + 4 * column index
+        mov.s   f1, f3              // reset f1
+
+        // Calculating dotproduct...
+        // The macro is defined outside this file; the result is taken from f1.
+        //dotprode_f32_ae32( x1   x2   count step1 step2)
+        dotprode_f32_ae32    a12, a13, a6,   a10,  a15;
+
+        addi.n  a11, a11, 1         // Increment loop2 counter
+        ssip    f1,  a4,  4         // Store result from f1 to memory at a4 and increment a4
+
+        // check loop 2
+        blt   a11, a7, .mult_ex_loop2
+
+    // check loop 1
+    addx4   a2, a14, a2      // A += (A_step << 2)
+    addx4   a4, a8,  a4      // output += (C_padding << 2), a4 already advanced by k floats
+    addi.n  a9, a9, 1        // Increment loop1 counter
+    blt     a9, a5, .mult_ex_loop1
+
+    movi.n  a2, 0   // return status ESP_OK
+    retw.n
+
+#endif //dspm_mult_f32_ae32_enabled
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/float/dspm_mult_ex_f32_aes3.S
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/float/dspm_mult_ex_f32_aes3.S
@@ -0,0 +1,166 @@
+/*
+ * SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "dspm_mult_platform.h"
+
+#if (dspm_mult_f32_aes3_enabled == 1)
+
+// This is matrix multiplication function for ESP32S3 processor.
+    .text
+    .align  4
+    .global  dspm_mult_ex_f32_aes3
+    .global .dspm_mult_ex_f32_ae32_body
+    .type    dspm_mult_ex_f32_aes3,@function
+// The function implements the following C code:
+//esp_err_t dspm_mult_ex_f32_ansi(const float* A, const float* B, float* C, int A_rows, int A_cols, int B_cols, int A_padding, int B_padding, int C_padding)
+//{
+//    const int A_step = A_cols + A_padding;
+//    const int B_step = B_cols + B_padding;
+//    const int C_step = B_cols + C_padding;
+//
+//    for (int i = 0; i < A_rows; i++) {
+//        for (int j = 0; j < B_cols; j++) {
+//            C[i * C_step + j] = A[i * A_step] * B[j];
+//            for (int s = 1; s < A_cols; s++) {
+//                C[i * C_step + j] += A[i * A_step + s] * B[s * B_step + j];
+//            }
+//        }
+//    }
+//    return ESP_OK;
+//}
+
+// A - a2
+// B - a3
+// C - a4
+// m - a5
+// n - a6
+// k - a7
+// A_padd = a8
+// B_padd = a9
+// C_padd = a15
+
+// Entry point: padded float matrix multiply for ESP32-S3.
+// The vectorised path below is taken only when m, n, k and all three paddings
+// are multiples of 4 and the A, B and C pointers are 16-byte aligned; any other
+// input is forwarded to .dspm_mult_ex_f32_ae32_body (the generic ESP32 code).
+// The vectorised path returns 0 (ESP_OK) in a2.
+dspm_mult_ex_f32_aes3:
+
+    entry   a1, 16
+    l32i.n  a8, a1, 16     // A_padding
+    l32i.n  a9, a1, 20     // B_padding
+    l32i.n  a15,  a1, 24   // C_padding
+
+    // Check if we can use S3 memory model
+    // Matrix dimensions and paddings must all be divisible by 4
+    or      a12, a5, a6         // a12 = m OR n
+    or      a14, a8, a9         // a14 = A_padd OR B_padd
+    or      a12, a12, a7        // a12 = m OR n OR k
+    or      a14, a14, a15       // a14 = A_padd OR B_padd OR C_padd
+    or      a12, a12, a14       // a12 = m OR n OR k OR A_padd OR B_padd OR C_padd
+    movi.n  a11, 3              // a11 = mask of the two low bits
+    and     a12, a12, a11       // a12 != 0 if any value is not a multiple of 4
+
+    // Check alignment of A B C matrices data pointers
+    movi.n  a11, 15             // a11 = mask of the four low bits
+    or      a10, a3,  a2        // a10 = A pointer OR B pointer
+    or      a10, a10, a4        // a10 = A pointer OR B pointer OR C pointer
+    and     a10, a10, a11       // a10 != 0 if any pointer is not 16-byte aligned
+    or      a12, a12, a10       // a12 = dimension check OR alignment check
+    beqz    a12, .s3_mmult_ex   // all checks passed: use the S3 path
+    // Otherwise continue inside the ESP32 function body
+    J      .dspm_mult_ex_f32_ae32_body
+
+.s3_mmult_ex:
+// f0, f1, f2, f3 - multiplication result
+// f4, f5, f6, f7 - input for matrix B
+// f8, f9, f10,f11- input for matrix A
+    movi.n      a14, 0          // a14 = byte offset of the current 4-column block inside a B row
+
+    add         a15, a15, a7    // a15 = k + C_padding
+    slli        a10, a15, 2     // a10 = (k + C_padding) * 4 - C row step in bytes
+
+    mov         a15, a9         // a15 = B_padd
+    slli        a15, a15, 2     // a15 = B_padd * 4 - start value of the column-block counter
+
+    add         a7, a7, a9      // a7 = k + B_padding
+    slli        a12, a7, 2      // a12 = (k + B_padding) * 4 - B row step in bytes, also the end value for a15
+    srli        a11, a6, 2      // a11 = n / 4
+    addi.n      a11, a11, -1    // a11 = inner loop count: n / 4 - 1 (the first group of 4 is unrolled below)
+
+    slli        a6, a8, 2       // a6 = A_padding * 4 = bytes to skip at the end of each A row
+    mov         a13, a3         // backup B pointer
+    mov         a7, a4          // backup C pointer
+
+.loop_x_mult_ex:                // outer loop: one pass per block of 4 output columns
+    movi.n      a9,  0          // reset row counter
+    mov         a8,  a2         // move A pointer back to the beginning
+    .loop_y_mult_ex:            // middle loop: one pass per row of A / C
+
+        add  a13, a3, a14       // a13 = B + offset of the current column block (a14)
+        EE.LDF.128.IP f11, f10, f9, f8, a8, 16  // Load A values: X11, X12, X13, X14
+        EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y11, Y12, Y13, Y14
+        mul.s   f0, f4, f8      // f0 = X11*Y11
+        mul.s   f1, f5, f8      // f1 = X11*Y12
+        mul.s   f2, f6, f8      // f2 = X11*Y13
+        mul.s   f3, f7, f8      // f3 = X11*Y14
+
+        EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y21, Y22, Y23, Y24
+        madd.s  f0, f4, f9      // f0 = X11*Y11 + X12*Y21
+        madd.s  f1, f5, f9      // f1 = X11*Y12 + X12*Y22
+        madd.s  f2, f6, f9      // f2 = X11*Y13 + X12*Y23
+        madd.s  f3, f7, f9      // f3 = X11*Y14 + X12*Y24
+
+        EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y31, Y32, Y33, Y34
+        madd.s  f0, f4, f10     // f0 = X11*Y11 + X12*Y21 + X13*Y31
+        madd.s  f1, f5, f10     // f1 = X11*Y12 + X12*Y22 + X13*Y32
+        madd.s  f2, f6, f10     // f2 = X11*Y13 + X12*Y23 + X13*Y33
+        madd.s  f3, f7, f10     // f3 = X11*Y14 + X12*Y24 + X13*Y34
+
+        EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y41, Y42, Y43, Y44
+        madd.s  f0, f4, f11     // f0 = X11*Y11 + X12*Y21 + X13*Y31 + X14*Y41
+        madd.s  f1, f5, f11     // f1 = X11*Y12 + X12*Y22 + X13*Y32 + X14*Y42
+        madd.s  f2, f6, f11     // f2 = X11*Y13 + X12*Y23 + X13*Y33 + X14*Y43
+        madd.s  f3, f7, f11     // f3 = X11*Y14 + X12*Y24 + X13*Y34 + X14*Y44
+
+        // Remaining groups of 4 elements of the current A row (a11 iterations)
+        loopnez a11, .iner_loop_mult_ex
+            EE.LDF.128.IP f11, f10, f9, f8, a8, 16  // Load A values: X15, X16, X17, X18
+
+            EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y51, Y52, Y53, Y54
+            madd.s  f0, f4, f8      // f0 += X15*Y51
+            madd.s  f1, f5, f8      // f1 += X15*Y52
+            madd.s  f2, f6, f8      // f2 += X15*Y53
+            madd.s  f3, f7, f8      // f3 += X15*Y54
+
+            EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y61, Y62, Y63, Y64
+            madd.s  f0, f4, f9      // f0 += X16*Y61
+            madd.s  f1, f5, f9      // f1 += X16*Y62
+            madd.s  f2, f6, f9      // f2 += X16*Y63
+            madd.s  f3, f7, f9      // f3 += X16*Y64
+
+            EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y71, Y72, Y73, Y74
+            madd.s  f0, f4, f10     // f0 += X17*Y71
+            madd.s  f1, f5, f10     // f1 += X17*Y72
+            madd.s  f2, f6, f10     // f2 += X17*Y73
+            madd.s  f3, f7, f10     // f3 += X17*Y74
+
+            EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y81, Y82, Y83, Y84
+            madd.s  f0, f4, f11     // f0 += X18*Y81
+            madd.s  f1, f5, f11     // f1 += X18*Y82
+            madd.s  f2, f6, f11     // f2 += X18*Y83
+            madd.s  f3, f7, f11     // f3 += X18*Y84
+        .iner_loop_mult_ex:
+        EE.STF.128.XP f3, f2, f1, f0, a4, a10 // Store 4 results, C pointer then moves by one C row step (a10)
+
+        addi.n  a9,  a9, 1          // Increment row counter
+        add     a8,  a8, a6         // Skip the padding at the end of the A row (A_padding * 4 bytes)
+    blt   a9, a5, .loop_y_mult_ex   // repeat for all m rows
+
+    addi.n  a7,  a7,  16            // Increase C pointer backup by 16 bytes (next 4 columns)
+    mov     a4,  a7
+    addi.n  a14, a14, 16            // Increase B column offset by 16 bytes
+    addi.n  a15, a15, 16            // Increment column-block counter by 16 (runs from B_padd*4 to (k + B_padd)*4)
+
+blt   a15, a12, .loop_x_mult_ex
+    movi.n  a2, 0 // return status ESP_OK
+    retw.n
+
+#endif //dspm_mult_f32_aes3_enabled
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/float/dspm_mult_ex_f32_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/float/dspm_mult_ex_f32_ansi.c
@@ -0,0 +1,57 @@
+/*
+ * SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "dspm_mult.h"
+
+// Matrix A(m,n): m - number of rows, n - number of columns
+// C(m,k) = A(m,n)*B(n,k)
+// c(i * c_step,j) = sum(a(i * a_step,s)*b(s * b_step,j)) , s=1..n
+esp_err_t dspm_mult_ex_f32_ansi(const float *A, const float *B, float *C, int A_rows, int A_cols, int B_cols, int A_padding, int B_padding, int C_padding)
+{
+    // All three buffers are mandatory.
+    if ((NULL == A) || (NULL == B) || (NULL == C)) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+    // Every dimension has to be strictly positive.
+    if ((A_rows <= 0) || (A_cols <= 0) || (B_cols <= 0)) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+    // A padding may be zero, but never negative.
+    if ((A_padding < 0) || (B_padding < 0) || (C_padding < 0)) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+
+    // Distance, in elements, between the starts of two consecutive rows.
+    const int a_stride = A_cols + A_padding;
+    const int b_stride = B_cols + B_padding;
+    const int c_stride = B_cols + C_padding;
+
+    for (int row = 0; row < A_rows; row++) {
+        const float *a_row = &A[row * a_stride];
+        float *c_row = &C[row * c_stride];
+
+        for (int col = 0; col < B_cols; col++) {
+            float *out = &c_row[col];
+
+            // The first product initialises the output element, the rest accumulate into it.
+            *out = a_row[0] * B[col];
+            for (int s = 1; s < A_cols; s++) {
+                *out += a_row[s] * B[s * b_stride + col];
+            }
+        }
+    }
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/float/dspm_mult_ex_f32_arp4.S
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/float/dspm_mult_ex_f32_arp4.S
@@ -0,0 +1,115 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspm_mult_platform.h"
+#if (dspm_mult_f32_arp4_enabled == 1)
+
+// This is the matrix multiplication function for the ESP32-P4 processor.
+    .text
+    .align  4
+    .global dspm_mult_ex_f32_arp4
+    .global .dspm_mult_ex_f32_arp4_body
+    .type   dspm_mult_ex_f32_arp4,@function
+// The function implements the following C code:
+// esp_err_t dspm_mult_ex_f32_ansi(const float *A, const float *B, float *C, int m, int n, int k, int A_padd, int B_padd, int C_padd)
+// {
+//     const int A_step = n + A_padd;
+//     const int B_step = k + B_padd;
+//     const int C_step = k + C_padd;
+//
+//     for (int i = 0; i < m; i++) {
+//         for (int j = 0; j < k; j++) {
+//             C[i * C_step + j] = A[i * A_step] * B[j];
+//             for (int s = 1; s < n; s++) {
+//                 C[i * C_step + j] += A[i * A_step + s] * B[s * B_step + j];
+//             }
+//         }
+//     }
+//     return ESP_OK;
+// }
+
+// Padded float matrix multiply (RISC-V). Argument registers on entry:
+//   a0 = A, a1 = B, a2 = C, a3 = m, a4 = n, a5 = k, a6 = A_padding, a7 = B_padding;
+//   C_padding is read from the stack.
+// Returns 0 (ESP_OK) in a0.
+dspm_mult_ex_f32_arp4: 
+// Mapping "Xtensa register : RISC-V register" used when this code was ported:
+// A - a2: a0
+// B - a3: a1
+// C - a4: a2
+// m - a5: a3
+// n - a6: a4
+// k - a7: a5
+
+// a8:a6  = n*4
+// a10:t0 = 4
+// a9:a7  - counter loop1: 0..m
+// a11:t1 - counter loop2: 0..k
+// a12:t2 - A
+// a13:t3 - B
+// a14:t4
+// a15:t5
+
+    add sp,sp,-16
+    // Array increment for floating point data should be 4
+.dspm_mult_ex_f32_arp4_body:
+
+    mv      t5, a7        // t5 = B_padding (a7 is reused as the row counter below)
+
+    add     t4, a6, a4    // A_step = A_padding + A_cols (n)
+    add     t5, t5, a5    // B_step = B_padding + B_cols (k)
+    slli    t5, t5, 2     // Pointer increment for B (B_step * 4)
+    slli    t4, t4, 2     // A_step * 4: byte distance between two A rows
+    lw      a6,  16(sp)   // C_padding from stack (offset 16 compensates the sp adjustment above)
+    slli    a6, a6, 2     // C_padding * 4: bytes to skip at the end of each C row
+    
+    li  a7, 0  // counter loop1 (rows, 0..m)
+
+.dpf_loop1:    
+    li  t1, 0 // reset counter for loop2 (columns, 0..k)
+.dpf_loop2:
+
+        // Set up one dot product: row of A times column t1 of B.
+        // a0 - start of the current A row
+        // a1 - B
+        // a4 - n (number of products)
+        // t5 - B row step in bytes
+        mv      t2, a0 // t2 = running A pointer, starts at the current row
+
+        slli     t3, t1, 2 // loop count to pointer value
+        add      t3, a1, t3 // t3 = running B pointer, starts at B[0][t1]
+
+        fmv.w.x fa2,zero // reset fa2
+        // Calculating dotproduct...
+        // NOTE(review): esp.lp.setup presumably starts a hardware loop that runs the
+        // body up to .matrix_mul_loop a4 (= n) times - confirm against the ESP32-P4 ISA docs.
+        esp.lp.setup    0, a4, .matrix_mul_loop
+            flw     fa0, 0(t2)              // fa0 = next A element
+            add     t2, t2, 4               // advance along the A row
+            flw     fa1, 0(t3)              // fa1 = next B element
+            fmadd.s   fa2, fa1, fa0, fa2    // fa2 += fa1 * fa0
+        .matrix_mul_loop: add       t3, t3, t5  // advance to the next B row
+
+        fsw     fa2, 0(a2) // store the result element
+        addi    a2, a2, 4 // increment a2 for next time
+        // check loop 2
+        addi  t1, t1, 1 // Increment loop2 counter
+        blt   t1, a5, .dpf_loop2
+
+    // check loop 1
+    add   a0, a0, t4      // A += (A_step << 2)
+    add   a2, a2, a6      // output += (C_padding << 2)
+
+    add   a7, a7, 1 // Increment loop1 counter
+    blt   a7, a3, .dpf_loop1
+
+    // Exit
+    li  a0, 0       // return status ESP_OK
+    add sp,sp,16
+    ret
+
+#endif //dspm_mult_f32_arp4_enabled
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/float/dspm_mult_f32_ae32.S
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/float/dspm_mult_f32_ae32.S
@@ -0,0 +1,104 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspm_mult_platform.h"
+#if (dspm_mult_f32_ae32_enabled == 1)
+
+#include "dsps_dotprode_f32_m_ae32.S"
+
+// This is the matrix multiplication function for the ESP32 processor.
+	.text
+	.align  4
+	.global dspm_mult_f32_ae32
+	.global .dspm_mult_f32_ae32_body
+	.type   dspm_mult_f32_ae32,@function
+// The function implements the following C code:
+// esp_err_t dspm_mult_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
+// {
+	// for (int i=0 ; i< m ; i++)
+	// {
+	//     for (int j=0 ; j< k ; j++)
+	//     {
+	//         C[i*k + j] = A[i*n]*B[j];
+	//         for (int s=1; s< n ; s++)
+	//         {
+	//             C[i*k + j] += A[i*n + s]*B[s*k + j];
+	//         }
+	//     }
+	// }
+//     return ESP_OK;
+// }
+
+// Float matrix multiply C[m][k] = A[m][n] * B[n][k] for ESP32 (Xtensa).
+// Each output element is one dot product computed by the dotprode_f32_ae32 macro.
+// Returns 0 (ESP_OK) in a2.
+dspm_mult_f32_ae32: 
+// A - a2
+// B - a3
+// C - a4
+// m - a5
+// n - a6
+// k - a7
+
+// a8  = n*4
+// a10 = 4
+// a9  - counter loop1: 0..m
+// a11 - counter loop2: 0..k
+// a12 - A
+// a13 - B
+// a4  - C
+
+	entry	a1, 16
+	// Array increment for floating point data should be 4
+.dspm_mult_f32_ae32_body:
+	slli    a8, a6, 2 // a8 = n * 4: byte size of one A row
+	slli    a15,a7, 2 // a15 = k * 4: byte size of one B row
+
+	movi.n	a14, 0 // Initial state of accumulator f1
+	movi.n	a10, 4 // Increment = 4
+	movi.n	a9, 0  // counter loop1
+
+.dpf_loop1:    
+	movi.n	a11, 0 // reset counter for loop2
+.dpf_loop2:
+
+	// Set up one dot product: current row of A times column a11 of B
+	// a2 - A
+	// a3 - B
+	// a6 - n
+	// a10 - step == 4 bytes
+	// a8 -  step n*4
+	mov      a12, a2 // a12 = start of the current A row
+
+	slli     a13, a11, 2 // loop count to pointer value
+	add.n    a13, a3, a13 // a13 = &B[0][a11], top of the current B column
+
+	wfr	    f1, a14 // reset f1
+	// Calculating dotproduct...
+	// Macro arguments: x1, x2, count, step1, step2. The macro body is in
+	// dsps_dotprode_f32_m_ae32.S; presumably it accumulates into f1 - confirm there.
+	dotprode_f32_ae32 a12, a13, a6, a10, a15;
+
+	ssi	    f1, a4, 0 // Store result from f1 to memory at a4
+	addi    a4, a4, 4 // increment a4 for next time
+
+	// check loop 2
+	addi  a11, a11, 1 // Increment loop2 counter
+	blt   a11, a7, .dpf_loop2
+
+	// check loop 1
+	add.n   a2, a2, a8 // Increment A, A = A[i*n]
+
+	addi  a9, a9, 1 // Increment loop1 counter
+	blt   a9, a5, .dpf_loop1
+
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+#endif //dspm_mult_f32_ae32_enabled
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/float/dspm_mult_f32_aes3.S
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/float/dspm_mult_f32_aes3.S
@@ -0,0 +1,150 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspm_mult_platform.h"
+
+#if (dspm_mult_f32_aes3_enabled == 1)
+
+
+// This is the matrix multiplication function for the ESP32-S3 processor.
+	.text
+	.align  4
+	.global dspm_mult_f32_aes3
+	.global .dspm_mult_f32_ae32_body
+	.type   dspm_mult_f32_aes3,@function
+// The function implements the following C code:
+// esp_err_t dspm_mult_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
+// {
+	// for (int i=0 ; i< m ; i++)
+	// {
+	//     for (int j=0 ; j< k ; j++)
+	//     {
+	//         C[i*k + j] = A[i*n]*B[j];
+	//         for (int s=1; s< n ; s++)
+	//         {
+	//             C[i*k + j] += A[i*n + s]*B[s*k + j];
+	//         }
+	//     }
+	// }
+//     return ESP_OK;
+// }
+
+// Float matrix multiply C[m][k] = A[m][n] * B[n][k], ESP32-S3 entry point.
+// The vectorised path is used only when m, n and k are multiples of 4 and the
+// A, B and C pointers are 16-byte aligned; otherwise execution continues in
+// .dspm_mult_f32_ae32_body. The vectorised path returns 0 (ESP_OK) in a2.
+dspm_mult_f32_aes3: 
+	entry	a1, 16
+// A - a2
+// B - a3
+// C - a4
+// m - a5
+// n - a6
+// k - a7
+
+	// Check if we can use S3 memory model:
+	or a12, a5, a6			// a12 = m OR n
+	or a12, a7, a12			// a12 = m OR n OR k
+	movi.n	a11, 3			// mask of the two low bits
+	and a12, a12, a11		// a12 != 0 if a dimension is not a multiple of 4
+	movi.n   a11, 15		// mask of the four low bits
+	or       a10, a3, a2	// a10 = A pointer OR B pointer
+	or       a10, a10, a4	// a10 = A pointer OR B pointer OR C pointer
+	and		 a10, a10, a11	// a10 != 0 if a pointer is not 16-byte aligned
+	or		 a12, a12, a10	// combine both checks
+	beqz  a12, .s3_mmult	// everything fits: use the S3 path
+	// Otherwise continue inside the ESP32 function body
+	J 	.dspm_mult_f32_ae32_body
+
+.s3_mmult:
+// f0, f1, f2, f3 - multiplication result
+// f4, f5, f6, f7 - input for matrix B
+// f8, f9, f10,f11- input for matrix A
+	movi.n	a14, 0				// a14 = byte offset of the current 4-column block inside a B row
+
+	slli     	a12, a7, 2		// a12 = K*4 - B row step in bytes, also the end value for a15
+	slli     	a10, a7, 2		// a10 = K*4 - C row step in bytes
+	srli	    a11, a6, 2		// N count: n / 4
+	addi.n		a11, a11, -1	// inner loop count: n / 4 - 1 (first group of 4 is unrolled below)
+
+	movi.n		a15, 0			// a15 = column-block counter in bytes
+	mov	 a13, a3				// backup B pointer
+	mov  a7, a4					// backup C pointer
+
+.loop_x_aes3:					// outer loop: one pass per block of 4 output columns
+	movi.n		a9, 0			// reset row counter
+	mov      	a8,  a2		// A matrix: back to the beginning
+	.loop_y_aes3:				// middle loop: one pass per row of A / C
+		add	 a13, a3, a14		// a13 = B + offset of the current column block (a14)
+		EE.LDF.128.IP f11, f10, f9, f8, a8, 16  // Load A values: X11, X12, X13, X14
+		EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y11, Y12, Y13, Y14
+		mul.s	f0, f4, f8		// f0 = X11*Y11
+		mul.s	f1, f5, f8		// f1 = X11*Y12
+		mul.s	f2, f6, f8		// f2 = X11*Y13
+		mul.s	f3, f7, f8		// f3 = X11*Y14
+
+		EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y21, Y22, Y23, Y24
+		madd.s	f0, f4, f9		// f0 = X11*Y11 + X12*Y21
+		madd.s	f1, f5, f9		// f1 = X11*Y12 + X12*Y22
+		madd.s	f2, f6, f9		// f2 = X11*Y13 + X12*Y23
+		madd.s	f3, f7, f9		// f3 = X11*Y14 + X12*Y24
+
+		EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y31, Y32, Y33, Y34
+		madd.s	f0, f4, f10		// f0 = X11*Y11 + X12*Y21 + X13*Y31
+		madd.s	f1, f5, f10		// f1 = X11*Y12 + X12*Y22 + X13*Y32
+		madd.s	f2, f6, f10		// f2 = X11*Y13 + X12*Y23 + X13*Y33
+		madd.s	f3, f7, f10		// f3 = X11*Y14 + X12*Y24 + X13*Y34
+
+		EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y41, Y42, Y43, Y44
+		madd.s	f0, f4, f11		// f0 = X11*Y11 + X12*Y21 + X13*Y31 + X14*Y41
+		madd.s	f1, f5, f11		// f1 = X11*Y12 + X12*Y22 + X13*Y32 + X14*Y42
+		madd.s	f2, f6, f11		// f2 = X11*Y13 + X12*Y23 + X13*Y33 + X14*Y43
+		madd.s	f3, f7, f11		// f3 = X11*Y14 + X12*Y24 + X13*Y34 + X14*Y44
+		
+		// Remaining groups of 4 elements of the current A row (a11 iterations)
+		loopnez a11, .loop_end_m_aes3
+			EE.LDF.128.IP f11, f10, f9, f8, a8, 16  // Load A values: X15, X16, X17, X18
+
+			EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y51, Y52, Y53, Y54
+			madd.s	f0, f4, f8		// f0 += X15*Y51
+			madd.s	f1, f5, f8		// f1 += X15*Y52
+			madd.s	f2, f6, f8		// f2 += X15*Y53
+			madd.s	f3, f7, f8		// f3 += X15*Y54
+
+			EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y61, Y62, Y63, Y64
+			madd.s	f0, f4, f9		// f0 += X16*Y61
+			madd.s	f1, f5, f9		// f1 += X16*Y62 
+			madd.s	f2, f6, f9		// f2 += X16*Y63 
+			madd.s	f3, f7, f9		// f3 += X16*Y64 
+
+			EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y71, Y72, Y73, Y74
+			madd.s	f0, f4, f10		// f0 += X17*Y71
+			madd.s	f1, f5, f10		// f1 += X17*Y72
+			madd.s	f2, f6, f10		// f2 += X17*Y73
+			madd.s	f3, f7, f10		// f3 += X17*Y74
+
+			EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y81, Y82, Y83, Y84
+			madd.s	f0, f4, f11		// f0 += X18*Y81
+			madd.s	f1, f5, f11		// f1 += X18*Y82
+			madd.s	f2, f6, f11		// f2 += X18*Y83
+			madd.s	f3, f7, f11		// f3 += X18*Y84
+		.loop_end_m_aes3:
+		EE.STF.128.XP f3, f2, f1, f0, a4, a10 // Store 4 results, C pointer then moves by one row step (a10)
+
+		addi  a9, a9, 1 // Increment row counter
+	blt   a9, a5, .loop_y_aes3	// repeat for all m rows
+	addi.n  a7, a7, 16			// C pointer backup: next block of 4 columns
+	mov		a4, a7
+	addi.n  a14, a14, 16			// B shift for 4 columns (16 bytes)
+	addi  a15, a15, 16 // Increment column-block counter (runs from 0 to k*4)
+blt   a15, a12, .loop_x_aes3
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+#endif //dspm_mult_f32_aes3_enabled
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/float/dspm_mult_f32_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/float/dspm_mult_f32_ansi.c
@@ -0,0 +1,33 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "dsps_dotprod.h"
+#include "dspm_mult.h"
+
+// Matrix A(m,n): m - number of rows, n - number of columns
+// C(m,k) = A(m,n)*B(n,k)
+// c(i,j) = sum(a(i,s)*b(s,j)) , s=1..n
+esp_err_t dspm_mult_f32_ansi(const float *A, const float *B, float *C, int m, int n, int k)
+{
+    for (int row = 0; row < m; row++) {
+        for (int col = 0; col < k; col++) {
+            const float *a_row = &A[row * n];   // start of row `row` of A
+            float *out = &C[row * k + col];     // destination element c(row, col)
+
+            // The first product initialises the output element, the rest accumulate into it.
+            *out = a_row[0] * B[col];
+            for (int s = 1; s < n; s++) {
+                *out += a_row[s] * B[s * k + col];
+            }
+        }
+    }
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/float/dspm_mult_f32_arp4.S
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/float/dspm_mult_f32_arp4.S
@@ -0,0 +1,109 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspm_mult_platform.h"
+#if (dspm_mult_f32_arp4_enabled == 1)
+
+// This is the matrix multiplication function for the ESP32-P4 processor.
+    .text
+    .align  4
+    .global dspm_mult_f32_arp4
+    .global .dspm_mult_f32_arp4_body
+    .type   dspm_mult_f32_arp4,@function
+// The function implements the following C code:
+// esp_err_t dspm_mult_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
+// {
+    // for (int i=0 ; i< m ; i++)
+    // {
+    //     for (int j=0 ; j< k ; j++)
+    //     {
+    //         C[i*k + j] = A[i*n]*B[j];
+    //         for (int s=1; s< n ; s++)
+    //         {
+    //             C[i*k + j] += A[i*n + s]*B[s*k + j];
+    //         }
+    //     }
+    // }
+//     return ESP_OK;
+// }
+
+// esp_err_t dspm_mult_f32_arp4(const float *A, const float *B, float *C, int m, int n, int k)
+// Float matrix multiply C[m][k] = A[m][n] * B[n][k] (RISC-V).
+// In:  a0 = A, a1 = B, a2 = C, a3 = m, a4 = n, a5 = k
+// Out: a0 = 0 (ESP_OK)
+dspm_mult_f32_arp4: 
+// Mapping "Xtensa register : RISC-V register" used when this code was ported:
+// A - a2: a0
+// B - a3: a1
+// C - a4: a2
+// m - a5: a3
+// n - a6: a4
+// k - a7: a5
+
+// a8:a6  = n*4
+// a10:t0 = 4
+// a9:a7  - counter loop1: 0..m
+// a11:t1 - counter loop2: 0..k
+// a12:t2 - A
+// a13:t3 - B
+// a14:t4
+// a15:t5
+
+    add sp,sp,-16
+    // Array increment for floating point data should be 4
+.dspm_mult_f32_arp4_body:
+    slli    a6, a4, 2 // a6 = n * 4: byte size of one A row
+    slli    t5,a5, 2 // t5 = k * 4: byte size of one B row
+
+    li  t4, 0 // t4 is not read anywhere below
+    li  t0, 4 // Increment = 4
+    li  a7, 0  // counter loop1 (rows, 0..m)
+
+.dpf_loop1:    
+    li  t1, 0 // reset counter for loop2 (columns, 0..k)
+.dpf_loop2:
+
+        // Set up one dot product: current row of A times column t1 of B.
+        // a0 - start of the current A row
+        // a1 - B
+        // a4 - n (number of products)
+        // t0 - A step == 4 bytes
+        // t5 - B step == k * 4 bytes
+        mv      t2, a0 // t2 = running A pointer, starts at the current row
+
+        slli     t3, t1, 2 // loop count to pointer value
+        add      t3, a1, t3 // t3 = running B pointer, starts at B[0][t1]
+
+        fmv.w.x fa2,zero // reset fa2
+        // Calculating dotproduct...
+        // NOTE(review): esp.lp.setup presumably starts a hardware loop that runs the
+        // body up to .matrix_mul_loop a4 (= n) times - confirm against the ESP32-P4 ISA docs.
+        esp.lp.setup    0, a4, .matrix_mul_loop
+            flw     fa0, 0(t2)              // fa0 = next A element
+            add     t2, t2, t0              // advance along the A row
+            flw     fa1, 0(t3)              // fa1 = next B element
+            fmadd.s   fa2, fa1, fa0, fa2    // fa2 += fa1 * fa0
+        .matrix_mul_loop: add       t3, t3, t5  // advance to the next B row
+
+        fsw     fa2, 0(a2) // store the result element
+        addi    a2, a2, 4 // increment a2 for next time
+        // check loop 2
+        addi  t1, t1, 1 // Increment loop2 counter
+        blt   t1, a5, .dpf_loop2
+
+    // check loop 1
+    add   a0, a0, a6 // Increment A, A = A[i*n]
+
+    add   a7, a7, 1 // Increment loop1 counter
+    blt   a7, a3, .dpf_loop1
+
+    // Exit
+    // The return value must be 0 (ESP_OK). a6 holds n * 4 at this point, so it
+    // must not be copied into a0.
+    li  a0, 0       // return status ESP_OK
+    add sp,sp,16
+    ret
+
+#endif //dspm_mult_f32_arp4_enabled
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/include/dspm_mult.h
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/include/dspm_mult.h
@@ -0,0 +1,232 @@
+// Copyright 2018-2023 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef _dspm_mult_H_
+#define _dspm_mult_H_
+
+#include "dsp_err.h"
+#include "dspm_mult_platform.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**@{*/
+/**
+ * @brief   Matrix multiplication
+ *
+ * Matrix multiplication for two floating point matrices: C[m][k] = A[m][n] * B[n][k]
+ * The extension (_ansi) use ANSI C and could be compiled and run on any platform.
+ * The extension (_ae32) is optimized for ESP32 chip.
+ *
+ * @param[in] A  input matrix A[m][n]
+ * @param[in] B  input matrix B[n][k]
+ * @param C  result matrix C[m][k]
+ * @param[in] m  matrix dimension
+ * @param[in] n  matrix dimension
+ * @param[in] k  matrix dimension
+ * @return
+ *      - ESP_OK on success
+ *      - One of the error codes from DSP library
+ */
+esp_err_t dspm_mult_f32_ansi(const float *A, const float *B, float *C, int m, int n, int k);
+esp_err_t dspm_mult_f32_ae32(const float *A, const float *B, float *C, int m, int n, int k);
+esp_err_t dspm_mult_f32_aes3(const float *A, const float *B, float *C, int m, int n, int k);
+esp_err_t dspm_mult_f32_arp4(const float *A, const float *B, float *C, int m, int n, int k);
+/**@}*/
+
+
+/**
+ * @brief   Matrix multiplication A[3x3]xB[3x1]
+ *
+ * Matrix multiplication for two floating point matrices 3x3 and 3x1: C[3][1] = A[3][3] * B[3][1]
+ * The implementation is optimized for ESP32 chip.
+ *
+ * @param[in] A  input matrix A[3][3]
+ * @param[in] B  input matrix/vector B[3][1]
+ * @param C  result matrix/vector C[3][1]
+ * @return
+ *      - ESP_OK on success
+ *      - One of the error codes from DSP library
+ */
+esp_err_t dspm_mult_3x3x1_f32_ae32(const float *A, const float *B, float *C);
+
+/**
+ * @brief   Matrix multiplication A[3x3]xB[3x3]
+ *
+ * Matrix multiplication for two square 3x3 floating point matrices: C[3][3] = A[3][3] * B[3][3]
+ * The implementation is optimized for ESP32 chip.
+ *
+ * @param[in] A  input matrix A[3][3]
+ * @param[in] B  input matrix B[3][3]
+ * @param C  result matrix C[3][3]
+ * @return
+ *      - ESP_OK on success
+ *      - One of the error codes from DSP library
+ */
+esp_err_t dspm_mult_3x3x3_f32_ae32(const float *A, const float *B, float *C);
+
+/**
+ * @brief   Matrix multiplication A[4x4]xB[4x1]
+ *
+ * Matrix multiplication for two floating point matrices 4x4 and 4x1: C[4][1] = A[4][4] * B[4][1]
+ * The implementation is optimized for ESP32 chip.
+ *
+ * @param[in] A  input matrix A[4][4]
+ * @param[in] B  input matrix/vector B[4][1]
+ * @param C  result matrix/vector C[4][1]
+ * @return
+ *      - ESP_OK on success
+ *      - One of the error codes from DSP library
+ */
+
+esp_err_t dspm_mult_4x4x1_f32_ae32(const float *A, const float *B, float *C);
+
+/**
+ * @brief   Matrix multiplication A[4x4]xB[4x4]
+ *
+ * Matrix multiplication for two square 4x4 floating point matrices: C[4][4] = A[4][4] * B[4][4]
+ * The implementation is optimized for ESP32 chip.
+ *
+ * @param[in] A  input matrix A[4][4]
+ * @param[in] B  input matrix B[4][4]
+ * @param C  result matrix C[4][4]
+ * @return
+ *      - ESP_OK on success
+ *      - One of the error codes from DSP library
+ */
+esp_err_t dspm_mult_4x4x4_f32_ae32(const float *A, const float *B, float *C);
+
+/**@{*/
+/**
+ * @brief   Matrix multiplication 16 bit signed int
+ *
+ * Matrix multiplication for two signed 16 bit fixed point matrices: C[m][k] = (A[m][n] * B[n][k]) >> (15- shift)
+ * The extension (_ansi) use ANSI C and could be compiled and run on any platform.
+ * The extension (_ae32) is optimized for ESP32 chip.
+ *
+ * @param[in] A  input matrix A[m][n]
+ * @param[in] B  input matrix B[n][k]
+ * @param C  result matrix C[m][k]
+ * @param[in] m  matrix dimension
+ * @param[in] n  matrix dimension
+ * @param[in] k  matrix dimension
+ * @param[in] shift every result will be shifted and stored as 16 bit signed value.
+ * @return
+ *      - ESP_OK on success
+ *      - One of the error codes from DSP library
+ */
+esp_err_t dspm_mult_s16_ansi(const int16_t *A, const int16_t *B, int16_t *C, int m, int n, int k, int shift);
+esp_err_t dspm_mult_s16_ae32(const int16_t *A, const int16_t *B, int16_t *C, int m, int n, int k, int shift);
+esp_err_t dspm_mult_s16_aes3(const int16_t *A, const int16_t *B, int16_t *C, int m, int n, int k, int shift);
+esp_err_t dspm_mult_s16_arp4(const int16_t *A, const int16_t *B, int16_t *C, int m, int n, int k, int shift);
+/**@}*/
+
+/**@{*/
+/**
+ * @brief   Matrix subset multiplication
+ *
+ * One or all of the matrices are matrix subsets, described with pointers and strides
+ * Matrix multiplication for two floating point matrices: C[m][k] = A[m][n] * B[n][k]
+ * The extension (_ansi) use ANSI C and could be compiled and run on any platform.
+ * The extension (_ae32) is optimized for ESP32 chip.
+ *
+ * @param[in]  A  input matrix A[m][n]
+ * @param[in]  B  input matrix B[n][k]
+ * @param[out] C  result matrix C[m][k]
+ * @param[in]  m  matrix dimension
+ * @param[in]  n  matrix dimension
+ * @param[in]  k  matrix dimension
+ * @param[in]  A_padd  input matrix A padding
+ * @param[in]  B_padd  input matrix B padding
+ * @param[in]  C_padd  result matrix C padding
+ * @return
+ *      - ESP_OK on success
+ *      - One of the error codes from DSP library
+ */
+esp_err_t dspm_mult_ex_f32_ansi(const float *A, const float *B, float *C, int m, int n, int k, int A_padd, int B_padd, int C_padd);
+esp_err_t dspm_mult_ex_f32_ae32(const float *A, const float *B, float *C, int m, int n, int k, int A_padd, int B_padd, int C_padd);
+esp_err_t dspm_mult_ex_f32_aes3(const float *A, const float *B, float *C, int m, int n, int k, int A_padd, int B_padd, int C_padd);
+esp_err_t dspm_mult_ex_f32_arp4(const float *A, const float *B, float *C, int m, int n, int k, int A_padd, int B_padd, int C_padd);
+
+#ifdef __cplusplus
+}
+#endif
+
+// Map the generic names (dspm_mult_f32, dspm_mult_s16, ...) onto the best
+// implementation enabled for the target. The *_enabled switches come from
+// dspm_mult_platform.h; an undefined switch evaluates to 0 in #if.
+// All function-like macros expand to a single expression without a trailing
+// semicolon, so they can be used inside expressions and unbraced if/else.
+#if CONFIG_DSP_OPTIMIZED
+
+
+// 16-bit fixed point multiplication
+#if (dspm_mult_s16_aes3_enabled == 1)
+#define dspm_mult_s16 dspm_mult_s16_aes3
+#elif (dspm_mult_s16_ae32_enabled == 1)
+#define dspm_mult_s16 dspm_mult_s16_ae32
+#elif (dspm_mult_s16_arp4_enabled == 1)
+#define dspm_mult_s16 dspm_mult_s16_arp4
+#else
+#define dspm_mult_s16 dspm_mult_s16_ansi
+#endif
+
+// Floating point multiplication, plain and padded (sub-matrix) variants
+#if (dspm_mult_f32_aes3_enabled == 1)
+#define dspm_mult_f32 dspm_mult_f32_aes3
+#define dspm_mult_ex_f32 dspm_mult_ex_f32_aes3
+#elif (dspm_mult_f32_ae32_enabled == 1)
+#define dspm_mult_f32 dspm_mult_f32_ae32
+#define dspm_mult_ex_f32 dspm_mult_ex_f32_ae32
+#elif (dspm_mult_f32_arp4_enabled == 1)
+#define dspm_mult_f32 dspm_mult_f32_arp4
+#define dspm_mult_ex_f32 dspm_mult_ex_f32_arp4
+#else
+#define dspm_mult_f32 dspm_mult_f32_ansi
+#define dspm_mult_ex_f32 dspm_mult_ex_f32_ansi
+#endif
+
+// Fixed-size helpers: use the dedicated routine when enabled, otherwise the generic multiply
+#if (dspm_mult_3x3x1_f32_ae32_enabled == 1)
+#define dspm_mult_3x3x1_f32 dspm_mult_3x3x1_f32_ae32
+#else
+#define dspm_mult_3x3x1_f32(A,B,C) dspm_mult_f32(A,B,C, 3, 3, 1)
+#endif
+#if (dspm_mult_3x3x3_f32_ae32_enabled == 1)
+#define dspm_mult_3x3x3_f32(A,B,C) dspm_mult_3x3x3_f32_ae32(A,B,C)
+#else
+#define dspm_mult_3x3x3_f32(A,B,C) dspm_mult_f32(A,B,C,3,3,3)
+#endif
+#if (dspm_mult_4x4x1_f32_ae32_enabled == 1)
+#define dspm_mult_4x4x1_f32(A,B,C) dspm_mult_4x4x1_f32_ae32(A,B,C)
+#else
+#define dspm_mult_4x4x1_f32(A,B,C) dspm_mult_f32(A,B,C, 4, 4, 1)
+#endif
+
+// With the ESP32-S3 implementation enabled, the generic S3 multiply is used for 4x4x4
+#if (dspm_mult_f32_aes3_enabled == 1)
+#define dspm_mult_4x4x4_f32(A,B,C) dspm_mult_f32_aes3(A,B,C, 4, 4, 4)
+#elif (dspm_mult_4x4x4_f32_ae32_enabled == 1)
+#define dspm_mult_4x4x4_f32 dspm_mult_4x4x4_f32_ae32
+#else
+#define dspm_mult_4x4x4_f32(A,B,C) dspm_mult_f32(A,B,C, 4, 4, 4)
+#endif
+
+#else
+// Optimizations disabled: everything maps to the ANSI C implementations
+#define dspm_mult_s16 dspm_mult_s16_ansi
+#define dspm_mult_f32 dspm_mult_f32_ansi
+#define dspm_mult_3x3x1_f32(A,B,C) dspm_mult_f32(A,B,C, 3, 3, 1)
+#define dsps_sub_f32 dsps_sub_f32_ansi
+#define dsps_add_f32 dsps_add_f32_ansi
+#define dspm_mult_4x4x4_f32(A,B,C) dspm_mult_f32(A,B,C, 4, 4, 4)
+#define dspm_mult_ex_f32 dspm_mult_ex_f32_ansi
+#define dspm_mult_3x3x3_f32(A,B,C) dspm_mult_f32(A,B,C,3,3,3)
+#define dspm_mult_4x4x1_f32(A,B,C) dspm_mult_f32(A,B,C, 4, 4, 1)
+#endif // CONFIG_DSP_OPTIMIZED
+
+
+#endif // _dspm_mult_H_
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/include/dspm_mult_platform.h
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/include/dspm_mult_platform.h
@@ -0,0 +1,44 @@
+// Platform feature switches for the matrix-multiplication module.
+// Each dspm_mult_*_enabled macro defined here is tested elsewhere with
+// `== 1` to decide whether an optimized implementation is used.
+#ifndef _dspm_mult_platform_H_
+#define _dspm_mult_platform_H_
+
+#include "sdkconfig.h"
+
+#ifdef __XTENSA__
+#include <xtensa/config/core-isa.h>
+#include <xtensa/config/core-matmap.h>
+
+
+// Float kernels require both the FP option and the loop option of the core.
+#if ((XCHAL_HAVE_FP == 1) && (XCHAL_HAVE_LOOPS == 1))
+
+#define dspm_mult_f32_ae32_enabled 1
+#define dspm_mult_3x3x1_f32_ae32_enabled 1
+#define dspm_mult_3x3x3_f32_ae32_enabled 1
+#define dspm_mult_4x4x1_f32_ae32_enabled 1
+#define dspm_mult_4x4x4_f32_ae32_enabled 1
+
+#endif
+
+// The 16-bit fixed-point kernel requires the loop option and the MAC16 option.
+#if ((XCHAL_HAVE_LOOPS == 1) && (XCHAL_HAVE_MAC16 == 1))
+
+#define dspm_mult_s16_ae32_enabled 1
+
+#endif
+#endif // __XTENSA__
+
+// ESP32-S3: the aes3 flags are set regardless of CONFIG_DSP_OPTIMIZED.
+#if CONFIG_IDF_TARGET_ESP32S3
+#define dspm_mult_f32_aes3_enabled 1
+#define dspm_mult_s16_aes3_enabled 1
+#endif
+
+// ESP32-P4: the arp4 flags are always defined on this target, as 1 or 0
+// depending on CONFIG_DSP_OPTIMIZED.
+#if CONFIG_IDF_TARGET_ESP32P4
+#ifdef CONFIG_DSP_OPTIMIZED
+#define dspm_mult_f32_arp4_enabled 1
+#define dspm_mult_s16_arp4_enabled 1
+#else
+#define dspm_mult_f32_arp4_enabled 0
+#define dspm_mult_s16_arp4_enabled 0
+#endif // CONFIG_DSP_OPTIMIZED
+
+#endif
+
+#endif // _dspm_mult_platform_H_
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/test/include/test_mat_common.h
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/test/include/test_mat_common.h
@@ -0,0 +1,84 @@
+/*
+ * SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef _test_mat_common_H_
+#define _test_mat_common_H_
+
+#include "dspm_mult.h"
+#include "dsp_err.h"
+#include "dspm_mult_platform.h"
+#include "esp_dsp.h"
+#include "dsp_platform.h"
+
+// NOTE(review): the prototypes below take dspm::Mat references, so this header
+// can only be consumed from C++ even though it carries an extern "C" guard.
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * @brief parameter set describing one sub-matrix test case
+ *
+ * NOTE(review): the field meanings below are inferred from the field names
+ * only; confirm them against the tests that fill this structure.
+ */
+typedef struct m_test_data_s {
+    int var;            // presumably the test variant selector - TODO confirm
+    int A_start_row;    // presumably start row of the sub-matrix taken from A
+    int A_start_col;    // presumably start column of the sub-matrix taken from A
+    int B_start_row;    // presumably start row of the sub-matrix taken from B
+    int B_start_col;    // presumably start column of the sub-matrix taken from B
+    int C_start_row;    // presumably start row of the sub-matrix taken from C
+    int C_start_col;    // presumably start column of the sub-matrix taken from C
+    int m;              // presumably a matrix dimension (m x n times n x k) - TODO confirm
+    int n;
+    int k;
+} m_test_data_t;
+
+/**
+ * @brief check whether 2 matrices are equal
+ *
+ * test evaluation in the test app for matrices check
+ * compare 2 matrices
+ *
+ * @param[in] m_expected: reference matrix
+ * @param[in] m_actual: matrix to be evaluated
+ * @param[in] message: message for test app, in case the test fails
+ *
+ */
+void test_assert_equal_mat_mat(dspm::Mat &m_expected, dspm::Mat &m_actual, const char *message);
+
+/**
+ * @brief check whether a matrix is set to a constant
+ *
+ * test evaluation in the test app for matrices check
+ * compare matrix with constant
+ *
+ * @param[in] m_actual: matrix to be evaluated
+ * @param[in] num: reference constant
+ * @param[in] message: message for test app, if a test fails
+ *
+ */
+void test_assert_equal_mat_const(dspm::Mat &m_actual, float num, const char *message);
+
+/**
+ * @brief check if an area around a sub-matrix is unaffected
+ *
+ * test evaluation in the test app for matrices check
+ *
+ * @param[in] m_origin: original matrix
+ * @param[in] m_modified: sub-matrix, which is created from m_origin
+ * @param[in] start_row: sub-matrix start row
+ * @param[in] start_col: sub-matrix start col
+ * @param[in] message: message for test app, in case the test fails
+ *
+ */
+void test_assert_check_area_mat_mat(dspm::Mat &m_origin, dspm::Mat &m_modified, int start_row, int start_col, const char *message);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _test_mat_common_H_
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/test/test_mat_common.cpp
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/test/test_mat_common.cpp
@@ -0,0 +1,74 @@
+/*
+ * SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <string.h>
+#include "unity.h"
+#include "esp_log.h"
+
+#include "esp_attr.h"
+#include "dsp_tests.h"
+#include "test_mat_common.h"
+
+// Compare m_actual against m_expected element by element with Unity's float
+// assertion. Only the dimensions of m_expected bound the walk, so m_actual
+// must be at least as large as m_expected.
+void test_assert_equal_mat_mat(dspm::Mat &m_expected, dspm::Mat &m_actual, const char *message)
+{
+    int r = 0;
+    while (r < m_expected.rows) {
+        int c = 0;
+        while (c < m_expected.cols) {
+            TEST_ASSERT_EQUAL_FLOAT_MESSAGE(m_expected(r, c), m_actual(r, c), message);
+            c++;
+        }
+        r++;
+    }
+}
+
+// Assert that every element of m_actual equals num (Unity float comparison).
+void test_assert_equal_mat_const(dspm::Mat &m_actual, float num, const char *message)
+{
+    int r = 0;
+    while (r < m_actual.rows) {
+        int c = 0;
+        while (c < m_actual.cols) {
+            TEST_ASSERT_EQUAL_FLOAT_MESSAGE(num, m_actual(r, c), message);
+            c++;
+        }
+        r++;
+    }
+}
+
+// Verify that everything outside a sub-matrix is identical in two buffers.
+//
+// m_origin is walked from its first element. m_modified.data is first moved
+// back by the sub-matrix offset (start_row * m_origin.cols + start_col), so
+// both cursors advance over the same layout. Elements before the sub-matrix,
+// in the padding between its rows, and after its last row are compared; the
+// sub-matrix elements themselves are skipped.
+void test_assert_check_area_mat_mat(dspm::Mat &m_origin, dspm::Mat &m_modified, int start_row, int start_col, const char *message)
+{
+    // elements that precede the sub-matrix
+    const int lead_count = (start_row * m_origin.cols) + start_col;
+    // elements that follow the last sub-matrix row
+    const int tail_count = m_origin.length - m_modified.length - lead_count - ((m_modified.rows - 1) * m_modified.padding);
+
+    float *origin_cursor = m_origin.data;
+    float *modified_cursor = m_modified.data - lead_count;
+
+    // area in front of the sub-matrix
+    for (int remaining = lead_count; remaining > 0; remaining--) {
+        TEST_ASSERT_EQUAL_FLOAT_MESSAGE(*origin_cursor, *modified_cursor, message);
+        origin_cursor++;
+        modified_cursor++;
+    }
+
+    // sub-matrix rows and the padding between them
+    const int last_row = m_modified.rows - 1;
+    for (int row = 0; row <= last_row; row++) {
+        // step over the accessed sub-matrix elements without comparing them
+        if (m_modified.cols > 0) {
+            origin_cursor += m_modified.cols;
+            modified_cursor += m_modified.cols;
+        }
+
+        // no padding is checked after the last row; the tail loop covers it
+        if (row == last_row) {
+            continue;
+        }
+        for (int remaining = m_modified.padding; remaining > 0; remaining--) {
+            TEST_ASSERT_EQUAL_FLOAT_MESSAGE(*origin_cursor, *modified_cursor, message);
+            origin_cursor++;
+            modified_cursor++;
+        }
+    }
+
+    // area behind the sub-matrix
+    for (int remaining = tail_count; remaining > 0; remaining--) {
+        TEST_ASSERT_EQUAL_FLOAT_MESSAGE(*origin_cursor, *modified_cursor, message);
+        origin_cursor++;
+        modified_cursor++;
+    }
+}
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/test/test_mat_f32.cpp
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/test/test_mat_f32.cpp
@@ -0,0 +1,270 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "esp_dsp.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+
+#include "dspm_mult.h"
+#include "esp_attr.h"
+#include "dsp_tests.h"
+#include "mat.h"
+
+// Log tag passed to ESP_LOGI / ESP_LOGE by the test cases in this file
+static const char *TAG = "dspm_Mat";
+
+// Smoke test: construct a 3x3 matrix, then print its dimensions and contents.
+TEST_CASE("Mat class ", "[dspm]")
+{
+    const int row_count = 3;
+    const int col_count = 3;
+
+    dspm::Mat mat(row_count, col_count);
+
+    std::cout << "Test matrix: rows: " << mat.rows << ", columns: " << mat.cols << std::endl;
+    std::cout << mat;
+}
+
+// Solve one fixed 3x3 system with Mat::solve() and with Mat::roots(), print
+// both results, and require them to agree element-wise within 1e-6.
+TEST_CASE("Mat class check solve ", "[dspm]")
+{
+    const int dim = 3;
+    float data_a[9] = {3, 2, 1, 2, 3, 1, 2, 1, 3};
+    float data_b[9] = {5, -1, 4};   // only the first `dim` values are used
+    dspm::Mat A(data_a, dim, dim);
+    dspm::Mat b(data_b, dim, 1);
+
+    dspm::Mat x1 = dspm::Mat::solve(A, b);
+    std::cout << "Solve result matrix: rows: " << x1.rows << ", columns: " << x1.cols << std::endl;
+    std::cout << (x1 * 12).t();
+
+    dspm::Mat x2 = dspm::Mat::roots(A, b);
+    std::cout << "Roots result matrix: rows: " << x2.rows << ", columns: " << x2.cols << std::endl;
+    std::cout << (x2 * 12).t();
+
+    dspm::Mat diff_b = x1 - x2;
+    std::cout << "Difference between solve() abd roots(): " << diff_b.t();
+
+    for (int row = 0 ; row < diff_b.rows; row++) {
+        for (int col = 0 ; col < diff_b.cols ; col++) {
+            const bool within_tolerance = !(fabs(diff_b(row, col)) > 0.000001);
+            TEST_ASSERT_MESSAGE (within_tolerance, "Calculation is incorrect! Error more then expected!");
+        }
+    }
+}
+
+// Build a 4x4 system A*x = b from known A and x, solve it back with
+// Mat::solve() / Mat::roots(), log all intermediate matrices, and require
+// A * solve(A, b) to reproduce b within 1e-4.
+TEST_CASE("Mat class basic operations", "[dspm]")
+{
+    const int row_count = 4;
+    const int col_count = 4;
+
+    dspm::Mat A(row_count, col_count);
+    dspm::Mat x(col_count, 1);
+    for (int row = 0 ; row < row_count ; row++) {
+        for (int col = 0 ; col < col_count ; col++) {
+            A(row, col) = col_count * (row + 1) + (col + 1);
+        }
+        x(row, 0) = row + 2;
+    }
+
+    // overwrite the first two entries of the first row
+    A(0, 0) = 10;
+    A(0, 1) = 11;
+
+    dspm::Mat b = A * x;
+    dspm::Mat x1_ = dspm::Mat::solve(A, b);
+    dspm::Mat x2_ = dspm::Mat::roots(A, b);
+
+    ESP_LOGI(TAG, "Matrix A:");
+    std::cout << A;
+    ESP_LOGI(TAG, "Matrix x.t():");
+    std::cout << x.t();
+    ESP_LOGI(TAG, "Matrix b.t():");
+    std::cout << b.t();
+    ESP_LOGI(TAG, "Solve result:");
+    std::cout << x1_.t();
+    ESP_LOGI(TAG, "Roots result:");
+    std::cout << x2_.t();
+
+    dspm::Mat check_b = A * x1_;
+    ESP_LOGI(TAG, "Result b.t():");
+    std::cout << check_b.t();
+
+    dspm::Mat diff_b = check_b - b;
+    ESP_LOGI(TAG, "Difference:");
+    std::cout << diff_b.t();
+
+    for (int row = 0 ; row < diff_b.rows; row++) {
+        for (int col = 0 ; col < diff_b.cols ; col++) {
+            const float error = fabs(diff_b(row, col));
+            if (error > 0.0001) {
+                ESP_LOGE(TAG, "Solve calculation error: %f", error);
+                TEST_ASSERT_MESSAGE (false, "Calculation is incorrect! Error more then expected!");
+            }
+        }
+    }
+}
+
+// Exercises the Mat operators (+, -, *, *=, ==, scalar * + - /), block(),
+// normalize(), inverse() and pinv() on small matrices with known results.
+// The former `check_array` heap buffer was never read or written and leaked
+// whenever an assertion aborted the test, so it has been removed.
+TEST_CASE("Mat class operators", "[dspm]")
+{
+    int M = 4;
+    int N = 4;
+
+    dspm::Mat test1(M, N);
+    dspm::Mat test2(M, N);
+    dspm::Mat result(M, N);
+    // test1 = 2 * index, test2 = index, result = 0 (index = m * N + n)
+    for (int m = 0 ; m < M ; m++) {
+        for (int n = 0 ; n < N ; n++) {
+            test1(m, n) = (m * N + n) * 2;
+            test2(m, n) = m * N + n;
+            result(m, n) = 0;
+        }
+    }
+
+    // Check + operator: expected 3 * index, via operator() and via raw data
+    result = test1 + test2;
+    for (int m = 0 ; m < M ; m++) {
+        for (int n = 0 ; n < N ; n++) {
+            if ((result(m, n) != (test1(m, n) + test2(m, n))) ||
+                    (result(m, n) != 3 * (m * N + n)) ||
+                    (result.data[m * N + n] != 3 * (m * N + n))) {
+                TEST_ASSERT_MESSAGE (false, "Error in + operator!");
+            }
+        }
+    }
+    // Check - operator: expected index
+    result = test1 - test2;
+    for (int m = 0 ; m < M ; m++) {
+        for (int n = 0 ; n < N ; n++) {
+            if ((result(m, n) != (test1(m, n) - test2(m, n))) ||
+                    (result(m, n) != (m * N + n)) ||
+                    (result.data[m * N + n] != (m * N + n))) {
+                TEST_ASSERT_MESSAGE (false, "Error in - operator!");
+            }
+        }
+    }
+    // Check * operator (result = A*B;)
+    // result = I*test2
+    // result == test2
+    test1 = test1.eye(test1.rows);
+    result = test1 * test2;
+    dspm::Mat result2 = test1;
+    result2 *= test2;
+
+    for (int m = 0 ; m < M ; m++) {
+        for (int n = 0 ; n < N ; n++) {
+            // if (result(m,n) < 0.000000001)
+            // {
+            //     result(m,n) = 0;
+            // }
+            if ((result(m, n) != test2(m, n)) ||
+                    (result(m, n) != (m * N + n)) ||
+                    (result.data[m * N + n] != (m * N + n))) {
+                std::cout << "Error: " << result(m, n) << "!=" << test2(m, n) << " , "
+                          << result(m, n) << "!=" << (m * N + n) << " , "
+                          << result.data[m * N + n] << "!=" << (m * N + n) << std::endl;
+                TEST_ASSERT_MESSAGE (false, "Error in * operator!");
+            }
+        }
+    }
+    // operator* and operator*= must produce the same matrix
+    if (!(result == result2)) {
+        std::cout << "result matrix: " << std::endl << result << std::endl;
+        std::cout << "result2 matrix: " << std::endl << result2 << std::endl;
+        TEST_ASSERT_MESSAGE (false, "Error in *= or in == operator!");
+    }
+    // Check * and + operator (result = A*const1 + const2;)
+    // The inverse operations (- const2, / const1) must restore test2.
+
+    test1 = test2;
+    float const1 = 2;
+    float const2 = 10;
+    result = test1 * const1 + const2;
+    result = (result - const2) / const1;
+    for (int m = 0 ; m < M ; m++) {
+        for (int n = 0 ; n < N ; n++) {
+            if ((result(m, n) != test2(m, n)) ||
+                    (result(m, n) != (m * N + n)) ||
+                    (result.data[m * N + n] != (m * N + n))
+               ) {
+                TEST_ASSERT_MESSAGE (false, "Error in + * const operator!");
+            }
+        }
+    }
+    // Test block(...): output is only printed, not asserted
+    int count = 0;
+    for (int m = 0 ; m < M ; m++) {
+        for (int n = 0 ; n < N ; n++) {
+            result(m, n) = count++;
+        }
+    }
+    std::cout << "Original matrix: " <<  std::endl;
+    std::cout << result << std::endl;
+    std::cout << "block: " << std::endl;
+    std::cout << result.block(1, 1, M - 1, N - 1) << std::endl;
+    // Test normalize(): a 2x2 matrix of ones must become all 0.5
+    result = dspm::Mat(2, 2);
+    for (int m = 0 ; m < result.rows ; m++) {
+        for (int n = 0 ; n < result.cols ; n++) {
+            result(m, n) = 1;
+        }
+    }
+    std::cout << "Befor normalize: " << std::endl;
+    std::cout << result << std::endl;
+    result.normalize();
+    std::cout << "normalize: " << std::endl;
+    std::cout << result << std::endl;
+
+    for (int m = 0 ; m < result.rows ; m++) {
+        for (int n = 0 ; n < result.cols ; n++) {
+            if (std::abs(result(m, n) - 0.5) > dspm::Mat::abs_tol) {
+                ESP_LOGE(TAG, "Error bigger then expected: %f", std::abs(result(m, n) - 0.5));
+                TEST_ASSERT_MESSAGE (false, "Error in normalize() operation! ");
+            }
+        }
+    }
+    // Test inverse(): m_result is the expected inverse of m_data
+    float m_data[] = {2, 5, 7,
+                      6, 3, 4,
+                      5, -2, -3
+                     };
+    float m_result[] = {  1.0000,   -1.0000,    1.0000,
+                          -38.0000,   41.0000,  -34.0000,
+                          27.0000,  -29.0000,   24.0000
+                       };
+    result = dspm::Mat(m_data, 3, 3);
+    result = result.inverse();
+    std::cout << "inverse: " << std::endl;
+    std::cout << result << std::endl;
+    for (int i = 0 ; i < 3 * 3 ; i++) {
+        if (std::abs(result.data[i] - m_result[i]) > 1e-4) {
+            printf("Error at[%i] = %f, expected= %f, calculated = %f \n", i, std::abs(result.data[i] - m_result[i]), m_result[i], result.data[i]);
+            TEST_ASSERT_MESSAGE (false, "Error in inverse() operation!\n");
+        }
+    }
+
+    // Test pinv(): same expected values, checked with a looser 1e-2 tolerance
+    result = dspm::Mat(m_data, 3, 3);
+    result = result.pinv();
+    std::cout << "pinv: " << std::endl;
+    std::cout << result << std::endl;
+    for (int i = 0 ; i < 3 * 3 ; i++) {
+        if (std::abs(result.data[i] - m_result[i]) > 1e-2) {
+            printf("Error at[%i] = %f, expected= %f, calculated = %f \n", i, std::abs(result.data[i] - m_result[i]), m_result[i], result.data[i]);
+            TEST_ASSERT_MESSAGE (false, "Error in pinv() operation!\n");
+        }
+    }
+}
+
+// The determinant of an identity matrix must be 1 for sizes 3..9.
+TEST_CASE("mat.cpp functionality", "[dsps]")
+{
+    int max_size = 10;
+    for (int i = 3 ; i < max_size ; i++) {
+        dspm::Mat A = dspm::Mat::eye(i);
+        float det = A.det(i);
+        printf("Det[%i] = %f\n", i, det);
+        // Compare as float, expected value first. The integer
+        // TEST_ASSERT_EQUAL truncated det, so 1.9 passed and 0.9999 failed.
+        TEST_ASSERT_EQUAL_FLOAT(1, det);
+    }
+}
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/test/test_mat_sub_f32.cpp
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/test/test_mat_sub_f32.cpp
@@ -0,0 +1,917 @@
+/*
+ * SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <string.h>
+#include <malloc.h>
+#include "unity.h"
+#include "esp_dsp.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+
+#include "dspm_mult.h"
+#include "esp_attr.h"
+#include "dsp_tests.h"
+#include "mat.h"
+#include "test_mat_common.h"
+
+// Passed as the tag argument of the TEST_CASE declarations in this file
+static const char *TAG = "[dspm]";
+
+#define MAT_ROW 6       // test_matrix rows
+#define MAT_COL 6       // test_matrix cols
+#define ROI_ROW 4       // sub_matrix rows
+#define ROI_COL 4       // sub_matrix cols
+#define START_ROI 1     // start row/col dimension to create sub matrix from test matrix
+
+// Shared region of interest used by the operator tests below.
+// NOTE(review): the matrix-subset test constructs Rect as (x, y, width, height),
+// so ROI_ROW lands in the width slot here; harmless while the ROI is square.
+dspm::Mat::Rect roi_rect(START_ROI, START_ROI, ROI_ROW, ROI_COL);
+
+// matrix subset
+// Creates sub-matrices of a 5x5 matrix in three ways (row/col arguments,
+// Rect, explicit stride) and checks each shallow getROI() view against a deep
+// Get() copy of the same area.
+TEST_CASE("Mat class matrix subset", TAG)
+{
+    float data[25] = {0, 1, 2, 3, 4,
+                      5, 6, 7, 8, 9,
+                      0, 1, 2, 3, 4,
+                      5, 6, 7, 8, 9,
+                      0, 1, 2, 3, 4
+                     };
+
+    // Test matrix dimensions
+    const int m = 5;
+    const int n = 5;
+
+    dspm::Mat mat(data, m, n);
+    std::cout << "Test matrix: rows: " << mat.rows << ", columns: " << mat.cols << std::endl;
+    std::cout << mat << std::endl;
+
+    // Sub matrix method 1 - sub-matrix dimensions
+    int start_row = 1;
+    int start_col = 1;
+    int roi_rows = 4;
+    int roi_cols = 3;
+
+    // Create matrix subset as a shallow copy of mat matrix (no matrix data are copied)
+    dspm::Mat mat_subset1 = mat.getROI(start_row, start_col, roi_rows, roi_cols);
+
+    // Create matrix subset as a deep copy of mat matrix (matrix data are copied)
+    dspm::Mat mat_subset1_check = mat.Get(start_row, roi_rows, start_col, roi_cols);
+
+    std::cout << "Matrix subset, method 1: rows: " << mat_subset1.rows << ", columns: " << mat_subset1.cols << std::endl;
+    std::cout << mat_subset1 << std::endl;
+
+    // Compare the deep and the shallow copies
+    test_assert_equal_mat_mat(mat_subset1_check, mat_subset1, "matrix subset 1");
+
+    // Sub matrix method 2 - sub-matrix dimensions as a matrix rectangle
+    int x = 1;
+    int y = 1;
+    int width = 4;
+    int height = 3;
+
+    // Create matrix ROI as a rectangle area
+    dspm::Mat::Rect roi_rect(x, y, width, height);
+    dspm::Mat mat_subset2 = mat.getROI(roi_rect);
+    std::cout << "Matrix subset method 2: rows: " << mat_subset2.rows << ", columns: " << mat_subset2.cols << std::endl;
+    std::cout << mat_subset2 << std::endl;
+    dspm::Mat mat_subset2_check = mat.Get(roi_rect);
+
+    test_assert_equal_mat_mat(mat_subset2_check, mat_subset2, "matrix subset 2");
+
+    // Sub matrix method 3 - sub-matrix dimensions with specified stride
+    start_row = 0;
+    start_col = 1;
+    roi_rows = 3;
+    roi_cols = 3;
+    int stride = 10;
+
+    dspm::Mat mat_subset3 = mat.getROI(start_row, start_col, roi_rows, roi_cols, stride);
+    // Report the dimensions of mat_subset3 itself (the row count used to be
+    // read from mat_subset1 by mistake)
+    std::cout << "Matrix subset method 3: rows: " << mat_subset3.rows << ", columns: " << mat_subset3.cols << std::endl;
+    std::cout << mat_subset3 << std::endl;
+    dspm::Mat mat_subset3_check = mat.Get(start_row, 5, start_col, roi_cols);
+
+    // The check below expects ROI row r to match source row 2 * r, i.e. the
+    // stride of 10 on a 5-column matrix selects every second source row.
+    for (int row = 0; row < mat_subset3_check.rows; row++) {
+        if (row % 2) {
+            continue;
+        }
+        for (int col =  0; col < mat_subset3_check.cols; col++) {
+            TEST_ASSERT_EQUAL_FLOAT(mat_subset3_check(row, col), mat_subset3(row / 2, col));
+        }
+    }
+}
+
+// operator=
+// Checks assignment between regular matrices and sub-matrix views for equal
+// and unequal dimensions. getROI() results are used as views and Get()
+// results as independent copies that serve as references. The statement order
+// matters: references are captured before the assignments under test.
+static void test_mat_subset_operator_eq()
+{
+    dspm::Mat mat(2, 2);
+    for (int i = 0; i < mat.length; i++) {
+        mat.data[i] = 1;
+    }
+
+    dspm::Mat mat1(2, 2);
+    for (int i = 0; i < mat1.length; i++) {
+        mat1.data[i] = i + 1;
+    }
+
+    // matrices, dimensions are equal
+    // mat(2, 2), mat1(2, 2)
+    mat = mat1;
+    TEST_ASSERT_EQUAL_INT(2, mat.rows);
+    TEST_ASSERT_EQUAL_INT(mat1.rows, mat.rows);
+    TEST_ASSERT_EQUAL_INT(2, mat.cols);
+    TEST_ASSERT_EQUAL_INT(mat1.cols, mat.cols);
+    test_assert_equal_mat_mat(mat1, mat, "=operator, mat = mat (equal dim)");
+
+    dspm::Mat mat2(3, 3);
+    for (int i = 0; i < mat2.length; i++) {
+        mat2.data[i] = (i + 1) * 2;
+    }
+
+    // matrices, dimensions are not equal
+    // mat1(2, 2), mat2(3, 3)
+    // expected: the destination takes over the source dimensions
+    mat1 = mat2;
+    TEST_ASSERT_EQUAL_INT(3, mat1.rows);
+    TEST_ASSERT_EQUAL_INT(mat2.rows, mat1.rows);
+    TEST_ASSERT_EQUAL_INT(3, mat1.cols);
+    TEST_ASSERT_EQUAL_INT(mat2.cols, mat1.cols);
+    test_assert_equal_mat_mat(mat2, mat1, "=operator, mat = mat (not equal dim)");
+
+    // mat4_compare holds the same values as mat4 and is never assigned to; it
+    // is the reference for the area checks on mat4's sub-matrix below
+    dspm::Mat mat3(4, 4);
+    dspm::Mat mat4(4, 4);
+    dspm::Mat mat4_compare(4, 4);
+    for (int i = 0; i < mat3.length; i++) {
+        mat3.data[i] = (i + 1) * 3;
+        mat4.data[i] = (i + 1) * 4;
+        mat4_compare.data[i] = (i + 1) * 4;
+    }
+    dspm::Mat mat3_sub_3x3 = mat3.getROI(1, 1, 3, 3);
+    dspm::Mat mat3_sub_2x2 = mat3.getROI(1, 1, 2, 2);
+    dspm::Mat mat3_mat_2x2 = mat3.Get(1, 2, 1, 2);
+
+    // matrix and sub-matrix, dimensions are equal
+    // mat1(3, 3), mat3_sub_3x3(3, 3)
+    // expected: the destination stays a regular matrix (sub_matrix flag false)
+    mat1 = mat3_sub_3x3;
+    TEST_ASSERT_FALSE(mat1.sub_matrix);
+    TEST_ASSERT_EQUAL_INT(3, mat1.rows);
+    TEST_ASSERT_EQUAL_INT(mat3_sub_3x3.rows, mat1.rows);
+    TEST_ASSERT_EQUAL_INT(3, mat1.cols);
+    TEST_ASSERT_EQUAL_INT(mat3_sub_3x3.cols, mat1.cols);
+    test_assert_equal_mat_mat(mat3_sub_3x3, mat1, "=operator, mat = sub_mat (equal dim)");
+
+    // mat4_mat_2x2 is taken before any assignment to mat4_sub_2x2, so it keeps
+    // the original contents of that area
+    dspm::Mat mat4_sub_2x2 = mat4.getROI(1, 1, 2, 2);
+    dspm::Mat mat4_mat_2x2 = mat4.Get(1, 2, 1, 2);
+
+    // matrix and sub-matrix, dimensions are not equal
+    // mat1(3, 3), mat4_sub_2x2(2, 2)
+    mat1 = mat4_sub_2x2;
+    TEST_ASSERT_FALSE(mat1.sub_matrix);
+    TEST_ASSERT_EQUAL_INT(2, mat1.rows);
+    TEST_ASSERT_EQUAL_INT(mat4_sub_2x2.rows, mat1.rows);
+    TEST_ASSERT_EQUAL_INT(2, mat1.cols);
+    TEST_ASSERT_EQUAL_INT(mat4_sub_2x2.cols, mat1.cols);
+    test_assert_equal_mat_mat(mat4_sub_2x2, mat1, "=operator, mat = sub_mat (not equal dim)");
+
+    // sub-matrix and sub-matrix, dimensions are not equal
+    // mat4_sub_2x2(2, 2), mat3_sub_3x3(3, 3)
+    // expected: the assignment is rejected - the destination keeps its
+    // dimensions and contents, and the surrounding area of mat4 is untouched
+    ESP_LOGI("=operator test", "following is an expected error message about matrices not having equal dimensions");
+    mat4_sub_2x2 = mat3_sub_3x3;
+    TEST_ASSERT_TRUE(mat4_sub_2x2.sub_matrix);
+    TEST_ASSERT_EQUAL_INT(2, mat4_sub_2x2.rows);
+    TEST_ASSERT_EQUAL_INT(2, mat4_sub_2x2.cols);
+    test_assert_equal_mat_mat(mat4_mat_2x2, mat4_sub_2x2, "=operator, sub_mat = sub_mat (not equal dim)");
+    test_assert_check_area_mat_mat(mat4_compare, mat4_sub_2x2, 1, 1, "=operator area, sub_mat = sub_mat (not equal dim)");
+
+    // sub-matrix and sub-matrix, dimensions are equal
+    // mat4_sub_2x2(2, 2), mat3_sub_2x2(2, 2)
+    // expected: the destination stays a sub-matrix, receives the source
+    // values, and the surrounding area of mat4 is still untouched
+    mat4_sub_2x2 = mat3_sub_2x2;
+    TEST_ASSERT_TRUE(mat4_sub_2x2.sub_matrix);
+    TEST_ASSERT_EQUAL_INT(2, mat4_sub_2x2.rows);
+    TEST_ASSERT_EQUAL_INT(mat3_sub_2x2.rows, mat4_sub_2x2.rows);
+    TEST_ASSERT_EQUAL_INT(2, mat4_sub_2x2.cols);
+    TEST_ASSERT_EQUAL_INT(mat3_sub_2x2.cols, mat4_sub_2x2.cols);
+    test_assert_equal_mat_mat(mat3_mat_2x2, mat4_sub_2x2, "=operator, sub_mat = sub_mat (equal dim)");
+    test_assert_check_area_mat_mat(mat4_compare, mat4_sub_2x2, 1, 1, "=operator area, sub_mat = sub_mat (equal dim)");
+}
+
+// operator==
+// Two 6x6 matrices are filled so that B holds twice the values of A. The ROI
+// of B is then halved in place through its sub-matrix view, which makes the
+// ROI areas equal. Every matrix / sub-matrix pairing must compare equal;
+// differing content or dimensions must compare unequal.
+static void test_mat_subset_operator_eq_eq(void)
+{
+    dspm::Mat A(MAT_ROW, MAT_COL);
+    dspm::Mat B(MAT_ROW, MAT_COL);
+
+    for (int idx = 0; idx < A.length; idx++) {
+        A.data[idx] = idx;
+        B.data[idx] = idx * 2;
+    }
+
+    dspm::Mat A_sub = A.getROI(roi_rect);
+    dspm::Mat A_mat = A.Get(roi_rect);
+
+    dspm::Mat B_sub = B.getROI(roi_rect);
+
+    // halve B inside the ROI, writing through the view
+    for (int r = 0; r < B_sub.rows; r++) {
+        for (int c = 0; c < B_sub.cols; c++) {
+            B_sub(r, c) /= 2;
+        }
+    }
+
+    // taken after the in-place halving
+    dspm::Mat B_mat = B.Get(roi_rect);
+    dspm::Mat B_mat_neq_cont = B_mat * 3;
+    dspm::Mat B_mat_neq_dim(3, 3);
+
+    TEST_ASSERT_TRUE(A_mat == B_mat);
+    TEST_ASSERT_TRUE(A_sub == B_sub);
+    TEST_ASSERT_TRUE(A_sub == B_mat);
+    TEST_ASSERT_TRUE(A_mat == B_sub);
+
+    ESP_LOGI("==operator test", "following is an expected error message about matrices not having equal content");
+    TEST_ASSERT_FALSE(A_sub == B_mat_neq_cont);
+    TEST_ASSERT_FALSE(A_sub == B_mat_neq_dim);
+}
+
+// operator/
+// Element-wise matrix / matrix division for every combination of regular
+// matrices (Get() copies) and sub-matrix views (getROI()). C_sub is a view
+// into C, so C is re-assigned from mat before each compound assignment on
+// C_sub; C_compare_area is the reference for the area outside the ROI.
+static void test_mat_subset_operator_mat_div_mat(void)
+{
+    dspm::Mat mat(MAT_ROW, MAT_COL);
+    for (int i = 0; i < mat.length; i++) {
+        mat.data[i] = i;
+    }
+
+    dspm::Mat C = mat;
+    dspm::Mat C_compare_area = mat;
+
+    dspm::Mat A_sub = mat.getROI(roi_rect);
+    dspm::Mat A_mat = mat.Get(roi_rect);
+
+    dspm::Mat B_sub = mat.getROI(roi_rect);
+    dspm::Mat B_mat = mat.Get(roi_rect);
+
+    dspm::Mat C_sub = C.getROI(roi_rect);
+    dspm::Mat C_mat = C.Get(roi_rect);
+    dspm::Mat C_compare(ROI_ROW, ROI_COL);
+
+    // expected result, computed element by element on the deep copies
+    for (int i = 0; i < C_compare.length; i++) {
+        C_compare.data[i] = A_mat.data[i] / B_mat.data[i];
+    }
+
+    C_mat = A_mat / B_mat;
+    test_assert_equal_mat_mat(C_compare, C_mat, "/ operator, mat = mat / mat");
+
+    C_mat = A_sub / B_sub;
+    test_assert_equal_mat_mat(C_compare, C_mat, "/ operator, mat = sub_mat / sub_mat");
+
+    C_mat = A_sub / B_mat;
+    test_assert_equal_mat_mat(C_compare, C_mat, "/ operator, mat = sub_mat / mat");
+
+    C_mat = A_mat / B_sub;
+    test_assert_equal_mat_mat(C_compare, C_mat, "/ operator, mat = mat / sub_mat");
+
+    C_sub = A_sub / B_sub;
+    test_assert_equal_mat_mat(C_compare, C_sub, "/ operator, sub_mat = sub_mat / sub_mat");
+    test_assert_check_area_mat_mat(C_compare_area, C_sub, START_ROI, START_ROI,  "/ operator, area check, sub_mat = sub_mat / sub_mat");
+
+    C_sub = A_mat / B_sub;
+    test_assert_equal_mat_mat(C_compare, C_sub, "/ operator, sub_mat = mat / sub_mat");
+    // message names the operation actually under test (was a copy of the previous case)
+    test_assert_check_area_mat_mat(C_compare_area, C_sub, START_ROI, START_ROI,  "/ operator, area check, sub_mat = mat / sub_mat");
+
+    C = mat;
+    C_mat = C.Get(roi_rect);    // C_mat must be refreshed
+    C_mat /= A_mat;
+    test_assert_equal_mat_mat(C_compare, C_mat, "/ operator, mat /= mat");
+
+    C = mat;
+    C_mat = C.Get(roi_rect);    // C_mat must be refreshed
+    C_mat /= A_sub;
+    test_assert_equal_mat_mat(C_compare, C_mat, "/ operator, mat /= sub_mat");
+
+    C = mat;                    // C must be refreshed, to refresh the C_sub
+    C_sub /= A_mat;
+    test_assert_equal_mat_mat(C_compare, C_sub, "/ operator, sub_mat /= mat");
+    test_assert_check_area_mat_mat(C_compare_area, C_sub, START_ROI, START_ROI,  "/ operator, area check, sub_mat /= mat");
+
+    C = mat;                    // C must be refreshed, to refresh the C_sub
+    C_sub /= A_sub;
+    test_assert_equal_mat_mat(C_compare, C_sub, "/ operator, sub_mat /= sub_mat");
+    test_assert_check_area_mat_mat(C_compare_area, C_sub, START_ROI, START_ROI,  "/ operator, area check, sub_mat /= sub_mat");
+}
+
+// operator^
+// Applies operator^ with the integer operands 0, 1, 2 and 3 to a deep copy
+// and to a shallow view of the same 3x3 area of a 5x5 matrix. Both results
+// must match, and the parent matrix must stay unchanged.
+// NOTE(review): the original comments call these cases "XOR"; confirm the
+// actual semantics of Mat::operator^ in mat.h.
+static void test_mat_subset_operator_xor(void)
+{
+    static const char *const equal_msg[] = {
+        "sub-matrix operator^ 0",
+        "sub-matrix operator^ 1",
+        "sub-matrix operator^ 2",
+        "sub-matrix operator^ 3",
+    };
+    static const char *const area_msg[] = {
+        "sub-matrix area check operator^ 0",
+        "sub-matrix area check operator^ 1",
+        "sub-matrix area check operator^ 2",
+        "sub-matrix area check operator^ 3",
+    };
+
+    dspm::Mat mat(5, 5);
+    dspm::Mat mat_area_check(5, 5);
+    for (int idx = 0; idx < mat.length; idx++) {
+        const float value = idx;
+        mat.data[idx] = value;
+        mat_area_check.data[idx] = value;
+    }
+
+    dspm::Mat::Rect roi(1, 1, 3, 3);
+    dspm::Mat mat_mat = mat.Get(roi);
+    dspm::Mat mat_sub = mat.getROI(roi);
+
+    // operand 0: results are created by initialization
+    dspm::Mat res_mat = mat_mat ^ 0;
+    dspm::Mat res_sub = mat_sub ^ 0;
+    test_assert_equal_mat_mat(res_mat, res_sub, equal_msg[0]);
+    test_assert_check_area_mat_mat(mat_area_check, mat_sub, 1, 1, area_msg[0]);
+
+    // operands 1 (one), 2 (even) and 3 (odd): results are re-assigned
+    for (int operand = 1; operand <= 3; operand++) {
+        res_mat = mat_mat ^ operand;
+        res_sub = mat_sub ^ operand;
+        test_assert_equal_mat_mat(res_mat, res_sub, equal_msg[operand]);
+        test_assert_check_area_mat_mat(mat_area_check, mat_sub, 1, 1, area_msg[operand]);
+    }
+}
+
+// operator/
+// Matrix / constant division for regular matrices (Get() copies) and
+// sub-matrix views (getROI()). dst_view aliases `target`, so `target` is
+// restored from `source` before the compound assignment on the view.
+static void test_mat_subset_operator_mat_div_const(void)
+{
+    const float divisor = 2;
+
+    dspm::Mat source(MAT_ROW, MAT_COL);
+    for (int idx = 0; idx < source.length; idx++) {
+        source.data[idx] = idx;
+    }
+
+    dspm::Mat untouched_ref = source;   // reference for the area outside the ROI
+    dspm::Mat target = source;
+
+    dspm::Mat src_view = source.getROI(roi_rect);
+    dspm::Mat src_copy = source.Get(roi_rect);
+
+    dspm::Mat dst_view = target.getROI(roi_rect);
+    dspm::Mat dst_copy = target.Get(roi_rect);
+    dspm::Mat expected = source.Get(roi_rect);
+
+    // expected result, computed element by element
+    for (int idx = 0; idx < expected.length; idx++) {
+        expected.data[idx] /= divisor;
+    }
+
+    dst_copy = src_copy / divisor;
+    test_assert_equal_mat_mat(expected, dst_copy, "/ operator, mat = mat / const");
+
+    dst_copy = src_view / divisor;
+    test_assert_equal_mat_mat(expected, dst_copy, "/ operator, mat = sub_mat / const");
+
+    // reload the undivided ROI before the compound assignment
+    dst_copy = target.Get(roi_rect);
+    dst_copy /= divisor;
+    test_assert_equal_mat_mat(expected, dst_copy, "/ operator, mat /= const");
+
+    dst_view = src_copy / divisor;
+    test_assert_equal_mat_mat(expected, dst_view, "/ operator, sub_mat = mat / const");
+    test_assert_check_area_mat_mat(untouched_ref, dst_view, START_ROI, START_ROI,  "/ operator, area check, sub_mat = mat / const");
+
+    // restore the parent so the view holds undivided values again
+    target = source;
+    dst_view /= divisor;
+    test_assert_equal_mat_mat(expected, dst_view, "/ operator, sub_mat /= const");
+    test_assert_check_area_mat_mat(untouched_ref, dst_view, START_ROI, START_ROI,  "/ operator, area check, sub_mat /= const");
+}
+
+// operator-
+// Matrix - constant subtraction for regular matrices (Get() copies) and
+// sub-matrix views (getROI()). dst_view aliases `target`, so `target` is
+// restored from `source` before the compound assignment on the view.
+static void test_mat_subset_operator_mat_sub_const(void)
+{
+    const float subtrahend = 2;
+
+    dspm::Mat source(MAT_ROW, MAT_COL);
+    for (int idx = 0; idx < source.length; idx++) {
+        source.data[idx] = idx;
+    }
+
+    dspm::Mat untouched_ref = source;   // reference for the area outside the ROI
+    dspm::Mat target = source;
+
+    dspm::Mat src_view = source.getROI(roi_rect);
+    dspm::Mat src_copy = source.Get(roi_rect);
+
+    dspm::Mat dst_view = target.getROI(roi_rect);
+    dspm::Mat dst_copy = target.Get(roi_rect);
+    dspm::Mat expected = source.Get(roi_rect);
+
+    // expected result, computed element by element
+    for (int idx = 0; idx < expected.length; idx++) {
+        expected.data[idx] -= subtrahend;
+    }
+
+    dst_copy = src_copy - subtrahend;
+    test_assert_equal_mat_mat(expected, dst_copy, "- operator, mat = mat - const");
+
+    dst_copy = src_view - subtrahend;
+    test_assert_equal_mat_mat(expected, dst_copy, "- operator, mat = sub_mat - const");
+
+    // reload the unmodified ROI before the compound assignment
+    dst_copy = target.Get(roi_rect);
+    dst_copy -= subtrahend;
+    test_assert_equal_mat_mat(expected, dst_copy, "- operator, mat -= const");
+
+    dst_view = src_copy - subtrahend;
+    test_assert_equal_mat_mat(expected, dst_view, "- operator, sub_mat = mat - const");
+    test_assert_check_area_mat_mat(untouched_ref, dst_view, START_ROI, START_ROI,  "- operator, area check, sub_mat = mat - const");
+
+    // restore the parent so the view holds unmodified values again
+    target = source;
+    dst_view -= subtrahend;
+    test_assert_equal_mat_mat(expected, dst_view, "- operator, sub_mat -= const");
+    test_assert_check_area_mat_mat(untouched_ref, dst_view, START_ROI, START_ROI,  "- operator, area check, sub_mat -= const");
+}
+
+// operator-
+// Element-wise matrix - matrix subtraction for every combination of regular
+// matrices (Get() copies) and sub-matrix views (getROI()). C_sub is a view
+// into C, so C is re-assigned from mat before each compound assignment on
+// C_sub; C_compare_area is the reference for the area outside the ROI.
+static void test_mat_subset_operator_mat_sub_mat(void)
+{
+    dspm::Mat mat(MAT_ROW, MAT_COL);
+    for (int i = 0; i < mat.length; i++) {
+        mat.data[i] = i;
+    }
+
+    dspm::Mat C = mat;
+    dspm::Mat C_compare_area = mat;
+
+    dspm::Mat A_sub = mat.getROI(roi_rect);
+    dspm::Mat A_mat = mat.Get(roi_rect);
+
+    dspm::Mat B_sub = mat.getROI(roi_rect);
+    dspm::Mat B_mat = mat.Get(roi_rect);
+
+    dspm::Mat C_sub = C.getROI(roi_rect);
+    dspm::Mat C_mat = C.Get(roi_rect);
+    dspm::Mat C_compare(ROI_ROW, ROI_COL);
+
+    // expected result, computed element by element on the deep copies
+    for (int i = 0; i < C_compare.length; i++) {
+        C_compare.data[i] = A_mat.data[i] - B_mat.data[i];
+    }
+
+    C_mat = A_mat - B_mat;
+    test_assert_equal_mat_mat(C_compare, C_mat, "- operator, mat = mat - mat");
+
+    C_mat = A_sub - B_sub;
+    test_assert_equal_mat_mat(C_compare, C_mat, "- operator, mat = sub_mat - sub_mat");
+
+    C_mat = A_sub - B_mat;
+    test_assert_equal_mat_mat(C_compare, C_mat, "- operator, mat = sub_mat - mat");
+
+    C_mat = A_mat - B_sub;
+    test_assert_equal_mat_mat(C_compare, C_mat, "- operator, mat = mat - sub_mat");
+
+    C_sub = A_sub - B_sub;
+    test_assert_equal_mat_mat(C_compare, C_sub, "- operator, sub_mat = sub_mat - sub_mat");
+    test_assert_check_area_mat_mat(C_compare_area, C_sub, START_ROI, START_ROI,  "- operator, area check, sub_mat = sub_mat - sub_mat");
+
+    C_sub = A_mat - B_sub;
+    test_assert_equal_mat_mat(C_compare, C_sub, "- operator, sub_mat = mat - sub_mat");
+    // message names the operation actually under test (was a copy of the previous case)
+    test_assert_check_area_mat_mat(C_compare_area, C_sub, START_ROI, START_ROI,  "- operator, area check, sub_mat = mat - sub_mat");
+
+    C = mat;
+    C_mat = C.Get(roi_rect);    // C_mat must be refreshed
+    C_mat -= A_mat;
+    test_assert_equal_mat_mat(C_compare, C_mat, "- operator, mat -= mat");
+
+    C = mat;
+    C_mat = C.Get(roi_rect);    // C_mat must be refreshed
+    C_mat -= A_sub;
+    test_assert_equal_mat_mat(C_compare, C_mat, "- operator, mat -= sub_mat");
+
+    C = mat;                    // C must be refreshed, to refresh the C_sub
+    C_sub -= A_mat;
+    test_assert_equal_mat_mat(C_compare, C_sub, "- operator, sub_mat -= mat");
+    test_assert_check_area_mat_mat(C_compare_area, C_sub, START_ROI, START_ROI,  "- operator, area check, sub_mat -= mat");
+
+    C = mat;                    // C must be refreshed, to refresh the C_sub
+    C_sub -= A_sub;
+    test_assert_equal_mat_mat(C_compare, C_sub, "- operator, sub_mat -= sub_mat");
+    test_assert_check_area_mat_mat(C_compare_area, C_sub, START_ROI, START_ROI,  "- operator, area check, sub_mat -= sub_mat");
+}
+
+// operator+ / operator+= with a matrix or a sub-matrix on either side
+static void test_mat_subset_operator_mat_add_mat(void)
+{
+    // Source matrix filled with the running index 0, 1, 2, ...
+    dspm::Mat mat(MAT_ROW, MAT_COL);
+    for (int i = 0; i < mat.length; i++) {
+        mat.data[i] = i;
+    }
+
+    dspm::Mat C = mat;
+    dspm::Mat C_compare_area = mat;     // reference handed to every area check
+
+    // Operands: *_sub come from getROI() ("sub_mat" in the messages),
+    // *_mat come from Get() ("mat" in the messages)
+    dspm::Mat A_sub = mat.getROI(roi_rect);
+    dspm::Mat A_mat = mat.Get(roi_rect);
+
+    dspm::Mat B_sub = mat.getROI(roi_rect);
+    dspm::Mat B_mat = mat.Get(roi_rect);
+
+    dspm::Mat C_sub = C.getROI(roi_rect);
+    dspm::Mat C_mat = C.Get(roi_rect);
+    dspm::Mat C_compare(ROI_ROW, ROI_COL);
+
+    // Expected result, computed element by element
+    for (int i = 0; i < C_compare.length; i++) {
+        C_compare.data[i] = A_mat.data[i] + B_mat.data[i];
+    }
+
+    // Binary operator+, result stored in a matrix
+    C_mat = A_mat + B_mat;
+    test_assert_equal_mat_mat(C_compare, C_mat, "+ operator, mat = mat + mat");
+
+    C_mat = A_sub + B_sub;
+    test_assert_equal_mat_mat(C_compare, C_mat, "+ operator, mat = sub_mat + sub_mat");
+
+    C_mat = A_sub + B_mat;
+    test_assert_equal_mat_mat(C_compare, C_mat, "+ operator, mat = sub_mat + mat");
+
+    // Same operand combination the subtraction test covers
+    C_mat = A_mat + B_sub;
+    test_assert_equal_mat_mat(C_compare, C_mat, "+ operator, mat = mat + sub_mat");
+
+    // Binary operator+, result stored in a sub-matrix
+    C_sub = A_sub + B_sub;
+    test_assert_equal_mat_mat(C_compare, C_sub, "+ operator, sub_mat = sub_mat + sub_mat");
+    test_assert_check_area_mat_mat(C_compare_area, C_sub, START_ROI, START_ROI,  "+ operator, area check, sub_mat = sub_mat + sub_mat");
+
+    // The area-check message now names the operand combination actually tested
+    C_sub = A_mat + B_sub;
+    test_assert_equal_mat_mat(C_compare, C_sub, "+ operator, sub_mat = mat + sub_mat");
+    test_assert_check_area_mat_mat(C_compare_area, C_sub, START_ROI, START_ROI,  "+ operator, area check, sub_mat = mat + sub_mat");
+
+    // Compound operator+=, left operand is a matrix
+    C = mat;
+    C_mat = C.Get(roi_rect);    // C_mat must be refreshed
+    C_mat += A_mat;
+    test_assert_equal_mat_mat(C_compare, C_mat, "+ operator, mat += mat");
+
+    C = mat;
+    C_mat = C.Get(roi_rect);    // C_mat must be refreshed
+    C_mat += A_sub;
+    test_assert_equal_mat_mat(C_compare, C_mat, "+ operator, mat += sub_mat");
+
+    // Compound operator+=, left operand is a sub-matrix
+    C = mat;                    // C must be refreshed, to refresh the C_sub
+    C_sub += A_mat;
+    test_assert_equal_mat_mat(C_compare, C_sub, "+ operator, sub_mat += mat");
+    test_assert_check_area_mat_mat(C_compare_area, C_sub, START_ROI, START_ROI,  "+ operator, area check, sub_mat += mat");
+
+    C = mat;                    // C must be refreshed, to refresh the C_sub
+    C_sub += A_sub;
+    test_assert_equal_mat_mat(C_compare, C_sub, "+ operator, sub_mat += sub_mat");
+    test_assert_check_area_mat_mat(C_compare_area, C_sub, START_ROI, START_ROI,  "+ operator, area check, sub_mat += sub_mat");
+}
+
+// operator+ / operator+= with a scalar on the right-hand side
+static void test_mat_subset_operator_mat_add_const(void)
+{
+    const float add_const = 2;
+
+    // Source matrix filled with the running index 0, 1, 2, ...
+    dspm::Mat mat(MAT_ROW, MAT_COL);
+    for (int i = 0; i < mat.length; i++) {
+        mat.data[i] = i;
+    }
+
+    dspm::Mat C_compare_area = mat;     // reference handed to every area check
+    dspm::Mat C = mat;
+
+    // *_sub come from getROI() ("sub_mat" in the messages),
+    // *_mat come from Get() ("mat" in the messages)
+    dspm::Mat A_sub = mat.getROI(roi_rect);
+    dspm::Mat A_mat = mat.Get(roi_rect);
+
+    dspm::Mat C_sub = C.getROI(roi_rect);
+    dspm::Mat C_mat = C.Get(roi_rect);
+    dspm::Mat C_compare = mat.Get(roi_rect);
+
+    // Expected result: every ROI element increased by the constant
+    for (int i = 0; i < C_compare.length; i++) {
+        C_compare.data[i] += add_const;
+    }
+
+    // Same first case as the "- const" and "* const" tests
+    C_mat = A_mat + add_const;
+    test_assert_equal_mat_mat(C_compare, C_mat, "+ operator, mat = mat + const");
+
+    C_mat = A_sub + add_const;
+    test_assert_equal_mat_mat(C_compare, C_mat, "+ operator, mat = sub_mat + const");
+    C_mat = C.Get(roi_rect);    // restore C_mat before the compound operator
+
+    C_mat += add_const;
+    test_assert_equal_mat_mat(C_compare, C_mat, "+ operator, mat += const");
+
+    C_sub = A_mat + add_const;
+    test_assert_equal_mat_mat(C_compare, C_sub, "+ operator, sub_mat = mat + const");
+    test_assert_check_area_mat_mat(C_compare_area, C_sub, START_ROI, START_ROI,  "+ operator, area check, sub_mat = mat + const");
+
+    C = mat;                    // refresh C so that C_sub starts from the source values
+    C_sub += add_const;
+    test_assert_equal_mat_mat(C_compare, C_sub, "+ operator, sub_mat += const");
+    test_assert_check_area_mat_mat(C_compare_area, C_sub, START_ROI, START_ROI,  "+ operator, area check, sub_mat += const");
+}
+
+// operator* / operator*= with a scalar on the right-hand side
+static void test_mat_subset_operator_mat_mul_const(void)
+{
+    const float mul_const = 2;
+
+    // Source matrix holding 0, 1, 2, ... in storage order
+    dspm::Mat src(MAT_ROW, MAT_COL);
+    for (int idx = 0; idx < src.length; idx++) {
+        src.data[idx] = idx;
+    }
+
+    dspm::Mat area_ref = src;           // reference handed to every area check
+    dspm::Mat dst = src;
+
+    // *_roi come from getROI() ("sub_mat" in the messages),
+    // *_copy come from Get() ("mat" in the messages)
+    dspm::Mat src_roi = src.getROI(roi_rect);
+    dspm::Mat src_copy = src.Get(roi_rect);
+
+    dspm::Mat dst_roi = dst.getROI(roi_rect);
+    dspm::Mat dst_copy = dst.Get(roi_rect);
+
+    // Expected values: the ROI content scaled by the constant
+    dspm::Mat expected = src.Get(roi_rect);
+    for (int idx = 0; idx < expected.length; idx++) {
+        expected.data[idx] = expected.data[idx] * mul_const;
+    }
+
+    // Binary operator*, result stored in a matrix
+    dst_copy = src_copy * mul_const;
+    test_assert_equal_mat_mat(expected, dst_copy, "* operator, mat = mat * const");
+
+    dst_copy = src_roi * mul_const;
+    test_assert_equal_mat_mat(expected, dst_copy, "* operator, mat = sub_mat * const");
+
+    // Compound operator*= on a matrix, starting again from the unscaled ROI
+    dst_copy = dst.Get(roi_rect);
+    dst_copy *= mul_const;
+    test_assert_equal_mat_mat(expected, dst_copy, "* operator, mat *= const");
+
+    // Binary operator*, result stored in a sub-matrix
+    dst_roi = src_copy * mul_const;
+    test_assert_equal_mat_mat(expected, dst_roi, "* operator, sub_mat = mat * const");
+    test_assert_check_area_mat_mat(area_ref, dst_roi, START_ROI, START_ROI,  "* operator, area check, sub_mat = mat * const");
+
+    // Compound operator*= on a sub-matrix, after resetting its parent
+    dst = src;
+    dst_roi *= mul_const;
+    test_assert_equal_mat_mat(expected, dst_roi, "* operator, sub_mat *= const");
+    test_assert_check_area_mat_mat(area_ref, dst_roi, START_ROI, START_ROI,  "* operator, area check, sub_mat *= const");
+}
+
+// operator*= with a matrix or a sub-matrix on either side
+static void test_mat_subset_operator_mat_mul_mat_2(void)
+{
+    // Source matrix filled with the running index 0, 1, 2, ...
+    dspm::Mat mat(MAT_ROW, MAT_COL);
+    for (int i = 0; i < mat.length; i++) {
+        mat.data[i] = i;
+    }
+
+    dspm::Mat C_compare_area = mat;     // reference handed to every area check
+    dspm::Mat C = mat;
+
+    // Square operands, so the in-place product keeps its dimensions
+    const int m = 4, n = 4, k = 4;
+    dspm::Mat::Rect roi_rect_mul(1, 1, k, m);
+
+    // *_sub come from getROI() ("sub_mat" in the messages),
+    // *_mat come from Get() ("mat" in the messages)
+    dspm::Mat A_sub = mat.getROI(roi_rect_mul);
+    dspm::Mat A_mat = mat.Get(roi_rect_mul);
+
+    dspm::Mat C_sub = C.getROI(roi_rect_mul);
+    dspm::Mat C_mat = C.Get(roi_rect_mul);
+    dspm::Mat C_compare = dspm::Mat::ones(m, k);
+
+    // Expected result: plain triple-loop product of A_mat and the initial C_mat
+    for (int i = 0 ; i < m ; i++) {
+        for (int j = 0 ; j < k ; j++) {
+            C_compare.data[(i * k) + j] = 0;
+            for (int s = 0 ; s < n ; s++) {
+                C_compare.data[(i * k) + j] += A_mat.data[(i * n) + s] * C_mat.data[(s * k) + j];
+            }
+        }
+    }
+
+    C_mat *= A_mat;
+    test_assert_equal_mat_mat(C_compare, C_mat, "*= operator, mat *= mat");
+    C_mat = C.Get(roi_rect_mul);    // restore C_mat before the next product
+
+    C_mat *= A_sub;
+    test_assert_equal_mat_mat(C_compare, C_mat, "*= operator, mat *= sub_mat");
+
+    C_sub *= A_sub;
+    test_assert_equal_mat_mat(C_compare, C_sub, "*= operator, sub_mat *= sub_mat");
+    test_assert_check_area_mat_mat(C_compare_area, C_sub, START_ROI, START_ROI,  "*= operator, area check, sub_mat *= sub_mat");
+
+    // The messages below now name the operand combination actually tested
+    C = mat;                        // refresh C so that C_sub starts from the source values
+    C_sub *= A_mat;
+    test_assert_equal_mat_mat(C_compare, C_sub, "*= operator, sub_mat *= mat");
+    test_assert_check_area_mat_mat(C_compare_area, C_sub, START_ROI, START_ROI,  "*= operator, area check, sub_mat *= mat");
+}
+
+// operator* with a matrix or a sub-matrix on either side
+static void test_mat_subset_operator_mat_mul_mat_1(void)
+{
+    // Source matrix filled with the running index 0, 1, 2, ...
+    dspm::Mat mat(MAT_ROW, MAT_COL);
+    for (int i = 0; i < mat.length; i++) {
+        mat.data[i] = i;
+    }
+
+    dspm::Mat C = dspm::Mat::ones(6);
+    dspm::Mat C_compare_area = dspm::Mat::ones(6);  // reference handed to every area check
+
+    // matrix dimensions
+    const int m = 4, n = 3, k = 4;
+    dspm::Mat::Rect A_roi_rect(2, 1, n, m);
+    dspm::Mat::Rect B_roi_rect(1, 2, k, n);
+    dspm::Mat::Rect C_roi_rect(1, 1, k, m);
+
+    // *_sub come from getROI() ("sub_mat" in the messages),
+    // *_mat come from Get() ("mat" in the messages)
+    dspm::Mat A_sub = mat.getROI(A_roi_rect);
+    dspm::Mat A_mat = mat.Get(A_roi_rect);
+
+    dspm::Mat B_sub = mat.getROI(B_roi_rect);
+    dspm::Mat B_mat = mat.Get(B_roi_rect);
+
+    dspm::Mat C_sub = C.getROI(C_roi_rect);
+    dspm::Mat C_mat = C.Get(C_roi_rect);
+    dspm::Mat C_compare = dspm::Mat::ones(m, k);
+
+    // Expected result: plain triple-loop product of A_mat and B_mat
+    for (int i = 0 ; i < m ; i++) {
+        for (int j = 0 ; j < k ; j++) {
+            C_compare.data[(i * k) + j] = 0;
+            for (int s = 0 ; s < n ; s++) {
+                C_compare.data[(i * k) + j] += A_mat.data[(i * n) + s] * B_mat.data[(s * k) + j];
+            }
+        }
+    }
+
+    // Result stored in a matrix
+    C_mat = A_mat * B_mat;
+    test_assert_equal_mat_mat(C_compare, C_mat, "* operator, mat = mat * mat");
+
+    C_mat = A_sub * B_sub;
+    test_assert_equal_mat_mat(C_compare, C_mat, "* operator, mat = sub_mat * sub_mat");
+
+    C_mat = A_sub * B_mat;
+    test_assert_equal_mat_mat(C_compare, C_mat, "* operator, mat = sub_mat * mat");
+
+    // Result stored in a sub-matrix
+    C_sub = A_sub * B_sub;
+    test_assert_equal_mat_mat(C_compare, C_sub, "* operator, sub_mat = sub_mat * sub_mat");
+    test_assert_check_area_mat_mat(C_compare_area, C_sub, START_ROI, START_ROI,  "* operator, area check, sub_mat = sub_mat * sub_mat");
+
+    // The messages below now name the operand combination actually tested
+    C_sub = A_mat * B_sub;
+    test_assert_equal_mat_mat(C_compare, C_sub, "* operator, sub_mat = mat * sub_mat");
+    test_assert_check_area_mat_mat(C_compare_area, C_sub, START_ROI, START_ROI,  "* operator, area check, sub_mat = mat * sub_mat");
+}
+
+// Unity test case that runs every sub-matrix operator test one after another.
+// The trailing comment on each call names the operator that test exercises.
+TEST_CASE("Matrix subset operators", TAG)
+{
+    test_mat_subset_operator_eq();                  // mat = mat
+    test_mat_subset_operator_eq_eq();               // mat == mat
+    test_mat_subset_operator_xor();                 // mat ^ const
+    test_mat_subset_operator_mat_mul_mat_1();       // mat * mat
+    test_mat_subset_operator_mat_mul_mat_2();       // mat *= mat
+    test_mat_subset_operator_mat_mul_const();       // mat * const
+    test_mat_subset_operator_mat_add_mat();         // mat + mat
+    test_mat_subset_operator_mat_add_const();       // mat + const
+    test_mat_subset_operator_mat_sub_mat();         // mat - mat
+    test_mat_subset_operator_mat_sub_const();       // mat - const
+    test_mat_subset_operator_mat_div_mat();         // mat / mat
+    test_mat_subset_operator_mat_div_const();       // mat / const
+}
+
+// Runs solve() and roots() on sub-matrix operands and cross-checks the two results
+static void test_mat_subset_solve(void)
+{
+    int m = 3;
+    int n = 3;
+    float coeff_values[9] = {3, 2, 1, 2, 3, 1, 2, 1, 3};
+    float rhs_values[9] = {5, -1, 4};
+    dspm::Mat A(coeff_values, m, n);
+    dspm::Mat b(rhs_values, m, 1);
+
+    // Parent matrices of ones, plus untouched twins used by the area checks
+    dspm::Mat A_parent = dspm::Mat::ones(5);
+    dspm::Mat A_parent_ref = dspm::Mat::ones(5);
+    dspm::Mat b_parent = dspm::Mat::ones(5, 3);
+    dspm::Mat b_parent_ref = dspm::Mat::ones(5, 3);
+
+    // Place the system at offset (1, 1) inside the parents
+    A_parent.Copy(A, 1, 1);
+    b_parent.Copy(b, 1, 1);
+
+    // create sub-matrices
+    dspm::Mat A_roi = A_parent.getROI(1, 1, m, n);
+    dspm::Mat b_roi = b_parent.getROI(1, 1, m, 1);
+
+    // First solver: solve()
+    dspm::Mat x_solve = dspm::Mat::solve(A_roi, b_roi);
+    test_assert_check_area_mat_mat(A_parent_ref, A_roi, 1, 1, "check solve, area A");
+    test_assert_check_area_mat_mat(b_parent_ref, b_roi, 1, 1, "check solve, area b");
+
+    std::cout << "Solve result matrix: rows: " << x_solve.rows << ", columns: " << x_solve.cols << std::endl;
+    std::cout << (x_solve * 12).t();
+
+    // Second solver: roots()
+    dspm::Mat x_roots = dspm::Mat::roots(A_roi, b_roi);
+    test_assert_check_area_mat_mat(A_parent_ref, A_roi, 1, 1, "check solve, area A");
+    test_assert_check_area_mat_mat(b_parent_ref, b_roi, 1, 1, "check solve, area b");
+
+    std::cout << "Roots result matrix: rows: " << x_roots.rows << ", columns: " << x_roots.cols << std::endl;
+    std::cout << (x_roots * 12).t();
+
+    // Both solvers must agree element by element
+    dspm::Mat delta = x_solve - x_roots;
+    std::cout << "Difference between solve() abd roots(): " << delta.t();
+    for (int r = 0; r < delta.rows; r++) {
+        for (int c = 0; c < delta.cols; c++) {
+            // Written as a negated "greater than" so the outcome matches the
+            // original comparison for every value
+            TEST_ASSERT_MESSAGE (!(fabs(delta(r, c)) > 0.000001), "Calculation is incorrect! Error more then expected!");
+        }
+    }
+}
+
+// Checks inverse() and pinv() called on a sub-matrix against a known inverse
+static void test_mat_subset_inverse(void)
+{
+    // Test inverse()
+    // The work matrix is default-constructed and then assigned, as before
+    dspm::Mat work;
+    float input_vals[] = {2, 5, 7,
+                          6, 3, 4,
+                          5, -2, -3
+                         };
+    float expected_inv[] = {  1.0000,   -1.0000,    1.0000,
+                              -38.0000,   41.0000,  -34.0000,
+                              27.0000,  -29.0000,   24.0000
+                           };
+
+    work = dspm::Mat(input_vals, 3, 3);
+
+    // Parent of ones with the input placed at (1, 1); the twin stays untouched
+    dspm::Mat parent = dspm::Mat::ones(5);
+    dspm::Mat parent_ref = dspm::Mat::ones(5);
+
+    parent.Copy(work, 1, 1);
+    dspm::Mat roi = parent.getROI(1, 1, 3, 3);
+
+    work = roi.inverse();
+    test_assert_check_area_mat_mat(parent_ref, roi, 1, 1, "area check inverse");
+
+    std::cout << "inverse: " << std::endl;
+    std::cout << work << std::endl;
+    for (int idx = 0; idx < 3 * 3; idx++) {
+        const auto err = std::abs(work.data[idx] - expected_inv[idx]);
+        if (err > 1e-4) {
+            printf("Error at[%i] = %f, expected= %f, calculated = %f\n", idx, err, expected_inv[idx], work.data[idx]);
+            TEST_ASSERT_MESSAGE (false, "Error in inverse() operation!\n");
+        }
+    }
+
+    // Test pinv() on a freshly rebuilt parent / sub-matrix
+    work = dspm::Mat(input_vals, 3, 3);
+    parent = dspm::Mat::ones(5);
+    parent.Copy(work, 1, 1);
+    roi = parent.getROI(1, 1, 3, 3);
+
+    work = roi.pinv();
+    test_assert_check_area_mat_mat(parent_ref, roi, 1, 1, "area check pinv");
+
+    std::cout << "pinv: " << std::endl;
+    std::cout << work << std::endl;
+    for (int idx = 0; idx < 3 * 3; idx++) {
+        const auto err = std::abs(work.data[idx] - expected_inv[idx]);
+        if (err > 1e-2) {
+            printf("Error at[%i] = %f, expected= %f, calculated = %f \n", idx, err, expected_inv[idx], work.data[idx]);
+            TEST_ASSERT_MESSAGE (false, "Error in pinv() operation!\n");
+        }
+    }
+}
+
+// Normalizes a 2x2 sub-matrix of ones and expects every element to become 0.5
+static void test_mat_subset_normalize(void)
+{
+    dspm::Mat parent = dspm::Mat::ones(4);
+    dspm::Mat parent_ref = dspm::Mat::ones(4);      // untouched twin for the area check
+    dspm::Mat roi = parent.getROI(1, 1, 2, 2);
+
+    std::cout << "Befor normalize: " << std::endl << roi << std::endl;
+
+    roi.normalize();
+    test_assert_check_area_mat_mat(parent_ref, roi, 1, 1, "normalize area check");
+
+    std::cout << "normalize: " << std::endl << roi << std::endl;
+
+    for (int r = 0; r < roi.rows; r++) {
+        for (int c = 0; c < roi.cols; c++) {
+            const auto deviation = std::abs(roi(r, c) - 0.5);
+            if (deviation > dspm::Mat::abs_tol) {
+                ESP_LOGE(TAG, "Error bigger then expected: %f", deviation);
+                TEST_ASSERT_MESSAGE (false, "Error in normalize() operation! ");
+            }
+        }
+    }
+}
+
+// Compares swapRows(), t(), dotProduct() and clear() between a sub-matrix
+// (getROI) and a matrix obtained with Get() over the same rectangle
+static void test_mat_subset_swap_trans_dot_clear(void)
+{
+    // Parent and its area-check twin: every element holds its row number + 1
+    dspm::Mat parent(5, 5);
+    dspm::Mat parent_ref(5, 5);
+    for (int r = 0; r < parent.rows; r++) {
+        for (int c = 0; c < parent.cols; c++) {
+            parent(r, c) = r + 1;
+            parent_ref(r, c) = r + 1;
+        }
+    }
+
+    dspm::Mat::Rect window(1, 1, 3, 3);
+    dspm::Mat roi = parent.getROI(window);
+    dspm::Mat copy = parent.Get(window);
+
+    // check swap rows
+    roi.swapRows(0, 1);
+    copy.swapRows(0, 1);
+    test_assert_equal_mat_mat(roi, copy, "sub-matrix swapRows");
+    test_assert_check_area_mat_mat(parent_ref, roi, 1, 1, "area check sub-matrix swapRows");
+
+    // check transpose
+    dspm::Mat roi_t = roi.t();
+    dspm::Mat copy_t = copy.t();
+    test_assert_equal_mat_mat(copy_t, roi_t, "sub-matrix transpose");
+    test_assert_check_area_mat_mat(parent_ref, roi, 1, 1, "area check sub-matrix transpose");
+
+    // check dot product
+    float dot_of_copy = dspm::Mat::dotProduct(copy, copy);
+    float dot_of_roi = dspm::Mat::dotProduct(roi, roi);
+    TEST_ASSERT_EQUAL_FLOAT(dot_of_copy, dot_of_roi);
+
+    // check clear
+    roi.clear();
+    copy.clear();
+    test_assert_equal_mat_const(roi, 0, "sub-matrix clear");
+    test_assert_equal_mat_mat(copy, roi, "sub-matrix clear");
+    test_assert_check_area_mat_mat(parent_ref, roi, 1, 1, "area check sub-matrix clear");
+}
+
+
+// Unity test case that runs the sub-matrix method tests in a fixed order
+TEST_CASE("Matrix subset methods check", TAG)
+{
+    // Call table, listed in execution order
+    static void (*const method_tests[])(void) = {
+        test_mat_subset_solve,
+        test_mat_subset_inverse,
+        test_mat_subset_normalize,
+        test_mat_subset_swap_trans_dot_clear,
+    };
+
+    for (unsigned int idx = 0; idx < sizeof(method_tests) / sizeof(method_tests[0]); idx++) {
+        method_tests[idx]();
+    }
+}
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/test/test_mmult_3x3xx_f32_ae32.c
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/test/test_mmult_3x3xx_f32_ae32.c
@@ -0,0 +1,187 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "esp_dsp.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+
+#include "dspm_mult.h"
+#include "esp_attr.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dspm_mult_3x3xX_f32";
+
+// Test dspm_mult_3x3x1_f32 against the reference dspm_mult_f32_ansi
+TEST_CASE("dspm_mult_3x3x1_f32 functionality", "[dspm]")
+{
+    int m = 3;
+    int n = 3;
+    int k = 1;
+
+
+    float A[m][n];
+    float *A_ptr = (float *)A;
+
+    float B[n][k];
+    float *B_ptr = (float *)B;
+
+    float C[m][k];
+    float *C_ptr = (float *)C;
+    float C_compare[m][k];
+    float *Cc_ptr = (float *)C_compare;
+
+    // Every element holds its row index
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            A[i][j] = i;
+        }
+    }
+    for (int i = 0; i < n; i++) {
+        for (int j = 0; j < k; j++) {
+            B[i][j] = i;
+        }
+    }
+
+    // C: function under test, C_compare: reference result
+    dspm_mult_3x3x1_f32(A_ptr, B_ptr, C_ptr);
+    dspm_mult_f32_ansi(A_ptr, B_ptr, Cc_ptr, m, n, k);
+
+    for (int i = 0 ; i < m ; i++) {
+        for (int j = 0 ; j < k ; j++) {
+            ESP_LOGD(TAG, "[%i][%i] calc=%f, expected =%f", i, j, C[i][j], C_compare[i][j]);
+        }
+    }
+    // Compare and check results.
+    // TEST_ASSERT_EQUAL compares its operands as integers, so the float
+    // assertion is used instead; the reference value goes first (expected).
+    for (int i = 0; i < m * k; i++) {
+        TEST_ASSERT_EQUAL_FLOAT(Cc_ptr[i], C_ptr[i]);
+    }
+}
+
+// Test dspm_mult_3x3x3_f32 against the reference dspm_mult_f32_ansi
+TEST_CASE("dspm_mult_3x3x3_f32 functionality", "[dspm]")
+{
+    int m = 3;
+    int n = 3;
+    int k = 3;
+
+
+    float A[m][n];
+    float *A_ptr = (float *)A;
+
+    float B[n][k];
+    float *B_ptr = (float *)B;
+
+    float C[m][k];
+    float *C_ptr = (float *)C;
+    float C_compare[m][k];
+    float *Cc_ptr = (float *)C_compare;
+
+    // Every element of A and B holds its row index
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            A[i][j] = i;
+        }
+    }
+    for (int i = 0; i < n; i++) {
+        for (int j = 0; j < k; j++) {
+            B[i][j] = i;
+        }
+    }
+    // C is m x k, so it is cleared with its own bounds
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < k; j++) {
+            C[i][j] = 0;
+        }
+    }
+
+    // C: function under test, C_compare: reference result
+    dspm_mult_3x3x3_f32(A_ptr, B_ptr, C_ptr);
+    dspm_mult_f32_ansi(A_ptr, B_ptr, Cc_ptr, m, n, k);
+
+    for (int i = 0 ; i < m ; i++) {
+        for (int j = 0 ; j < k ; j++) {
+            ESP_LOGD(TAG, "[%i][%i] calc=%f, expected =%f", i, j, C[i][j], C_compare[i][j]);
+        }
+    }
+    // Compare and check results.
+    // TEST_ASSERT_EQUAL compares its operands as integers, so the float
+    // assertion is used instead; the reference value goes first (expected).
+    for (int i = 0 ; i < m * k ; i++) {
+        TEST_ASSERT_EQUAL_FLOAT(Cc_ptr[i], C_ptr[i]);
+    }
+}
+
+static portMUX_TYPE testnlock = portMUX_INITIALIZER_UNLOCKED;
+
+// Cycle-count benchmark: average cost of one dspm_mult_3x3x1_f32 call
+TEST_CASE("dspm_mult_3x3x1_f32 benchmark", "[dspm]")
+{
+    int m = 3;
+    int n = 3;
+    int k = 1;
+
+    // Operand storage; only the execution time is checked here
+    float A[m][n];
+    float *A_ptr = (float *)A;
+
+    float B[n][k];
+    float *B_ptr = (float *)B;
+
+    float C[m][k];
+    float *C_ptr = (float *)C;
+
+    // Measure inside the critical section
+    portENTER_CRITICAL(&testnlock);
+
+    unsigned int cycles_begin = dsp_get_cpu_cycle_count();
+    int repeat_count = 1024;
+    for (int rep = 0 ; rep < repeat_count ; rep++) {
+        dspm_mult_3x3x1_f32(A_ptr, B_ptr, C_ptr);
+    }
+    unsigned int cycles_end = dsp_get_cpu_cycle_count();
+    portEXIT_CRITICAL(&testnlock);
+
+    // Average cycles per call
+    float elapsed = cycles_end - cycles_begin;
+    float cycles = elapsed / repeat_count;
+    ESP_LOGI("dspm_mult_3x3x1_f32", "dspm_mult_3x3x1_f32 - %f per multiplication (ae32 - 134, ansi - 285)", cycles);
+
+    // Accepted window for the average
+    float lower_limit = 60;
+    float upper_limit = 200;
+    TEST_ASSERT_EXEC_IN_RANGE(lower_limit, upper_limit, cycles);
+}
+
+// Cycle-count benchmark: average cost of one dspm_mult_3x3x3_f32 call
+TEST_CASE("dspm_mult_3x3x3_f32 benchmark", "[dspm]")
+{
+    // Dimensions match the 3x3x3 kernel under test (they were declared as 4)
+    int m = 3;
+    int n = 3;
+    int k = 3;
+
+    // Operand storage; only the execution time is checked here
+    float A[m][n];
+    float *A_ptr = (float *)A;
+
+    float B[n][k];
+    float *B_ptr = (float *)B;
+
+    float C[m][k];
+    float *C_ptr = (float *)C;
+
+
+    // Measure inside the critical section
+    portENTER_CRITICAL(&testnlock);
+
+    unsigned int start_b = dsp_get_cpu_cycle_count();
+    int repeat_count = 1024;
+    for (int i = 0 ; i < repeat_count ; i++) {
+        dspm_mult_3x3x3_f32(A_ptr, B_ptr, C_ptr);
+    }
+    unsigned int end_b = dsp_get_cpu_cycle_count();
+    portEXIT_CRITICAL(&testnlock);
+
+    // Average cycles per call, checked against the accepted window
+    float total_b = end_b - start_b;
+    float cycles = total_b / (repeat_count);
+    ESP_LOGI("dspm_mult_3x3x3_f32", "dspm_mult_3x3x3_f32 - %f per multiplication", cycles);
+    float min_exec = 100;
+    float max_exec = 400;
+    TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);
+}
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/test/test_mmult_4x4xx_f32_ae32.c
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/test/test_mmult_4x4xx_f32_ae32.c
@@ -0,0 +1,186 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "esp_dsp.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+
+#include "dspm_mult.h"
+#include "esp_attr.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dspm_mult_4x4x1_f32_ae32";
+
+// Test dspm_mult_4x4x1_f32 against the reference dspm_mult_f32_ansi
+TEST_CASE("dspm_mult_4x4x1_f32_ae32 functionality", "[dspm]")
+{
+    int m = 4;
+    int n = 4;
+    int k = 1;
+
+    float A[m][n];
+    float *A_ptr = (float *)A;
+
+    float B[n][k];
+    float *B_ptr = (float *)B;
+
+    float C[m][k];
+    float *C_ptr = (float *)C;
+    float C_compare[m][k];
+    float *Cc_ptr = (float *)C_compare;
+
+    // Every element holds its row index
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            A[i][j] = i;
+        }
+    }
+    for (int i = 0; i < n; i++) {
+        for (int j = 0; j < k; j++) {
+            B[i][j] = i;
+        }
+    }
+
+    // C: function under test, C_compare: reference result
+    dspm_mult_4x4x1_f32(A_ptr, B_ptr, C_ptr);
+    dspm_mult_f32_ansi(A_ptr, B_ptr, Cc_ptr, m, n, k);
+
+    for (int i = 0 ; i < m ; i++) {
+        for (int j = 0 ; j < k ; j++) {
+            ESP_LOGD(TAG, "[%i][%i] calc=%f, expected =%f", i, j, C[i][j], C_compare[i][j]);
+        }
+    }
+    // Compare and check results.
+    // TEST_ASSERT_EQUAL compares its operands as integers, so the float
+    // assertion is used instead; the reference value goes first (expected).
+    for (int i = 0; i < m * k; i++) {
+        TEST_ASSERT_EQUAL_FLOAT(Cc_ptr[i], C_ptr[i]);
+    }
+}
+
+// Test dspm_mult_4x4x4_f32 against the reference dspm_mult_f32_ansi
+TEST_CASE("dspm_mult_4x4x4_f32_ae32 functionality", "[dspm]")
+{
+    int m = 4;
+    int n = 4;
+    int k = 4;
+
+
+    float A[m][n];
+    float *A_ptr = (float *)A;
+
+    float B[n][k];
+    float *B_ptr = (float *)B;
+
+    float C[m][k];
+    float *C_ptr = (float *)C;
+    float C_compare[m][k];
+    float *Cc_ptr = (float *)C_compare;
+
+    // Every element of A and B holds its row index
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            A[i][j] = i;
+        }
+    }
+    for (int i = 0; i < n; i++) {
+        for (int j = 0; j < k; j++) {
+            B[i][j] = i;
+        }
+    }
+    // C is m x k, so it is cleared with its own bounds
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < k; j++) {
+            C[i][j] = 0;
+        }
+    }
+
+    // C: function under test, C_compare: reference result
+    dspm_mult_4x4x4_f32(A_ptr, B_ptr, C_ptr);
+    dspm_mult_f32_ansi(A_ptr, B_ptr, Cc_ptr, m, n, k);
+
+    for (int i = 0 ; i < m ; i++) {
+        for (int j = 0 ; j < k ; j++) {
+            ESP_LOGD(TAG, "[%i][%i] calc=%f, expected =%f", i, j, C[i][j], C_compare[i][j]);
+        }
+    }
+    // Compare and check results.
+    // TEST_ASSERT_EQUAL compares its operands as integers, so the float
+    // assertion is used instead; the reference value goes first (expected).
+    for (int i = 0 ; i < m * k ; i++) {
+        TEST_ASSERT_EQUAL_FLOAT(Cc_ptr[i], C_ptr[i]);
+    }
+}
+
+static portMUX_TYPE testnlock = portMUX_INITIALIZER_UNLOCKED;
+
+// Cycle-count benchmark: average cost of one dspm_mult_4x4x1_f32 call
+TEST_CASE("dspm_mult_4x4x1_f32_ae32 benchmark", "[dspm]")
+{
+    int m = 4;
+    int n = 4;
+    int k = 1;
+
+    // Operand storage; only the execution time is checked here
+    float A[m][n];
+    float *A_ptr = (float *)A;
+
+    float B[n][k];
+    float *B_ptr = (float *)B;
+
+    float C[m][k];
+    float *C_ptr = (float *)C;
+
+    // Measure inside the critical section
+    portENTER_CRITICAL(&testnlock);
+
+    unsigned int cycles_begin = dsp_get_cpu_cycle_count();
+    int repeat_count = 1024;
+    for (int rep = 0 ; rep < repeat_count ; rep++) {
+        dspm_mult_4x4x1_f32(A_ptr, B_ptr, C_ptr);
+    }
+    unsigned int cycles_end = dsp_get_cpu_cycle_count();
+    portEXIT_CRITICAL(&testnlock);
+
+    // Average cycles per call
+    float elapsed = cycles_end - cycles_begin;
+    float cycles = elapsed / repeat_count;
+    ESP_LOGI("dspm_mult_4x4x1_f32_ae32", "dspm_mult_4x4x1_f32_ae32 - %f per multiplication", cycles);
+
+    // Accepted window for the average
+    float lower_limit = 60;
+    float upper_limit = 300;
+    TEST_ASSERT_EXEC_IN_RANGE(lower_limit, upper_limit, cycles);
+}
+
+// Cycle-count benchmark: average cost of one dspm_mult_4x4x4_f32 call
+TEST_CASE("dspm_mult_4x4x4_f32_ae32 benchmark", "[dspm]")
+{
+    int m = 4;
+    int n = 4;
+    int k = 4;
+
+    // Operand storage; only the execution time is checked here
+    float A[m][n];
+    float *A_ptr = (float *)A;
+
+    float B[n][k];
+    float *B_ptr = (float *)B;
+
+    float C[m][k];
+    float *C_ptr = (float *)C;
+
+    // Log the operand addresses
+    ESP_LOGI(TAG, "A: %8.8"PRIx32", B: %8.8"PRIx32", C=%8.8"PRIx32"", (uint32_t)A_ptr, (uint32_t)B_ptr, (uint32_t)C_ptr);
+
+    // Measure inside the critical section
+    portENTER_CRITICAL(&testnlock);
+
+    unsigned int cycles_begin = dsp_get_cpu_cycle_count();
+    int repeat_count = 1024;
+    for (int rep = 0 ; rep < repeat_count ; rep++) {
+        dspm_mult_4x4x4_f32(A_ptr, B_ptr, C_ptr);
+    }
+    unsigned int cycles_end = dsp_get_cpu_cycle_count();
+    portEXIT_CRITICAL(&testnlock);
+
+    // Average cycles per call
+    float elapsed = cycles_end - cycles_begin;
+    float cycles = elapsed / repeat_count;
+    ESP_LOGI("dspm_mult_4x4x4_f32_ae32", "dspm_mult_4x4x4_f32_ae32 - %f per multiplication", cycles);
+
+    // Accepted window for the average
+    float lower_limit = 50;
+    float upper_limit = 750;
+    TEST_ASSERT_EXEC_IN_RANGE(lower_limit, upper_limit, cycles);
+}
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/test/test_mmult_ex_f32_aexx.cpp
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/test/test_mmult_ex_f32_aexx.cpp
@@ -0,0 +1,285 @@
+/*
+ * SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <string.h>
+#include <malloc.h>
+#include "unity.h"
+#include "esp_dsp.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+
+#include "dspm_mult.h"
+#include "esp_attr.h"
+#include "dsp_tests.h"
+#include "test_mat_common.h"
+
+
+// ROI rectangles at file scope, resized and reused by the test cases below
+dspm::Mat::Rect A_roi_rect;
+dspm::Mat::Rect B_roi_rect;
+dspm::Mat::Rect C_roi_rect;
+// Runs dspm_mult_ex_f32 once for the parameter set in test_d and compares C against a plain triple-loop result
+static void dspm_mult_ex_f32_aexx_functionality_in_cycle(m_test_data_t *test_d)
+{
+    char message[120];
+    sprintf(message, "var = %d,  A_s_row = %d, A_s_col = %d, B_s_row = %d B_s_col = %d, C_s_row = %d,  C_s_col = %d, m = %d, n = %d, k = %d\n", test_d->var,
+            test_d->A_start_row, test_d->A_start_col, test_d->B_start_row, test_d->B_start_col,
+            test_d->C_start_row, test_d->C_start_col, test_d->m, test_d->n, test_d->k);
+
+    // 16-byte aligned buffers for the padded A B C matrices
+    float *A_data = (float *)memalign(16, ((test_d->m + (2 * test_d->A_start_row)) * (test_d->n + (2 * test_d->A_start_col))) * sizeof(float));
+    float *B_data = (float *)memalign(16, ((test_d->n + (2 * test_d->B_start_row)) * (test_d->k + (2 * test_d->B_start_col))) * sizeof(float));
+    float *C_data = (float *)memalign(16, ((test_d->m + (2 * test_d->C_start_row)) * (test_d->k + (2 * test_d->C_start_col))) * sizeof(float));
+
+    // create A B C matrices with m n k dimensions + padding
+    // padding is from both sides of the targeted sub-matrix
+    // 1 1 1 1
+    // 1 x x 1
+    // 1 x x 1
+    // 1 1 1 1
+    dspm::Mat A(A_data, test_d->m + (2 * test_d->A_start_row), test_d->n + (2 * test_d->A_start_col));
+    dspm::Mat B(B_data, test_d->n + (2 * test_d->B_start_row), test_d->k + (2 * test_d->B_start_col));
+    dspm::Mat C(C_data, test_d->m + (2 * test_d->C_start_row), test_d->k + (2 * test_d->C_start_col));
+
+    // resize the file-scope ROI rectangles to select the sub-matrices
+    A_roi_rect.resizeRect(test_d->A_start_col, test_d->A_start_row, test_d->n, test_d->m);
+    B_roi_rect.resizeRect(test_d->B_start_col, test_d->B_start_row, test_d->k, test_d->n);
+    C_roi_rect.resizeRect(test_d->C_start_col, test_d->C_start_row, test_d->k, test_d->m);
+
+    // 16-byte aligned buffers for the sub-matrices, sized by the ROI area
+    float *A_sub_data = (float *)memalign(16, A_roi_rect.areaRect() * sizeof(float));
+    float *B_sub_data = (float *)memalign(16, B_roi_rect.areaRect() * sizeof(float));
+    float *C_sub_data = (float *)memalign(16, C_roi_rect.areaRect() * sizeof(float));
+
+    // create sub-matrices A, B C matrices with aligned data
+    // matrices are used as sub-matrices with data copying for a matrix operation testing
+    dspm::Mat A_sub(A_sub_data, test_d->m, test_d->n);
+    dspm::Mat B_sub(B_sub_data, test_d->n, test_d->k);
+    dspm::Mat C_sub(C_sub_data, test_d->m, test_d->k);
+
+    // fill A B matrices with the numbers 1, 2, 3, ...
+    // fill C matrix with ones, but only for variations 0..3 (see the var check below)
+    for (int i = 0; i < A.length; i++) {
+        A.data[i] = i + 1;
+    }
+    for (int i = 0; i < B.length; i++) {
+        B.data[i] = i + 1;
+    }
+
+    if (test_d->var < 4) {
+        for (int i = 0; i < C.length; i++) {
+            C.data[i] = 1;
+        }
+    }
+
+    // Combinations of A B C matrices and sub-matrices are created for testing
+    // As an example: case 1
+    // Matrices B and C are sub-matrices - the data are defined as a pointer to an external buffer
+    // Matrix A is a matrix - the data are copied into the A matrix
+    switch (test_d->var) {
+    case 0: {
+        A_sub.CopyHead(A.getROI(A_roi_rect));    // A sub-matrix - NO DATA CPY
+        B_sub.CopyHead(B.getROI(B_roi_rect));    // B sub-matrix - NO DATA CPY
+        C_sub.CopyHead(C.getROI(C_roi_rect));    // C sub-matrix - NO DATA CPY
+    } break;
+    case 1: {
+        A_sub = A.Get(A_roi_rect);               // A matrix     - DATA CPY
+        B_sub.CopyHead(B.getROI(B_roi_rect));    // B sub-matrix - NO DATA CPY
+        C_sub.CopyHead(C.getROI(C_roi_rect));    // C sub-matrix - NO DATA CPY
+    } break;
+    case 2: {
+        A_sub.CopyHead(A.getROI(A_roi_rect));    // A sub-matrix - NO DATA CPY
+        B_sub = B.Get(B_roi_rect);               // B matrix     - DATA CPY
+        C_sub.CopyHead(C.getROI(C_roi_rect));    // C sub-matrix - NO DATA CPY
+    } break;
+    case 3: {
+        A_sub = A.Get(A_roi_rect);               // A matrix     - DATA CPY
+        B_sub = B.Get(B_roi_rect);               // B matrix     - DATA CPY
+        C_sub.CopyHead(C.getROI(C_roi_rect));    // C sub-matrix - NO DATA CPY
+    } break;
+    case 4: {
+        A_sub.CopyHead(A.getROI(A_roi_rect));    // A sub-matrix - NO DATA CPY
+        B_sub.CopyHead(B.getROI(B_roi_rect));    // B sub-matrix - NO DATA CPY
+        C_sub = C.Get(C_roi_rect);               // C matrix     - DATA CPY
+    } break;
+    case 5: {
+        A_sub.CopyHead(A.getROI(A_roi_rect));    // A sub-matrix - NO DATA CPY
+        B_sub = B.Get(B_roi_rect);               // B matrix     - DATA CPY
+        C_sub = C.Get(C_roi_rect);               // C matrix     - DATA CPY
+    } break;
+    case 6: {
+        A_sub = A.Get(A_roi_rect);               // A matrix     - DATA CPY
+        B_sub.CopyHead(B.getROI(B_roi_rect));    // B sub-matrix - NO DATA CPY
+        C_sub = C.Get(C_roi_rect);               // C matrix     - DATA CPY
+    } break;
+    default:
+        break;
+    }
+
+    // create A B check sub-matrices, actual matrix data are COPIED
+    dspm::Mat A_sub_check = A.Get(A_roi_rect);
+    dspm::Mat B_sub_check = B.Get(B_roi_rect);
+    dspm::Mat C_sub_check(test_d->m, test_d->k);
+
+    // Calculate the expected result C_sub_check = A_sub_check * B_sub_check with a plain triple loop
+    for (int i = 0 ; i < test_d->m ; i++) {
+        for (int j = 0 ; j < test_d->k ; j++) {
+            C_sub_check(i, j) = 0;
+            for (int s = 0 ; s < test_d->n ; s++) {
+                C_sub_check(i, j) += A_sub_check(i, s) * B_sub_check(s, j);
+            }
+        }
+    }
+
+    dspm_mult_ex_f32(A_sub.data, B_sub.data, C_sub.data, test_d->m, test_d->n, test_d->k, A_sub.padding, B_sub.padding, C_sub.padding);
+
+    // C is a sub-matrix
+    if (C_sub.sub_matrix) {
+        // Create a copy of the original C matrix (filled with ones 1)
+        // to check if an area around the sub-matrix is unaffected after a matrix operation
+        dspm::Mat C_area_check = dspm::Mat::ones(test_d->m + (2 * test_d->C_start_row), test_d->k + (2 * test_d->C_start_col));
+        test_assert_equal_mat_mat(C_sub_check, C_sub, message);
+        test_assert_check_area_mat_mat(C_area_check, C_sub, test_d->C_start_row, test_d->C_start_col, message);
+        // C is a matrix
+    } else {
+        test_assert_equal_mat_mat(C_sub_check, C_sub, message);
+    }
+
+    free(A_data);
+    free(B_data);
+    free(C_data);
+    free(A_sub_data);
+    free(B_sub_data);
+    free(C_sub_data);
+}
+
+TEST_CASE("dspm_mult_ex_f32_aexx functionality", "[dspm]")
+{
+    // Sweep limits common to all targets: seven matrix/sub-matrix variations,
+    // with the ROI start offsets beginning at zero
+    const int variation_count = 7;
+    const int first_start_col = 0;
+    const int first_start_row = 0;
+
+#if CONFIG_IDF_TARGET_ESP32S3
+    const int last_start_col = 4;
+    const int last_start_row = 4;
+    const int start_step = 4;
+    const int max_m = 12;
+    const int max_n = 12;
+    const int max_k = 12;
+    const int dim_step = 4;
+    const int first_dim = 4;
+#elif CONFIG_IDF_TARGET_ESP32P4
+    const int last_start_col = 1;
+    const int last_start_row = 1;
+    const int start_step = 1;
+    const int max_m = 4;
+    const int max_n = 4;
+    const int max_k = 4;
+    const int dim_step = 1;
+    const int first_dim = 2; // the esp.lp.setup instruction is not working with loop count 1, so the minimum is 2
+#else
+    const int last_start_col = 1;
+    const int last_start_row = 1;
+    const int start_step = 1;
+    const int max_m = 4;
+    const int max_n = 4;
+    const int max_k = 4;
+    const int dim_step = 1;
+    const int first_dim = 1;
+#endif
+
+    for (int var = 0; var < variation_count; var++) {
+        // start row of the C sub-matrix
+        for (int C_start_row = first_start_row; C_start_row <= last_start_row; C_start_row += start_step) {
+
+            // start column of the C sub-matrix
+            for (int C_start_col = first_start_col; C_start_col <= last_start_col; C_start_col += start_step) {
+
+                // start row of the A sub-matrix
+                for (int A_start_row = first_start_row; A_start_row <= last_start_row; A_start_row += start_step) {
+
+                    // start column of the A sub-matrix
+                    for (int A_start_col = first_start_col; A_start_col <= last_start_col; A_start_col += start_step) {
+
+                        // start row of the B sub-matrix
+                        for (int B_start_row = first_start_row; B_start_row <= last_start_row; B_start_row += start_step) {
+
+                            // start column of the B sub-matrix
+                            for (int B_start_col = first_start_col; B_start_col <= last_start_col; B_start_col += start_step) {
+
+                                // m dimension of the sub-matrices
+                                for (int m = first_dim; m <= max_m; m += dim_step) {
+
+                                    // n dimension of the sub-matrices
+                                    for (int n = first_dim; n <= max_n; n += dim_step) {
+
+                                        // k dimension of the sub-matrices
+                                        for (int k = first_dim; k <= max_k; k += dim_step) {
+
+                                            m_test_data_t test_data = {var, A_start_row, A_start_col, B_start_row, B_start_col, C_start_row, C_start_col, m, n, k};
+                                            dspm_mult_ex_f32_aexx_functionality_in_cycle(&test_data);
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        std::cout << var + 1 << "/" << variation_count << " of test done" << std::endl;
+    }
+}
+
+static portMUX_TYPE testnlock = portMUX_INITIALIZER_UNLOCKED;
+
+TEST_CASE("dspm_mult_ex_f32_aexx benchmark", "[dspm]")
+{
+    const int m = 4;
+    const int n = 4;
+    const int k = 4;
+    const int border = 4;   // rows/columns of padding on each side of every ROI
+    const int rows_a = m + (2 * border), cols_a = n + (2 * border);
+    const int rows_b = n + (2 * border), cols_b = k + (2 * border);
+    const int rows_c = m + (2 * border), cols_c = k + (2 * border);
+
+    A_roi_rect.resizeRect(border, border, n, m);
+    B_roi_rect.resizeRect(border, border, k, n);
+    C_roi_rect.resizeRect(border, border, k, m);
+
+    float *A_data = (float *)memalign(16, rows_a * cols_a * sizeof(float));
+    float *B_data = (float *)memalign(16, rows_b * cols_b * sizeof(float));
+    float *C_data = (float *)memalign(16, rows_c * cols_c * sizeof(float));
+    dspm::Mat A(A_data, rows_a, cols_a);
+    dspm::Mat B(B_data, rows_b, cols_b);
+    dspm::Mat C(C_data, rows_c, cols_c);
+
+    dspm::Mat A_roi = A.getROI(A_roi_rect);
+    dspm::Mat B_roi = B.getROI(B_roi_rect);
+    dspm::Mat C_roi = C.getROI(C_roi_rect);
+
+    const int iterations = 1024;
+    portENTER_CRITICAL(&testnlock);   // the first call below is made before timing starts
+    dspm_mult_ex_f32(A_roi.data, B_roi.data, C_roi.data, m, n, k, A_roi.padding, B_roi.padding, C_roi.padding);
+    unsigned int cycles_begin = dsp_get_cpu_cycle_count();
+    for (int run = 0 ; run < iterations ; run++) {
+        dspm_mult_ex_f32(A_roi.data, B_roi.data, C_roi.data, m, n, k, A_roi.padding, B_roi.padding, C_roi.padding);
+    }
+    unsigned int cycles_end = dsp_get_cpu_cycle_count();
+    portEXIT_CRITICAL(&testnlock);
+
+    float cycles = (float)(cycles_end - cycles_begin) / iterations;
+    printf("Benchmark dspm_mult_f32 - %f per multiplication 4x4 + overhead.\n", cycles);
+    float min_exec = 100;
+    float max_exec = 750;
+    TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);
+
+    free(A_data);
+    free(B_data);
+    free(C_data);
+}
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/test/test_mmult_ex_f32_ansi.cpp
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/test/test_mmult_ex_f32_ansi.cpp
@@ -0,0 +1,176 @@
+/*
+ * SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <string.h>
+#include "unity.h"
+#include "esp_dsp.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+
+#include "dspm_mult.h"
+#include "esp_attr.h"
+#include "dsp_tests.h"
+#include "test_mat_common.h"
+// Checks dspm_mult_ex_f32_ansi against a plain triple loop for each variation, ROI offset and m n k size
+TEST_CASE("dspm_mult_ex_f32_ansi functionality", "[dspm]")
+{
+    // create ROI rectangles
+    dspm::Mat::Rect A_roi_rect;
+    dspm::Mat::Rect B_roi_rect;
+    dspm::Mat::Rect C_roi_rect;
+
+    char message[60];
+    for (int var = 0; var < 7; var++) {
+        for (int start_row = 0; start_row < 2; start_row++) {
+            for (int start_col = 0; start_col < 2; start_col++) {
+                for (int m = 1; m < 6; m++) {
+                    for (int n = 1; n < 6; n++) {
+                        for (int k = 1; k < 6; k++) {
+                            sprintf(message, "var = %d  s_row = %d  s_col = %d, m = %d, n = %d, k = %d", var, start_row, start_col, m, n, k);
+                            // create A B C matrices with m n k dimensions + padding
+                            // padding is from both sides of the targeted sub-matrix
+                            // 1 1 1 1
+                            // 1 x x 1
+                            // 1 x x 1
+                            // 1 1 1 1
+                            dspm::Mat A(m + (2 * start_row), n + (2 * start_col));
+                            dspm::Mat B(n + (2 * start_row), k + (2 * start_col));
+                            dspm::Mat C = dspm::Mat::ones(m + (2 * start_row), k + (2 * start_col));
+
+                            // create A B C sub matrices with undefined dimensions
+                            dspm::Mat A_sub;
+                            dspm::Mat B_sub;
+                            dspm::Mat C_sub;
+
+                            // adjust ROI rectangles
+                            A_roi_rect.resizeRect(start_col, start_row, n, m);
+                            B_roi_rect.resizeRect(start_col, start_row, k, n);
+                            C_roi_rect.resizeRect(start_col, start_row, k, m);
+
+                            // fill A B matrices with the numbers 1, 2, 3, ...
+                            // (the C matrix was already created filled with ones above)
+                            for (int i = 0; i < A.length; i++) {
+                                A.data[i] = i + 1;
+                            }
+                            for (int i = 0; i < B.length; i++) {
+                                B.data[i] = i + 1;
+                            }
+
+                            // Combinations of A B C matrices and sub-matrices are created for testing
+                            // As an example: case 1
+                            // Matrices B and C are sub-matrices - the data are defined as a pointer to an external buffer
+                            // Matrix A is a matrix - the data are copied into the A matrix
+                            switch (var) {
+                            case 0: {
+                                A_sub.CopyHead(A.getROI(A_roi_rect));    // A sub-matrix - NO DATA CPY
+                                B_sub.CopyHead(B.getROI(B_roi_rect));    // B sub-matrix - NO DATA CPY
+                                C_sub.CopyHead(C.getROI(C_roi_rect));    // C sub-matrix - NO DATA CPY
+                            } break;
+                            case 1: {
+                                A_sub = A.Get(A_roi_rect);               // A matrix     - DATA CPY
+                                B_sub.CopyHead(B.getROI(B_roi_rect));    // B sub-matrix - NO DATA CPY
+                                C_sub.CopyHead(C.getROI(C_roi_rect));    // C sub-matrix - NO DATA CPY
+                            } break;
+                            case 2: {
+                                A_sub.CopyHead(A.getROI(A_roi_rect));    // A sub-matrix - NO DATA CPY
+                                B_sub = B.Get(B_roi_rect);               // B matrix     - DATA CPY
+                                C_sub.CopyHead(C.getROI(C_roi_rect));    // C sub-matrix - NO DATA CPY
+                            } break;
+                            case 3: {
+                                A_sub = A.Get(A_roi_rect);               // A matrix     - DATA CPY
+                                B_sub = B.Get(B_roi_rect);               // B matrix     - DATA CPY
+                                C_sub.CopyHead(C.getROI(C_roi_rect));    // C sub-matrix - NO DATA CPY
+                            } break;
+                            case 4: {
+                                A_sub.CopyHead(A.getROI(A_roi_rect));    // A sub-matrix - NO DATA CPY
+                                B_sub.CopyHead(B.getROI(B_roi_rect));    // B sub-matrix - NO DATA CPY
+                                C_sub = C.Get(C_roi_rect);               // C matrix     - DATA CPY
+                            } break;
+                            case 5: {
+                                A_sub.CopyHead(A.getROI(A_roi_rect));    // A sub-matrix - NO DATA CPY
+                                B_sub = B.Get(B_roi_rect);               // B matrix     - DATA CPY
+                                C_sub = C.Get(C_roi_rect);               // C matrix     - DATA CPY
+                            } break;
+                            case 6: {
+                                A_sub = A.Get(A_roi_rect);               // A matrix     - DATA CPY
+                                B_sub.CopyHead(B.getROI(B_roi_rect));    // B sub-matrix - NO DATA CPY
+                                C_sub = C.Get(C_roi_rect);               // C matrix     - DATA CPY
+                            } break;
+                            default:
+                                break;
+                            }
+
+                            // create A B check sub-matrices, actual matrix data are COPIED
+                            dspm::Mat A_sub_check = A.Get(A_roi_rect);
+                            dspm::Mat B_sub_check = B.Get(B_roi_rect);
+                            dspm::Mat C_sub_check(m, k);
+
+                            // Calculate the expected result C_sub_check = A_sub_check * B_sub_check with a plain triple loop
+                            for (int i = 0 ; i < m ; i++) {
+                                for (int j = 0 ; j < k ; j++) {
+                                    C_sub_check(i, j) = 0;
+                                    for (int s = 0 ; s < n ; s++) {
+                                        C_sub_check(i, j) += A_sub_check(i, s) * B_sub_check(s, j);
+                                    }
+                                }
+                            }
+
+                            dspm_mult_ex_f32_ansi(A_sub.data, B_sub.data, C_sub.data, m, n, k, A_sub.padding, B_sub.padding, C_sub.padding);
+
+                            // C is a sub-matrix
+                            if (C_sub.sub_matrix) {
+                                // Create a copy of the original C matrix (filled with ones 1)
+                                // to check if an area around the sub-matrix is unaffected after a matrix operation
+                                dspm::Mat C_area_check = dspm::Mat::ones(m + (2 * start_row), k + (2 * start_col));
+                                test_assert_equal_mat_mat(C_sub_check, C_sub, message);
+                                test_assert_check_area_mat_mat(C_area_check, C_sub, start_row, start_col, message);
+                                // C is a matrix
+                            } else {
+                                test_assert_equal_mat_mat(C_sub_check, C_sub, message);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+static portMUX_TYPE testnlock = portMUX_INITIALIZER_UNLOCKED;
+
+TEST_CASE("dspm_mult_ex_f32_ansi benchmark", "[dspm]")
+{
+    // 4x4 ROIs requested with start arguments (1, 1) from parents that are one row and column larger
+    const int m = 4;
+    const int n = 4;
+    const int k = 4;
+    const int offset = 1;
+
+    dspm::Mat A(m + offset, n + offset);
+    dspm::Mat B(n + offset, k + offset);
+    dspm::Mat C(m + offset, k + offset);
+
+    dspm::Mat A_roi = A.getROI(offset, offset, m, n);
+    dspm::Mat B_roi = B.getROI(offset, offset, n, k);
+    dspm::Mat C_roi = C.getROI(offset, offset, m, k);
+
+    const int iterations = 1024;
+    portENTER_CRITICAL(&testnlock);   // the first call below is made before timing starts
+    dspm_mult_ex_f32_ansi(A_roi.data, B_roi.data, C_roi.data, m, n, k, A_roi.padding, B_roi.padding, C_roi.padding);
+    unsigned int cycles_begin = dsp_get_cpu_cycle_count();
+    for (int run = 0 ; run < iterations ; run++) {
+        dspm_mult_ex_f32_ansi(A_roi.data, B_roi.data, C_roi.data, m, n, k, A_roi.padding, B_roi.padding, C_roi.padding);
+    }
+    unsigned int cycles_end = dsp_get_cpu_cycle_count();
+    portEXIT_CRITICAL(&testnlock);
+
+    // Average cycle count of one call
+    float cycles = (float)(cycles_end - cycles_begin) / iterations;
+    printf("Benchmark dspm_mult_f32 - %f per multiplication 4x4 + overhead.\n", cycles);
+    float min_exec = 100;
+    float max_exec = 1400;
+    TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);
+}
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/test/test_mmult_f32_ae32.c
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/test/test_mmult_f32_ae32.c
@@ -0,0 +1,108 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "esp_dsp.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+
+#include "dspm_mult.h"
+#include "esp_attr.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dspm_mult_f32_aexx";
+
+// Test dspm_mult_f32 against a plain C triple-loop reference
+TEST_CASE("dspm_mult_f32 functionality", "[dspm]")
+{
+    int m = 4;
+    int n = 3;
+    int k = 4;
+
+    float A[m][n];
+    float *A_ptr = (float *)A;
+
+    float B[n][k];
+    float *B_ptr = (float *)B;
+
+    float C[m][k];
+    float *C_ptr = (float *)C;
+    float C_compare[m][k];
+    float *Cc_ptr = (float *)C_compare;
+
+    for (int i = 0 ; i < m * n; i++) {
+        A_ptr[i] = i;
+    }
+    for (int i = 0 ; i < n * k; i++) {   // B holds n * k elements, not m * n
+        B_ptr[i] = i;
+    }
+    for (int i = 0 ; i < m ; i++) {
+        for (int j = 0 ; j < k ; j++) {
+            C_compare[i][j] = 0;
+            for (int s = 0 ; s < n ; s++) {
+                C_compare[i][j] += A[i][s] * B[s][j];
+            }
+        }
+    }
+    dspm_mult_f32(A_ptr, B_ptr, C_ptr, m, n, k);
+    for (int i = 0 ; i < m ; i++) {
+        for (int j = 0 ; j < k ; j++) {
+            ESP_LOGI(TAG, "[%i][%i] calc=%f, expected =%f", i, j, C[i][j], C_compare[i][j]);
+        }
+    }
+    // Compare and check results (Unity argument order is expected, actual)
+    for (int i = 0 ; i < m * k ; i++) {
+        if (Cc_ptr[i] != C_ptr[i]) {
+            TEST_ASSERT_EQUAL(Cc_ptr[i], C_ptr[i]);
+        }
+    }
+}
+
+static portMUX_TYPE testnlock = portMUX_INITIALIZER_UNLOCKED;
+
+TEST_CASE("dspm_mult_f32 benchmark", "[dspm]")
+{
+    // Dimensions of the operands: A is m x n, B is n x k, C is m x k
+    int m = 4;
+    int n = 4;
+    int k = 4;
+
+    // Operand storage on the stack (contents are left uninitialised)
+    float A[m][n];
+    float B[n][k];
+    float C[m][k];
+    float *lhs = (float *)A;
+    float *rhs = (float *)B;
+    float *out = (float *)C;
+
+    ESP_LOGI(TAG, "A: %8.8"PRIx32", B: %8.8"PRIx32", C=%8.8"PRIx32"", (uint32_t)lhs, (uint32_t)rhs, (uint32_t)out);
+
+    const int iterations = 1024;
+
+    portENTER_CRITICAL(&testnlock);
+    unsigned int cycles_begin = dsp_get_cpu_cycle_count();
+    for (int run = 0 ; run < iterations ; run++) {
+        dspm_mult_f32(lhs, rhs, out, m, n, k);
+    }
+    unsigned int cycles_end = dsp_get_cpu_cycle_count();
+    portEXIT_CRITICAL(&testnlock);
+
+    // Average cycle count of one call
+    float cycles = (float)(cycles_end - cycles_begin) / iterations;
+    printf("Benchmark dspm_mult_f32 - %f per multiplication 4x4 + overhead.\n", cycles);
+    float min_exec = 100;
+    float max_exec = 800;
+    TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);
+}
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/test/test_mmult_f32_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/test/test_mmult_f32_ansi.c
@@ -0,0 +1,118 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "esp_dsp.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+
+#include "dspm_mult.h"
+#include "esp_attr.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dspm_mult_f32_ansi";
+
+// Test dspm_mult_f32_ansi against a plain C triple-loop reference
+TEST_CASE("dspm_mult_f32_ansi functionality", "[dspm]")
+{
+    for (int m = 1 ; m < 8 ; m++) {
+        for (int n = 1; n < 8 ; n++) {
+            for (int k = 1; k < 8 ; k++) {
+                float A[m][n];
+                float *A_ptr = (float *)A;
+
+                float B[n][k];
+                float *B_ptr = (float *)B;
+
+                float C[m][k];
+                float *C_ptr = (float *)C;
+                float C_compare[m][k];
+                float *Cc_ptr = (float *)C_compare;
+
+                // Each operand is filled over its own dimensions with its
+                // row-major element index. A single flat loop bounded by
+                // m * n must not be used for B: B holds only n * k elements,
+                // so that would write past its end whenever m > k.
+                for (int i = 0 ; i < m ; i++) {
+                    for (int j = 0 ; j < n ; j++) {
+                        A[i][j] = i * n + j;
+                    }
+                }
+                for (int i = 0 ; i < n ; i++) {
+                    for (int j = 0 ; j < k ; j++) {
+                        B[i][j] = i * k + j;
+                    }
+                }
+                for (int i = 0 ; i < m ; i++) {
+                    for (int j = 0 ; j < k ; j++) {
+                        C_compare[i][j] = 0;
+                        for (int s = 0 ; s < n ; s++) {
+                            C_compare[i][j] += A[i][s] * B[s][j];
+                        }
+                    }
+                }
+                dspm_mult_f32_ansi(A_ptr, B_ptr, C_ptr, m, n, k);
+
+                for (int i = 0 ; i < m ; i++) {
+                    for (int j = 0 ; j < k ; j++) {
+                        ESP_LOGD(TAG, "[%i][%i] calc=%f, expected =%f", i, j, C[i][j], C_compare[i][j]);
+                    }
+                }
+                // Compare and check results
+                for (int i = 0 ; i < m * k ; i++) {
+                    if (Cc_ptr[i] != C_ptr[i]) {
+                        TEST_ASSERT_EQUAL(Cc_ptr[i], C_ptr[i]);
+                    }
+                }
+            }
+        }
+    }
+}
+
+static portMUX_TYPE testnlock = portMUX_INITIALIZER_UNLOCKED;
+
+TEST_CASE("dspm_mult_f32_ansi benchmark", "[dspm]")
+{
+    // Dimensions of the operands: A is m x n, B is n x k, C is m x k
+    int m = 4;
+    int n = 4;
+    int k = 4;
+
+    // Operand storage on the stack (contents are left uninitialised)
+    float A[m][n];
+    float B[n][k];
+    float C[m][k];
+    float *lhs = (float *)A;
+    float *rhs = (float *)B;
+    float *out = (float *)C;
+
+    const int iterations = 1024;
+
+    // Cycle counter is sampled around the whole batch, inside a critical section
+    portENTER_CRITICAL(&testnlock);
+    unsigned int cycles_begin = dsp_get_cpu_cycle_count();
+    for (int run = 0 ; run < iterations ; run++) {
+        dspm_mult_f32_ansi(lhs, rhs, out, m, n, k);
+    }
+    unsigned int cycles_end = dsp_get_cpu_cycle_count();
+    portEXIT_CRITICAL(&testnlock);
+
+    // Average cycle count of one call
+    float cycles = (float)(cycles_end - cycles_begin) / iterations;
+    printf("Benchmark dspm_mult_f32_ansi - %f per multiplication 4x4 + overhead.\n", cycles);
+    float min_exec = 100;
+    float max_exec = 2000;
+    TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);
+}
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/test/test_mmult_s16_ae32.c
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/test/test_mmult_s16_ae32.c
@@ -0,0 +1,106 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "esp_dsp.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+
+#include "dspm_mult.h"
+#include "esp_attr.h"
+#include "esp_log.h"
+
+// Compare dspm_mult_s16 with dspm_mult_s16_ansi over a sweep of sizes and shifts
+TEST_CASE("dspm_mult_s16_aexx functionality", "[dspm]")
+{
+    for (int m = 1 ; m < 8 ; m++) {
+        for (int n = 1 ; n < 16 ; n++) {
+            for (int k = 1 ; k < 16 ; k++) {
+                // Operands, the result under test and the reference result
+                int16_t A[m][n];
+                int16_t B[n][k];
+                int16_t C[m][k];
+                int16_t C_compare[m][k];
+
+                int16_t *lhs = (int16_t *)A;
+                int16_t *rhs = (int16_t *)B;
+                int16_t *actual = (int16_t *)C;
+                int16_t *expected = (int16_t *)C_compare;
+
+                // Sweep the shift argument from -4 to 3
+                for (int shift = -4 ; shift < 4 ; shift++) {
+                    // Every element of both operands is set to 0x123
+                    for (int idx = 0 ; idx < m * n ; idx++) {
+                        lhs[idx] = 0x123;
+                    }
+                    for (int idx = 0 ; idx < n * k ; idx++) {
+                        rhs[idx] = 0x123;
+                    }
+
+                    // Same inputs through the reference and the tested implementation
+                    dspm_mult_s16_ansi(lhs, rhs, expected, m, n, k, shift);
+                    dspm_mult_s16(lhs, rhs, actual,  m, n, k, shift);
+
+                    // Log and assert on any element that differs
+                    for (int idx = 0 ; idx < m * k ; idx++) {
+                        if (expected[idx] == actual[idx]) {
+                            continue;
+                        }
+                        ESP_LOGE("dspm_mult_s16_aexx", "Process path m=%i, n=%i, k=%i,  shift=%i", m, n, k, shift);
+                        ESP_LOGE("dspm_mult_s16_aexx", "data[%i] %4.4x != %4.4x expected \n", idx, actual[idx], expected[idx]);
+                        TEST_ASSERT_EQUAL(expected[idx], actual[idx]);
+                    }
+                }
+            }
+        }
+    }
+}
+
+static portMUX_TYPE testnlock = portMUX_INITIALIZER_UNLOCKED;
+
+TEST_CASE("dspm_mult_s16_aexx benchmark", "[dspm]")
+{
+    unsigned int start_b = dsp_get_cpu_cycle_count();
+    unsigned int end_b = dsp_get_cpu_cycle_count();
+    for (int m = 2 ; m <= 8 ; m++) {
+        for (int n = 2 ; n <= 16 ; n++) {
+            for (int k = 1 ; k <= 16 ; k++) {
+                // Operand shapes for the call below: A is m x n, B is n x k, C is m x k
+                int16_t A[m][n];
+                int16_t *A_ptr = (int16_t *)A;
+
+                int16_t B[n][k];
+                int16_t *B_ptr = (int16_t *)B;
+
+                int16_t C[m][k];
+                int16_t *C_ptr = (int16_t *)C;
+
+                memset(A, 0, sizeof(A));
+                memset(B, 0, sizeof(B));
+                memset(C, 0, sizeof(C));
+                portENTER_CRITICAL(&testnlock);
+
+                start_b = dsp_get_cpu_cycle_count();
+                dspm_mult_s16(A_ptr, B_ptr, C_ptr, m, n, k, 0);
+                end_b = dsp_get_cpu_cycle_count();
+                portEXIT_CRITICAL(&testnlock);
+
+                float total_b = end_b - start_b;
+                float cycles = total_b;
+                ESP_LOGD("dspm_mult_s16_aexx", "dspm_mult_s16_aexx[%i][%i][%i] - %f", m, n, k, cycles);
+            }
+        }
+    }
+}
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/test/test_mmult_s16_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/test/test_mmult_s16_ansi.c
@@ -0,0 +1,110 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "esp_dsp.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+
+#include "dspm_mult.h"
+#include "esp_attr.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dspm_mult_s16_ansi";  // log tag used by the ESP_LOGD() output of the functionality test
+
+// Test dspm_mult_s16_ansi function
+TEST_CASE("dspm_mult_s16_ansi functionality", "[dspm]")
+{
+    // Multiply two constant-filled matrices with dspm_mult_s16_ansi() and check every
+    // output element against a reference value computed inline in this test.
+    int m = 4;
+    int n = 3;
+    int k = 4;
+    int16_t A[m][n];
+    int16_t *A_ptr = (int16_t *)A;
+
+    int16_t B[n][k];
+    int16_t *B_ptr = (int16_t *)B;
+
+    int16_t C[m][k];
+    int16_t *C_ptr = (int16_t *)C;
+    int16_t C_compare[m][k];
+    int16_t *Cc_ptr = (int16_t *)C_compare;
+
+    int shift = 0;
+    // A holds m * n elements and B holds n * k, so each is filled over its own length.
+    for (int i = 0 ; i < m * n; i++) {
+        A_ptr[i] = 0x1000;
+    }
+    for (int i = 0 ; i < n * k; i++) {
+        B_ptr[i] = 0x200;
+    }
+    // Reference: start at 0x7fff >> shift, add row i of A times column j of B, then shift right by 15 - shift.
+    for (int i = 0 ; i < m ; i++) {
+        for (int j = 0 ; j < k ; j++) {
+            long long store_reg = (0x7fff >> shift);
+            for (int s = 0 ; s < n ; s++) {
+                store_reg += ((int32_t)A[i][s] * (int32_t)B[s][j]);
+            }
+            C_compare[i][j] = store_reg >> (15 - shift);
+        }
+    }
+    dspm_mult_s16_ansi(A_ptr, B_ptr, C_ptr, m, n, k, shift);
+    for (int i = 0 ; i < m ; i++) {
+        for (int j = 0 ; j < k ; j++) {
+            ESP_LOGD(TAG, "[%i][%i] calc=%i, expected =%i", i, j, C[i][j], C_compare[i][j]);
+        }
+    }
+    // Compare and check results
+    for (int i = 0 ; i < m * k ; i++) {
+        TEST_ASSERT_EQUAL(Cc_ptr[i], C_ptr[i]);
+    }
+}
+
+static portMUX_TYPE testnlock = portMUX_INITIALIZER_UNLOCKED;  // lock passed to portENTER_CRITICAL()/portEXIT_CRITICAL() around the timed loop in the benchmark
+
+TEST_CASE("dspm_mult_s16_ansi benchmark", "[dspm]")
+{
+    // Average the cycle count of repeat_count 4x4x4 dspm_mult_s16_ansi() calls and
+    // require the per-call figure to lie between min_exec and max_exec.
+    int m = 4;
+    int n = 4;
+    int k = 4;
+    int16_t A[m][n];
+    int16_t *A_ptr = (int16_t *)A;
+    int16_t B[n][k];
+    int16_t *B_ptr = (int16_t *)B;
+    int16_t C[m][k];
+    int16_t *C_ptr = (int16_t *)C;
+    // Give the operands defined contents so the timed multiplication never reads indeterminate stack data.
+    memset(A, 0, sizeof(A));
+    memset(B, 0, sizeof(B));
+    memset(C, 0, sizeof(C));
+    int repeat_count = 1024;
+    portENTER_CRITICAL(&testnlock);
+    unsigned int start_b = dsp_get_cpu_cycle_count();
+    for (int i = 0 ; i < repeat_count ; i++) {
+        dspm_mult_s16_ansi(A_ptr, B_ptr, C_ptr, m, n, k, 0);
+    }
+    unsigned int end_b = dsp_get_cpu_cycle_count();
+    portEXIT_CRITICAL(&testnlock);
+
+    float total_b = end_b - start_b;
+    float cycles = total_b / (repeat_count);
+    ESP_LOGI("dspm_mult_s16_ansi", "Benchmark dspm_mult_s16_ansi - %f per multiplication %ix%ix%i.\n", cycles, m, n, k);
+    float min_exec = 1000;
+    float max_exec = 3000;
+    TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);
+}
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/test_sim/main.c
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/test_sim/main.c
@@ -0,0 +1,12 @@
+
+void test_mmult();
+
+int main(void)
+{
+    printf("main starts!\n");
+//    xt_iss_profile_enable();
+    test_mmult();
+//    xt_iss_profile_disable();
+
+    printf("Test done\n");
+}
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/test_sim/test_mmult.c
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/test_sim/test_mmult.c
@@ -0,0 +1,65 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "dsp_common.h"
+
+#include "dspm_mult.h"
+extern void xt_iss_profile_disable();
+extern void xt_iss_profile_enable();
+
+#define M 4 // rows of A, C and C_compare
+#define N 8 // columns of A, rows of B
+#define K 16 // columns of B, C and C_compare
+// Dimensions as int objects; these are the values handed to the multiplication routines.
+const    int m = M;
+const    int n = N;
+const    int k = K;
+// File-scope operands (A, B) and result buffers (C, C_compare) used by test_mmult().
+float A[M][N];
+float B[N][K];
+float C[M][K];
+float C_compare[M][K];
+
+void test_mmult()
+{
+    // Run dspm_mult_f32_ae32() and dspm_mult_f32_aes3() on the same inputs, print both results, report whether they match.
+    float *A_ptr = (float *)A;
+    float *B_ptr = (float *)B;
+    float *C_ptr = (float *)C;
+    float *Cc_ptr = (float *)C_compare;
+    // A holds m * n elements and B holds n * k, so each is filled over its own length.
+    for (int i = 0 ; i < m * n; i++) {
+        A_ptr[i] = i;
+    }
+    for (int i = 0 ; i < n * k; i++) {
+        B_ptr[i] = i;
+    }
+    for (int i = 0 ; i < m ; i++) {
+        for (int j = 0 ; j < k ; j++) {
+            C_compare[i][j] = 0;
+            for (int s = 0 ; s < n ; s++) {
+                C_compare[i][j] += A[i][s] * B[s][j];
+            }
+            C[i][j] = -1;
+        }
+    }
+    xt_iss_profile_enable();
+    dspm_mult_f32_ae32(A_ptr, B_ptr, Cc_ptr, m, n, k); // NOTE(review): overwrites the C_compare reference computed above - confirm intended
+    dspm_mult_f32_aes3(A_ptr, B_ptr, C_ptr, m, n, k);
+    xt_iss_profile_disable();
+    for (int i = 0 ; i < m ; i++) {
+        for (int j = 0 ; j < k ; j++) {
+            printf("[%i][%i] calc=%f, expected =%f\n", i, j, C[i][j], C_compare[i][j]);
+        }
+    }
+    // Compare and check results
+    for (int i = 0 ; i < m * k ; i++) {
+        if (Cc_ptr[i] != C_ptr[i]) {
+            printf("Error - C_ptr= %f, Cc_ptr= %f \n", C_ptr[i], Cc_ptr[i]);
+            return;
+        }
+    }
+    printf("Test Pass!\n");
+}