add some code

2025-09-05 13:25:11 +08:00
parent 9ff0a99e7a
commit 3cf1229a85
8911 changed files with 2535396 additions and 0 deletions
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/fixed/dspm_mult_s16_ae32.S
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/fixed/dspm_mult_s16_ae32.S
@@ -0,0 +1,174 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspm_mult_platform.h"
+#if (dspm_mult_s16_ae32_enabled == 1)
+
+#include "dsps_dotprod_s16_m_ae32.S"
+#include "dspm_mult_s16_m_ae32_vector.S"
+//esp_err_t dspm_mult_s16_ae32(const int16_t* A, const int16_t* B, int16_t* C, int m, int n, int k, int shift);
+
+// This is the matrix multiplication function for the ESP32 processor.
+	.text
+	.align  4
+	.global dspm_mult_s16_ae32
+	.global .dspm_mult_s16_ae32_body
+	.type   dspm_mult_s16_ae32,@function
+
+dspm_mult_s16_ae32: 
+// A - a2 - input matrix A(m,n)
+// B - a3 - input matrix B(n,k)
+// C - a4 - output matrix C(m,k), written one int16 at a time
+// m - a5 - any > 0
+// n - a6 - 1,2,3, any
+// k - a7 - 1, any
+// shift - stack (a8), loaded from a1 + 80 after entry
+
+// a14 - n*2 - byte step from one row of A to the next
+// Returns 0 (ESP_OK) in a2 on every path visible in this function.
+	entry	a1, 80
+// ======     common body; also the target of the jump in dspm_mult_s16_aes3   ============
+.dspm_mult_s16_ae32_body:
+	l32i.n	a8, a1, 80 // Load shift to the a8 register
+	
+
+	// Prepare and load round value
+	ssr a8 // SAR = shift
+	movi a15, 0x7fff
+	srl a15, a15 // a15 = 0x7fff >> shift, the rounding term preloaded into acclo
+
+	neg  a8, a8 
+	addi a8, a8, 15 // a8 = 15 - shift
+	ssr a8 // SAR = 15 - shift, consumed by every src below
+	movi a8, 0  // Clear a8 
+
+	slli    a14, a6, 1 // a14 = n*2, byte size of one row of A
+	movi.n	a10, 2 // Increment = 2
+	movi.n	a9, 0  // initial counter loop1
+
+	movi     a12, 1
+	beq      a7, a12, vector_mult // k == 1: use the vector path at the end of the function
+	// We have normal path with k > 1
+	// a2, a3, a4 - A,B,C
+	// a5 - m, counted down to 0 as the row counter
+	// a6 - n
+	// a7 - k
+	// a8 - temp
+	// a9 - temp
+	// a10- k counter (byte offset of the current column of B)
+	// a11- not referenced in the code below
+	// a12- pointer into the current row of A
+	// a13- pointer into the current column of B
+	// a14 - pointer increment for n
+	// a15 - round value
+
+	bbsi  a6, 0, even_N_samples // branch taken when bit 0 of n is set, i.e. n is odd
+//  ----------------  for even N (two elements of A consumed per loop iteration)
+	srli    a6, a6, 1 // counter a6 = a6/2. We have to do it only once
+	slli    a7, a7, 1 // counter a7 = a7*2. We have to do it only once
+	
+	// loop for M
+m_loop_mmult:
+	movi    a10, 0  // reset k loop counter
+	mov     a13, a3 // set pointer to the first column
+// loop for K
+k_loop_mmult:
+
+		addi     a12, a2, -4 // row start minus 4, because ldinc advances a12 by 4 before it loads
+
+		movi    a8, 0
+		wsr     a8, acchi
+		wsr     a15, acclo // initialize acc with shifted round value
+
+		loopnez a6, .loop_end_mmult // loop for N, a6 = n/2 iterations
+		.loop_mmult:
+			ldinc       m3, a12 // m3 = next 32-bit word (two int16) of the A row; NOTE(review): presumably needs 4-byte aligned rows - confirm
+			l16si       a8, a13, 0 // a8 = current element of the B column
+			add         a13, a13, a7 // step to the next row of B (a7 = k*2 bytes)
+			mula.ad.ll  a8, m3 // acc += low half of a8 * low half of m3
+			l16si       a8, a13, 0
+			add         a13, a13, a7            
+			mula.ad.lh  a8, m3 // acc += low half of a8 * high half of m3
+		.loop_end_mmult:
+
+		rsr     a8, acchi
+		rsr     a9, acclo
+		src     a8, a8, a9        // funnel shift of a8:a9 right by SAR = 15 - shift
+		s16i	a8, a4, 0 // store the low 16 bits as the result element
+		addi    a4, a4, 2
+		// check and increment for K
+		
+		addi    a10, a10, 2
+		add     a13, a3, a10 // move to the next column of B
+		bne     a10, a7, k_loop_mmult // a7 = k*2, so the loop ends after k columns
+
+		// Check and increment for M
+		add     a2, a2, a14 // move to the next row
+		addi    a5, a5, -1
+		bnez.n  a5, m_loop_mmult
+
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+even_N_samples: // NOTE: despite its name, this label is reached when n is odd (see bbsi above)
+//  ----------------  for odd N (one element of A consumed per loop iteration)
+	slli    a7, a7, 1 // counter a7 = a7*2. We have to do it only once
+	
+	// loop for M
+m_loop_mmult_even:
+	movi    a10, 0  // reset k loop counter
+	mov     a13, a3 // set pointer to the first column
+// loop for K
+k_loop_mmult_even:
+
+		mov     a12, a2     // every loop the same start position
+
+		movi    a8, 0
+		wsr     a8,  acchi
+		wsr     a15, acclo // initialize acc with shifted round value
+
+		loopnez a6, .loop_end_mmult_even // loop for N, a6 = n iterations
+		.loop_mmult_even:
+			l16si       a9, a12, 0 // a9 = element of the A row
+			l16si       a8, a13, 0 // a8 = element of the B column
+			addi        a12, a12, 2
+			add         a13, a13, a7 // step to the next row of B (a7 = k*2 bytes)
+			mula.aa.ll  a8, a9 // acc += low half of a8 * low half of a9
+		.loop_end_mmult_even:
+
+		rsr     a8, acchi
+		rsr     a9, acclo
+		src     a8, a8, a9        // funnel shift of a8:a9 right by SAR = 15 - shift
+		s16i	a8, a4, 0 // store the low 16 bits as the result element
+		addi    a4, a4, 2
+		// check and increment for K
+		
+		addi    a10, a10, 2
+		add     a13, a3, a10 // move to the next column of B
+		bne     a10, a7, k_loop_mmult_even // a7 = k*2, so the loop ends after k columns
+
+		// Check and increment for M
+		add     a2, a2, a14 // move to the next row
+		addi    a5, a5, -1
+		bnez.n  a5, m_loop_mmult_even
+
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+// The path where k == 1 (taken by the beq on a7 above)
+vector_mult:
+	dspm_mult_s16_m_ae32_vector;
+
+
+#endif // dspm_mult_s16_ae32_enabled
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/fixed/dspm_mult_s16_aes3.S
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/fixed/dspm_mult_s16_aes3.S
@@ -0,0 +1,142 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspm_mult_platform.h"
+#if (dspm_mult_s16_aes3_enabled == 1)
+#include "dsps_dotprod_s16_m_ae32.S"
+#include "dspm_mult_s16_m_ae32_vector.S"
+
+//esp_err_t dspm_mult_s16_aes3(const int16_t* A, const int16_t* B, int16_t* C, int m, int n, int k, int shift);
+
+// This is the matrix multiplication function for the ESP32 processor.
+	.text
+	.align	4
+	.literal_position
+	.literal	.LC0_1_38, 32767
+	.literal	.LC1_1_39, 16383
+
+	.global  dspm_mult_s16_aes3
+	.global .dspm_mult_s16_ae32_body
+	.type    dspm_mult_s16_aes3,@function
+
+dspm_mult_s16_aes3: 
+// On entry: a2 = A, a3 = B, a4 = C, a5 = m, a6 = n, a7 = k, shift is read from a1 + 80. Returns 0 in a2.
+	entry	a1,80                   	#  
+
+	movi.n	a10, 7
+	and a10, a10, a7 // a10 = k & 7
+	beqz  a10, .dspm_mult_s16_aes3_body // k is a multiple of 8: use the 8-lane path below
+	// Otherwise fall back to the ESP32 (ae32) body, which expects the same frame layout
+	J 	.dspm_mult_s16_ae32_body
+
+.dspm_mult_s16_aes3_body:
+	mov.n	a10,a4                  	# [0]  // a10 = write pointer into C
+	mov.n	a11,a5                  	# [1]  // a11 = m
+	l32i	a5,a1,80                 	# [2]  id:77 shift+0x0
+	s32i.n	a3,a1,32               	# [3]  gra_spill_temp_0 // a1+32 holds B
+	
+	bltz	a5,.Lt_0_6146            	# [4]  // negative shift: round value is derived from 16383 instead
+
+#.LBB3_dspm_mult_s16_aes3:	# 0x13
+	l32r	a9,.LC0_1_38             	# [0]  // a9 = 32767
+	ssr	a5                        	# [1]  // SAR = shift
+	sra	a9,a9                     	# [2]  // a9 = 32767 >> shift (round value)
+
+.LBB23_dspm_mult_s16_aes3:	# 0x1c // replicate the round value into 8 int16 slots at a1+0..a1+14
+	s16i	a9,a1,0                  	# [0]  id:78 round_data_64+0x0
+	s16i	a9,a1,2                  	# [1]  id:78 round_data_64+0x0
+	s16i	a9,a1,4                  	# [2]  id:78 round_data_64+0x0
+	s16i	a9,a1,6                  	# [3]  id:78 round_data_64+0x0
+	s16i	a9,a1,8                  	# [4]  id:78 round_data_64+0x0
+	s16i	a9,a1,10                 	# [5]  id:78 round_data_64+0x0
+	s16i	a9,a1,12                 	# [6]  id:78 round_data_64+0x0
+	s16i	a9,a1,14                 	# [7]  id:78 round_data_64+0x0
+
+	blti	a11,1,.Lt_0_7426         	# [0]  // m < 1: nothing to do, return 0
+
+	mov.n	a13,a2                  	# [0]  // a13 = start of the current row of A
+	slli	a4,a7,1                  	# [1]  // a4 = k*2, byte step between rows of B
+	mov.n	a12,a1                  	# [2]  // a12 = address of the round value block
+	l32i.n	a14,a1,32              	# [3]  gra_spill_temp_0
+	movi.n	a15,15                 	# [4]  
+	movi.n	a8,0                   	# [5]  
+	slli	a9,a6,1                  	# [6]  // a9 = n*2, byte size of one row of A
+	s32i.n	a9,a1,36               	# [7]  gra_spill_temp_1 // a1+36 holds n*2
+	s32i.n	a8,a1,44               	# [8]  gra_spill_temp_3 // a1+44 holds the row counter, starting at 0
+	sub	a15,a15,a5                	# [9]  // a15 = 15 - shift, the shift amount given to ee.srcmb below
+	addi.n	a8,a7,7                	# [10]  
+	movgez	a8,a7,a7               	# [11]  // keep k + 7 only when k is negative
+	srai	a8,a8,3                  	# [12]  // a8 = k / 8, rounded toward zero
+	s32i.n	a8,a1,40               	# [13]  gra_spill_temp_2 // a1+40 holds k / 8
+	slli	a8,a8,4                  	# [14]  // 16 bytes per group of 8 columns
+	add.n	a14,a14,a8              	# [15]  // a14 = B + (k/8)*16, end marker for the column loop
+
+.Lt_0_7938:	# 0x5d // ---- loop over the m rows of A
+	l32i.n	a8,a1,40               	# [0]  gra_spill_temp_2
+	beqz.n	a8,.Lt_0_8194          	# [2]  // no group of 8 columns: skip to the next row
+
+	l32i.n	a7,a1,32               	# [0]  gra_spill_temp_0 // a7 = pointer to the current 8-column group in row 0 of B
+	mov.n	a2,a13                  	# [1]  // a2 = read pointer into the current row of A
+
+.Lt_0_8706:	# 0x65 // ---- loop over groups of 8 columns
+	ee.ldqa.u16.128.ip	a12,0      	# [0]  id:80 // presumably preloads QACC from the round value block - confirm
+	ee.vldbc.16.ip	q1,a2,2        	# [1]  id:79 // presumably broadcasts one A element into q1 and advances a2 by 2 - confirm
+	mov.n	a3,a7                   	# [2]  // a3 = read pointer walking down the rows of B
+	ee.vld.128.xp	q0,a3,a4        	# [3]  id:81 // presumably loads 8 B values and advances a3 by k*2 - confirm
+	addi	a7,a7,16                 	# [4]  // next group of 8 columns
+	blti	a6,1,.Lt_0_8962          	# [5]  // n < 1: no multiply-accumulate steps
+
+	srai	a5,a6,1                  	# [0]  // a5 = n / 2 (shift is no longer needed in a5)
+	bbci	a6,0,.LBB68_dspm_mult_s16_aes3 	# [1]  // n even: skip the single extra step
+
+	ee.vmulas.s16.qacc.ldbc.incp	q1,a2,q0,q1 	# [0]  id:82
+	ee.vld.128.xp	q0,a3,a4        	# [1]  id:83
+
+.LBB68_dspm_mult_s16_aes3:	# 0x82
+	loopgtz	a5,.LBB74_dspm_mult_s16_aes3 	# [0]  // two multiply-accumulate steps per iteration
+
+.LBB64_dspm_mult_s16_aes3:	# 0x85
+	ee.vld.128.xp	q2,a3,a4        	# [0*II+0]  id:83
+	ee.vmulas.s16.qacc.ldbc.incp	q1,a2,q0,q1 	# [0*II+1]  id:82
+	ee.vld.128.xp	q0,a3,a4        	# [0*II+2]  id:83
+	ee.vmulas.s16.qacc.ldbc.incp	q1,a2,q2,q1 	# [0*II+3]  id:82
+
+.LBB74_dspm_mult_s16_aes3:	# 0x91
+
+.Lt_0_8962:	# 0x91
+	mov.n	a2,a13                  	# [0]  // rewind a2 to the start of the current row of A
+	ee.srcmb.s16.qacc	q0,a15,1    	# [1]  // presumably q0 = QACC lanes shifted right by a15 = 15 - shift - confirm
+	ee.vst.128.ip	q0,a10,16       	# [2]  id:85 // store 8 results to C, advance a10 by 16 bytes
+	bne	a7,a14,.Lt_0_8706         	# [3]  // until every group of 8 columns is done
+
+.Lt_0_8194:	# 0x9c
+	l32i.n	a8,a1,36               	# [0]  gra_spill_temp_1
+	l32i.n	a9,a1,44               	# [1]  gra_spill_temp_3
+	add.n	a13,a13,a8              	# [2]  // next row of A
+	addi.n	a9,a9,1                	# [3]  
+	s32i.n	a9,a1,44               	# [4]  gra_spill_temp_3
+	bne	a11,a9,.Lt_0_7938         	# [5]  // until the row counter reaches m
+
+.Lt_0_7426:	# 0xa9
+	movi.n	a2,0                   	# [0]  // return 0
+	retw.n                        	# [1]  
+
+.Lt_0_6146:	# 0xad // negative shift path
+	l32r	a9,.LC1_1_39             	# [0]  // a9 = 16383
+	ssr	a5                        	# [1]  
+	sra	a9,a9                     	# [2]  
+	j	.LBB23_dspm_mult_s16_aes3   	# [3] 
+
+
+#endif // dspm_mult_s16_aes3_enabled
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/fixed/dspm_mult_s16_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/fixed/dspm_mult_s16_ansi.c
@@ -0,0 +1,40 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dsps_dotprod.h"
+#include "dspm_mult.h"
+
+// Fixed-point matrix product C(m,k) = A(m,n) * B(n,k); m - number of rows, n - number of columns of A.
+// c(i,j) = (round + sum(a(i,s) * b(s,j), s = 0..n-1)) scaled by 2^(shift - 15), round = 0x7fff >> shift.
+// Always returns ESP_OK; pointers and sizes are not validated.
+esp_err_t dspm_mult_s16_ansi(const int16_t *A, const int16_t *B, int16_t *C, int m, int n, int k, int shift)
+{
+    int final_shift = shift - 15;
+    for (int i = 0 ; i < m ; i++) {
+        for (int j = 0 ; j < k ; j++) {
+            // Seed the accumulator with the rounding term so the final right shift rounds.
+            long long acc = 0x7fff >> shift;
+            for (int s = 0; s < n ; s++) {
+                acc += (int32_t)A[i * n + s] * (int32_t)B[s * k + j];
+            }
+            if (final_shift > 0) {
+                // Shift as unsigned: left-shifting a negative signed value is undefined behaviour in C.
+                C[i * k + j] = (int16_t)((unsigned long long)acc << final_shift);
+            } else {
+                C[i * k + j] = (int16_t)(acc >> (-final_shift));
+            }
+        }
+    }
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/fixed/dspm_mult_s16_arp4.S
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/fixed/dspm_mult_s16_arp4.S
@@ -0,0 +1,121 @@
+// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspm_mult_platform.h"
+#if (dspm_mult_s16_arp4_enabled == 1)
+
+// This is the matrix multiplication function for the RISC-V processor core.
+    .text
+    .align  4
+    .global dspm_mult_s16_arp4
+    .global dspm_mult_s16_ansi  
+    .global .dspm_mult_s16_arp4_body
+    .type   dspm_mult_s16_arp4,@function
+// The function implements the following C code:
+// esp_err_t dspm_mult_s16_ansi(const int16_t *A, const int16_t *B, int16_t *C, int m, int n, int k, int shift)
+// {
+//    int final_shift = shift - 15;
+//    for (int i = 0 ; i < m ; i++) {
+//        for (int j = 0 ; j < k ; j++) {
+//            // This code also could be used
+//            //dsps_dotprode_f32_ae32(&A[i*n],&B[j],&C[i*k + j],n,1,n);
+//            long long acc = 0x7fff >> shift;
+//            for (int s = 0; s < n ; s++) {
+//                acc += (int32_t)A[i * n + s] * (int32_t)B[s * k + j];
+//            }
+//            if (final_shift > 0) {
+//                C[i * k + j] = (acc << final_shift);
+//            } else {
+//                C[i * k + j] = (acc >> (-final_shift));
+//            }
+//        }
+//    }
+//     return ESP_OK;
+// }
+
+dspm_mult_s16_arp4: 
+// A - a0
+// B - a1
+// C - a2
+// m - a3
+// n - a4
+// k - a5
+// shift - a6
+// Returns ESP_OK (0) in a0 from the vector path; other sizes tail-jump to dspm_mult_s16_ansi.
+// a7 - counter loop1: 0..m
+// t1 - counter loop2: 0..k
+// t0 - counter loop3: 0..n
+// x25(s9) - matrix step for input2
+// x24(s8) - pointer to current B
+// x29(t4) - pointer to initial B
+// x30(t5) - pointer to A
+// x31(t6) = 2 for increment....
+// x26(s10)- final_shift
+
+    or      t0, a3, a4
+    or      t0, t0, a5
+    andi    t0, t0, 0x7         // low 3 bits of (m | n | k)
+    beqz    t0, .dspm_mult_s16_arp4_body // vector path only when m, n and k are all multiples of 8
+    j   dspm_mult_s16_ansi      // otherwise tail-jump to the C implementation with the arguments untouched
+    //ret
+
+.dspm_mult_s16_arp4_body:
+    add sp,sp,-16               // room for the callee-saved s8, s9, s10 used below
+    sw  s8, 4(sp)
+    sw  s9, 8(sp)
+    sw  s10, 12(sp)
+    mv      t0, a4              // t0 = n, iteration count of the inner hardware loop
+    li      a7, 0  // counter loop1
+    slli    x25, a5, 1 // step = k*2, byte step between rows of B
+    li      x31, 2
+    // final_shift = shift - 15
+    add     x26, a6, -15
+
+.dpf_loop1: // loop for m
+    li      t1, 0 // reset counter for loop2
+    mv      x29, a1             // restart at the first column group of B
+.dpf_loop2: // loop for k
+        mv  x30, a0             // restart at the beginning of the current row of A
+        mv  x24, x29        // load B
+        // Calculating dotproduct...
+        esp.zero.qacc                       // qacc = 0;
+        esp.vldbc.16.xp     q0, x30, x31    // q0 = a[mx..mx]
+        esp.vld.128.xp      q1, x24, x25    // q1 = b[x0..x7],
+        esp.lp.setup    0, t0, .matrix_mul_loop // hardware loop, count t0; presumably ends with the instruction at the label - confirm
+            esp.vmulas.s16.qacc.ldbc.incp   q0,x30,     q0,q1
+        .matrix_mul_loop:   esp.vld.128.xp  q1,x24,x25
+            
+        esp.srcmb.s16.qacc  q2, x26, 0          //   q2 = qacc >> shift
+        esp.vst.128.ip      q2, a2, 16          //  save k0..k7
+        add     x29,x29, 16     // next group of 8 columns of B
+
+        // check loop 2
+        addi  t1, t1, 8 // Increment loop2 counter
+        blt   t1, a5, .dpf_loop2
+    add   x30, x30, -2          // presumably undoes the one extra element step of the broadcast loads - confirm
+    mv    a0, x30   // a0 = pointer used as the start of the next row of A
+
+    // check loop 1
+    add   a7, a7, 1 // Increment loop1 counter
+    blt   a7, a3, .dpf_loop1
+
+    // Exit
+    li  a0, 0       // return status ESP_OK (0); the shift argument in a6 must not be returned
+    lw  s10, 12(sp)
+    lw  s9, 8(sp)
+    lw  s8, 4(sp)
+    add sp,sp,16
+    ret
+
+#endif //dspm_mult_s16_arp4_enabled
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/fixed/dspm_mult_s16_m_ae32.S
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/fixed/dspm_mult_s16_m_ae32.S
@@ -0,0 +1,58 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+
+.macro dspm_mult_s16_ae32_MxNxN
+// A - a2
+// B - a3
+// C - a4
+// m - a5
+// n - a6
+// k - a7
+// shift - stack (a8) 
+// NOTE(review): both paths end in retw.n, and do_dotproduct is a plain (non-local) label.
+	movi  a10, 4 // load 4 as a constant
+	// NOTE(review): the branch below is taken when n < 4, which contradicts "n >= 4 is accelerated" - confirm intent
+	blt   a6, a10, do_dotproduct 
+	// Here we make operations one by one...
+	// NOTE(review): nothing is computed on this fall-through (n >= 4) path; it only returns 0
+
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+do_dotproduct:
+
+		mov  a12, a2 // a12 = A
+		mov  a13, a3 // a13 = B
+		
+		srli a9, a6, 2  // a9 - count/4 - 1
+		addi a9, a9, -1
+
+		movi.n	a10, 0 // load 0 to the a10 to increment second array
+		dotprod_s16_ae32_full a12, a13, a9, a10, a6 // helper macro defined outside this file; argument meaning not visible here
+
+		/* Get accumulator */
+		ssr a6 // SAR = n; NOTE(review): other routines in this module use 15 - shift here - confirm
+		rsr a2, acchi
+		rsr a3, acclo
+		src a2, a2, a3 // funnel shift of a2:a3 right by SAR
+		
+		s16i	a2, a4, 0 // store a single int16 result at C
+		movi.n	a2, 0
+
+
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+.endm // dspm_mult_s16_ae32_MxNxN
--- a/managed_components/espressif__esp-dsp/modules/matrix/mul/fixed/dspm_mult_s16_m_ae32_vector.S
+++ b/managed_components/espressif__esp-dsp/modules/matrix/mul/fixed/dspm_mult_s16_m_ae32_vector.S
@@ -0,0 +1,105 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+
+.macro dspm_mult_s16_m_ae32_vector
+// m - a5 - any > 0
+// n - a6 - 1,2,3, any
+// k - a7 - 1, any
+// Expects a8, a9, a14, a15 and SAR to be set up by the invoking code; a7 is overwritten below.
+// Both paths end in retw.n; labels are plain (non-local), so a second expansion in one file would redefine them.
+	// Define path for n < 4
+	movi a7, 4
+	blt  a6, a7, small_process_loop // jump for n < 4
+
+	srli a7, a6,  2
+	addi a7, a7, -1 // a7 = n/4 - 1, passed to the dot product macro
+ 
+
+mmultv_loop1:
+	wsr a8, acchi
+	wsr a15, acclo // initialize acc with shifted round value
+
+	// Register state at this point
+	// a2 - A
+	// a3 - B
+	// a4 - C
+	// a6 - n
+	// a7 - n/4 - 1
+	// a8 - 0
+	// a15- 0x7fff>>shift
+
+		mov      a12, a2 // load A
+		mov      a13, a3 // Load B
+
+		dotprod_s16_ae32_full a12, a13, a7, a6 // helper macro defined outside this file; presumably accumulates the n-element dot product - confirm
+
+	// check loop 1
+		/* Get accumulator */
+		rsr a12, acchi
+		rsr a13, acclo
+		src a12, a12, a13 // funnel shift of a12:a13 right by SAR
+		
+		s16i	a12, a4, 0 // store one int16 result
+		addi    a4, a4, 2
+
+		add.n   a2, a2, a14 // Increment A, A = A[i*n]
+		addi    a9, a9, 1 // Increment loop1 counter    
+	blt     a9, a5, mmultv_loop1 // repeat until the counter reaches m; B (a3) is reused for every row
+
+
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+small_process_loop:
+
+	wsr a8, acchi
+	wsr a15, acclo // initialize acc with shifted round value
+
+	mov      a12, a2 // load A
+	mov      a13, a3 // Load B
+
+	addi  a12, a12, -4 // To arrange first pointer (ldinc advances by 4 before loading)
+	addi  a13, a13, -4 // To arrange first pointer (ldinc advances by 4 before loading)
+
+		bbci  a6, 1, .mod2chk_short // bit 1 of n clear: no pair of elements to process
+		ldinc m0, a12
+		ldinc m2, a13
+		mula.dd.hh m0, m2 // acc += high half of m0 * high half of m2
+		mula.dd.ll m0, m2 // acc += low half of m0 * low half of m2
+	.mod2chk_short:
+		bbci  a6, 0, .mod1chk_short // bit 0 of n clear: no single trailing element
+		ldinc m0, a12 // NOTE(review): 32-bit load of which only the low half is used; appears to read 2 bytes past the row - confirm
+		ldinc m2, a13
+		mula.dd.ll m0, m2
+	.mod1chk_short:
+
+	// check loop 1
+		/* Get accumulator */
+		rsr a12, acchi
+		rsr a13, acclo
+		src a12, a12, a13 // funnel shift of a12:a13 right by SAR
+		
+		s16i	a12, a4, 0 // store one int16 result
+		addi     a4, a4, 2
+
+		add.n   a2, a2, a14 // Increment A, A = A[i*n]
+		addi    a9, a9, 1 // Increment loop1 counter    
+	blt     a9, a5, small_process_loop // repeat until the counter reaches m
+
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+
+.endm // dspm_mult_s16_m_ae32_vector