add some code
This commit is contained in:
@@ -0,0 +1,174 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspm_mult_platform.h"
|
||||
#if (dspm_mult_s16_ae32_enabled == 1)
|
||||
|
||||
#include "dsps_dotprod_s16_m_ae32.S"
|
||||
#include "dspm_mult_s16_m_ae32_vector.S"
|
||||
//esp_err_t dspm_mult_s16_ae32(const int16_t* A, const int16_t* B, int16_t* C, int m, int n, int k, int shift);
|
||||
|
||||
// This is the matrix multiplication function for the ESP32 processor.
|
||||
.text
|
||||
.align 4
|
||||
.global dspm_mult_s16_ae32
|
||||
.global .dspm_mult_s16_ae32_body
|
||||
.type dspm_mult_s16_ae32,@function
|
||||
|
||||
dspm_mult_s16_ae32:
// int16 matrix multiplication C(m,k) = A(m,n) * B(n,k) using the MAC16 unit.
//
// Arguments (windowed ABI):
//   a2 - A            a5 - m (must be > 0)
//   a3 - B            a6 - n
//   a4 - C            a7 - k (must be > 0; k == 1 takes the vector path)
//   shift - 7th argument, read from the stack at a1 + 80
//
// Values prepared by the common set-up below and relied on by every path:
//   a8  - 0
//   a9  - 0 (row counter used by the k == 1 macro)
//   a14 - n * 2, the byte stride between two rows of A
//   a15 - 0x7fff >> shift, the rounding value preloaded into ACCLO
//   SAR - 15 - shift, the amount used by the final `src`
//
// Returns 0 (ESP_OK) in a2.

    entry   a1, 80

// Second entry point: declared .global so code that already executed an
// identical `entry a1, 80` can jump straight here.
.dspm_mult_s16_ae32_body:
    l32i.n  a8, a1, 80                  // a8 = shift

    // Rounding value: a15 = 0x7fff >> shift
    ssr     a8                          // SAR = shift
    movi    a15, 0x7fff
    srl     a15, a15

    // Final scaling amount: SAR = 15 - shift
    neg     a8, a8
    addi    a8, a8, 15
    ssr     a8
    movi    a8, 0                       // a8 = 0 (written to ACCHI later)

    slli    a14, a6, 1                  // a14 = n * sizeof(int16_t)
    movi.n  a10, 2                      // reset to 0 before use on both k > 1 paths
    movi.n  a9, 0                       // row counter for the k == 1 path

    movi    a12, 1
    beq     a7, a12, mmult_s16_k1_vector    // k == 1: B is a column vector

    // ----- k > 1 -----
    // a2, a3, a4 - A (current row), B, C (next output element)
    // a5  - rows still to process
    // a8, a9 - scratch
    // a10 - byte offset of the current column of B
    // a12 - walks along the current row of A
    // a13 - walks down the current column of B
    bbsi    a6, 0, mmult_s16_odd_n      // bit 0 of n set -> n is odd

    // ---------------- n is even: two products per inner iteration
    srli    a6, a6, 1                   // a6 = n / 2 = inner loop count (done once)
    slli    a7, a7, 1                   // a7 = k * 2 = byte stride of a B row (done once)

mmult_s16_even_n_row:                   // for each row of A
    movi    a10, 0                      // start again at column 0
    mov     a13, a3                     // a13 -> B[0][0]

mmult_s16_even_n_col:                   // for each column of B
    addi    a12, a2, -4                 // ldinc adds 4 before it loads

    movi    a8, 0
    wsr     a8, acchi
    wsr     a15, acclo                  // ACC = rounding value

    loopnez a6, .mmult_s16_even_n_dot_end   // n / 2 iterations
.mmult_s16_even_n_dot:
    ldinc   m3, a12                     // m3 = two consecutive A elements (32-bit load)
    l16si   a8, a13, 0                  // a8 = B element
    add     a13, a13, a7                // next row of B
    mula.ad.ll a8, m3                   // ACC += a8.lo * m3.lo
    l16si   a8, a13, 0
    add     a13, a13, a7
    mula.ad.lh a8, m3                   // ACC += a8.lo * m3.hi
.mmult_s16_even_n_dot_end:

    rsr     a8, acchi
    rsr     a9, acclo
    src     a8, a8, a9                  // (ACCHI:ACCLO) >> SAR
    s16i    a8, a4, 0                   // store the C element
    addi    a4, a4, 2

    // advance to the next column
    addi    a10, a10, 2
    add     a13, a3, a10                // a13 -> B[0][column]
    bne     a10, a7, mmult_s16_even_n_col

    // advance to the next row
    add     a2, a2, a14
    addi    a5, a5, -1
    bnez.n  a5, mmult_s16_even_n_row

    movi.n  a2, 0                       // return status ESP_OK
    retw.n

mmult_s16_odd_n:
    // ---------------- n is odd: one product per inner iteration
    slli    a7, a7, 1                   // a7 = k * 2 = byte stride of a B row (done once)

mmult_s16_odd_n_row:                    // for each row of A
    movi    a10, 0                      // start again at column 0
    mov     a13, a3                     // a13 -> B[0][0]

mmult_s16_odd_n_col:                    // for each column of B
    mov     a12, a2                     // a12 -> first element of the A row

    movi    a8, 0
    wsr     a8, acchi
    wsr     a15, acclo                  // ACC = rounding value

    loopnez a6, .mmult_s16_odd_n_dot_end    // n iterations
.mmult_s16_odd_n_dot:
    l16si   a9, a12, 0                  // a9 = A element
    l16si   a8, a13, 0                  // a8 = B element
    addi    a12, a12, 2
    add     a13, a13, a7                // next row of B
    mula.aa.ll a8, a9                   // ACC += a8.lo * a9.lo
.mmult_s16_odd_n_dot_end:

    rsr     a8, acchi
    rsr     a9, acclo
    src     a8, a8, a9                  // (ACCHI:ACCLO) >> SAR
    s16i    a8, a4, 0                   // store the C element
    addi    a4, a4, 2

    // advance to the next column
    addi    a10, a10, 2
    add     a13, a3, a10                // a13 -> B[0][column]
    bne     a10, a7, mmult_s16_odd_n_col

    // advance to the next row
    add     a2, a2, a14
    addi    a5, a5, -1
    bnez.n  a5, mmult_s16_odd_n_row

    movi.n  a2, 0                       // return status ESP_OK
    retw.n

// k == 1: every output element is a plain dot product of a row of A with B.
// The macro contains its own `retw.n`.
mmult_s16_k1_vector:
    dspm_mult_s16_m_ae32_vector;
|
||||
|
||||
|
||||
#endif // dspm_mult_s16_ae32_enabled
|
||||
@@ -0,0 +1,142 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspm_mult_platform.h"
|
||||
#if (dspm_mult_s16_aes3_enabled == 1)
|
||||
#include "dsps_dotprod_s16_m_ae32.S"
|
||||
#include "dspm_mult_s16_m_ae32_vector.S"
|
||||
|
||||
//esp_err_t dspm_mult_s16_aes3(const int16_t* A, const int16_t* B, int16_t* C, int m, int n, int k, int shift);
|
||||
|
||||
// This is the matrix multiplication function for the aes3 target; it falls back to the ESP32 (ae32) body when k is not a multiple of 8.
|
||||
.text
|
||||
.align 4
|
||||
.literal_position
|
||||
.literal .LC0_1_38, 32767
|
||||
.literal .LC1_1_39, 16383
|
||||
|
||||
.global dspm_mult_s16_aes3
|
||||
.global .dspm_mult_s16_ae32_body
|
||||
.type dspm_mult_s16_aes3,@function
|
||||
|
||||
dspm_mult_s16_aes3:
// int16 matrix multiplication C(m,k) = A(m,n) * B(n,k), eight output columns
// at a time, using the ee.* vector instructions.
//
// Arguments (windowed ABI):
//   a2 - A    a3 - B    a4 - C    a5 - m    a6 - n    a7 - k
//   shift - 7th argument, read from the stack at a1 + 80
//
// Stack frame usage (80 bytes):
//   a1 +  0..15 - eight copies of the 16-bit rounding value (loaded into QACC)
//   a1 + 32     - B
//   a1 + 36     - n * 2, byte stride of an A row
//   a1 + 40     - k / 8, number of 8-column blocks
//   a1 + 44     - row counter
//
// Returns 0 (ESP_OK) in a2.

    entry   a1, 80

    // The vector body handles 8 columns per step; any other k is passed to the
    // ESP32 implementation, which expects the same 80-byte frame.
    movi.n  a10, 7
    and     a10, a10, a7                // a10 = k & 7
    beqz    a10, .dspm_mult_s16_aes3_body
    j       .dspm_mult_s16_ae32_body    // k is not a multiple of 8

.dspm_mult_s16_aes3_body:
    mov.n   a10, a4                     // a10 = output pointer into C
    mov.n   a11, a5                     // a11 = m
    l32i    a5, a1, 80                  // a5 = shift
    s32i.n  a3, a1, 32                  // spill B

    bltz    a5, .Laes3_neg_shift

    // shift >= 0: rounding value = 32767 >> shift
    l32r    a9, .LC0_1_38
    ssr     a5
    sra     a9, a9

.Laes3_store_round:
    // Replicate the rounding value into eight 16-bit lanes at the frame base
    s16i    a9, a1, 0
    s16i    a9, a1, 2
    s16i    a9, a1, 4
    s16i    a9, a1, 6
    s16i    a9, a1, 8
    s16i    a9, a1, 10
    s16i    a9, a1, 12
    s16i    a9, a1, 14

    blti    a11, 1, .Laes3_done         // m < 1: nothing to compute

    mov.n   a13, a2                     // a13 = start of the current A row
    slli    a4, a7, 1                   // a4 = k * 2, byte stride of a B row
    mov.n   a12, a1                     // a12 = address of the rounding lanes
    l32i.n  a14, a1, 32                 // a14 = B
    movi.n  a15, 15
    movi.n  a8, 0
    slli    a9, a6, 1                   // a9 = n * 2
    s32i.n  a9, a1, 36                  // spill the A row stride
    s32i.n  a8, a1, 44                  // row counter = 0
    sub     a15, a15, a5                // a15 = 15 - shift, passed to ee.srcmb
    addi.n  a8, a7, 7
    movgez  a8, a7, a7                  // a8 = k when k >= 0, otherwise k + 7
    srai    a8, a8, 3                   // a8 = k / 8 (rounded toward zero)
    s32i.n  a8, a1, 40                  // spill the block count
    slli    a8, a8, 4                   // 16 bytes per 8-column block
    add.n   a14, a14, a8                // a14 = B + (k / 8) * 16, end marker for the block loop

.Laes3_row_loop:                        // for each row of A
    l32i.n  a8, a1, 40
    beqz.n  a8, .Laes3_row_next         // no 8-column blocks at all

    l32i.n  a7, a1, 32                  // a7 = B, pointer to the current column block
    mov.n   a2, a13                     // a2 = walking pointer into the A row

.Laes3_block_loop:                      // for each block of 8 columns
    ee.ldqa.u16.128.ip a12, 0           // QACC = rounding lanes (a12 is not advanced)
    ee.vldbc.16.ip q1, a2, 2            // q1 = first A element broadcast, a2 += 2
    mov.n   a3, a7                      // a3 walks down the B rows of this block
    ee.vld.128.xp q0, a3, a4            // q0 = 8 B values of row 0, a3 += row stride
    addi    a7, a7, 16                  // a7 -> next column block
    blti    a6, 1, .Laes3_block_store   // n < 1: store the rounding-only result

    srai    a5, a6, 1                   // a5 = n / 2, iterations of the paired loop
    bbci    a6, 0, .Laes3_pairs         // n even: go straight to the paired loop

    // n odd: consume one product so that an even count remains
    ee.vmulas.s16.qacc.ldbc.incp q1, a2, q0, q1    // QACC += q0 * q1; q1 = next A element
    ee.vld.128.xp q0, a3, a4

.Laes3_pairs:
    loopgtz a5, .Laes3_pairs_end

.Laes3_pair_body:
    // Two multiply-accumulates per iteration; q0 and q2 alternate as the B row
    // register so that each load runs ahead of the multiply that uses it.
    ee.vld.128.xp q2, a3, a4
    ee.vmulas.s16.qacc.ldbc.incp q1, a2, q0, q1
    ee.vld.128.xp q0, a3, a4
    ee.vmulas.s16.qacc.ldbc.incp q1, a2, q2, q1

.Laes3_pairs_end:

.Laes3_block_store:
    mov.n   a2, a13                     // rewind to the start of the A row
    ee.srcmb.s16.qacc q0, a15, 1        // q0 = QACC scaled by a15 (15 - shift); last operand kept as generated
    ee.vst.128.ip q0, a10, 16           // store 8 results, a10 += 16
    bne     a7, a14, .Laes3_block_loop

.Laes3_row_next:
    l32i.n  a8, a1, 36                  // A row stride
    l32i.n  a9, a1, 44                  // row counter
    add.n   a13, a13, a8                // next A row
    addi.n  a9, a9, 1
    s32i.n  a9, a1, 44
    bne     a11, a9, .Laes3_row_loop

.Laes3_done:
    movi.n  a2, 0                       // return status ESP_OK
    retw.n

.Laes3_neg_shift:
    // shift < 0: start from 16383; `ssr` receives the negative value unchanged.
    // NOTE(review): the effective shift amount for a negative a5 depends on how
    // SSR truncates its operand - confirm the intended rounding value.
    l32r    a9, .LC1_1_39
    ssr     a5
    sra     a9, a9
    j       .Laes3_store_round
|
||||
|
||||
|
||||
#endif // dspm_mult_s16_aes3_enabled
|
||||
@@ -0,0 +1,40 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_dotprod.h"
|
||||
#include "dspm_mult.h"
|
||||
|
||||
// Matrinx A(m,n), m - amount or rows, n - amount of columns
|
||||
// C(m,k) = A(m,n)*B(n,k)
|
||||
// c(i,j) = sum(a(i,s)*b(s,j)) , s=1..n
|
||||
esp_err_t dspm_mult_s16_ansi(const int16_t *A, const int16_t *B, int16_t *C, int m, int n, int k, int shift)
|
||||
{
|
||||
int final_shift = shift - 15;
|
||||
for (int i = 0 ; i < m ; i++) {
|
||||
for (int j = 0 ; j < k ; j++) {
|
||||
// This code also could be used
|
||||
//dsps_dotprode_f32_ae32(&A[i*n],&B[j],&C[i*k + j],n,1,n);
|
||||
long long acc = 0x7fff >> shift;
|
||||
for (int s = 0; s < n ; s++) {
|
||||
acc += (int32_t)A[i * n + s] * (int32_t)B[s * k + j];
|
||||
}
|
||||
if (final_shift > 0) {
|
||||
C[i * k + j] = (acc << final_shift);
|
||||
} else {
|
||||
C[i * k + j] = (acc >> (-final_shift));
|
||||
}
|
||||
}
|
||||
}
|
||||
return ESP_OK;
|
||||
}
|
||||
@@ -0,0 +1,121 @@
|
||||
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspm_mult_platform.h"
|
||||
#if (dspm_mult_s16_arp4_enabled == 1)
|
||||
|
||||
// This is the matrix multiplication function for the RISC-V processor core.
|
||||
.text
|
||||
.align 4
|
||||
.global dspm_mult_s16_arp4
|
||||
.global dspm_mult_s16_ansi
|
||||
.global .dspm_mult_s16_arp4_body
|
||||
.type dspm_mult_s16_arp4,@function
|
||||
// The function implements the following C code:
|
||||
// esp_err_t dspm_mult_s16_ansi(const int16_t *A, const int16_t *B, int16_t *C, int m, int n, int k, int shift)
|
||||
// {
|
||||
// int final_shift = shift - 15;
|
||||
// for (int i = 0 ; i < m ; i++) {
|
||||
// for (int j = 0 ; j < k ; j++) {
|
||||
// // This code also could be used
|
||||
// //dsps_dotprode_f32_ae32(&A[i*n],&B[j],&C[i*k + j],n,1,n);
|
||||
// long long acc = 0x7fff >> shift;
|
||||
// for (int s = 0; s < n ; s++) {
|
||||
// acc += (int32_t)A[i * n + s] * (int32_t)B[s * k + j];
|
||||
// }
|
||||
// if (final_shift > 0) {
|
||||
// C[i * k + j] = (acc << final_shift);
|
||||
// } else {
|
||||
// C[i * k + j] = (acc >> (-final_shift));
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// return ESP_OK;
|
||||
// }
|
||||
|
||||
dspm_mult_s16_arp4:
// int16 matrix multiplication C(m,k) = A(m,n) * B(n,k), eight output columns
// at a time, using the esp.* vector instructions.
//
// Arguments:
//   a0 - A    a1 - B    a2 - C    a3 - m    a4 - n    a5 - k    a6 - shift
//
// Register use inside the vector body:
//   a7       - row counter, 0..m
//   t1       - column counter, 0..k in steps of 8
//   t0       - n, iteration count of the hardware loop
//   s9  (x25) - k * 2, byte stride between rows of B
//   s8  (x24) - pointer walking down the rows of B
//   t4  (x29) - pointer to the current 8-column block in row 0 of B
//   t5  (x30) - pointer walking along the current row of A
//   t6  (x31) - 2, post-increment for the A pointer
//   s10 (x26) - final_shift = shift - 15
//
// Returns 0 (ESP_OK) in a0 from the vector body, or whatever
// dspm_mult_s16_ansi returns when the call is forwarded to it.

    // The loops below run at least once and store to C, so empty or negative
    // dimensions are forwarded to the reference implementation.
    blez    a3, .dspm_mult_s16_arp4_ansi    // m <= 0
    blez    a4, .dspm_mult_s16_arp4_ansi    // n <= 0
    blez    a5, .dspm_mult_s16_arp4_ansi    // k <= 0

    // The vector body is only entered when m, n and k are all multiples of 8.
    or      t0, a3, a4
    or      t0, t0, a5
    andi    t0, t0, 0x7
    beqz    t0, .dspm_mult_s16_arp4_body

.dspm_mult_s16_arp4_ansi:
    j       dspm_mult_s16_ansi              // tail call, arguments untouched

.dspm_mult_s16_arp4_body:
    addi    sp, sp, -16
    sw      s8, 4(sp)
    sw      s9, 8(sp)
    sw      s10, 12(sp)
    mv      t0, a4                          // hardware loop count = n
    li      a7, 0                           // row counter
    slli    s9, a5, 1                       // B row stride in bytes
    li      t6, 2
    // final_shift = shift - 15
    // NOTE(review): the accumulator is cleared instead of being preloaded with
    // the 0x7fff >> shift rounding value used by the C reference, and the sign
    // of this amount is the opposite of the (15 - shift) used by the aes3
    // variant - confirm both against the esp.srcmb.s16.qacc definition.
    addi    s10, a6, -15

.dpf_loop1:                                 // for each row of A
    li      t1, 0                           // restart the column counter
    mv      t4, a1                          // first column block of B
.dpf_loop2:                                 // for each block of 8 columns
    mv      t5, a0                          // start of the current A row
    mv      s8, t4                          // top of the current column block
    // Dot product of the A row with 8 columns of B
    esp.zero.qacc                           // qacc = 0
    esp.vldbc.16.xp q0, t5, t6              // q0 = A element broadcast, t5 += 2
    esp.vld.128.xp  q1, s8, s9              // q1 = 8 B values, s8 += row stride
    esp.lp.setup    0, t0, .matrix_mul_loop
    esp.vmulas.s16.qacc.ldbc.incp q0, t5, q0, q1
.matrix_mul_loop: esp.vld.128.xp q1, s8, s9

    esp.srcmb.s16.qacc q2, s10, 0           // q2 = qacc scaled by final_shift
    esp.vst.128.ip  q2, a2, 16              // store 8 results, a2 += 16
    addi    t4, t4, 16                      // next column block

    // column loop check
    addi    t1, t1, 8
    blt     t1, a5, .dpf_loop2

    // The preload plus n loop iterations advanced t5 by n + 1 elements; step
    // back one element to land on the start of the next A row.
    addi    t5, t5, -2
    mv      a0, t5

    // row loop check
    addi    a7, a7, 1
    blt     a7, a3, .dpf_loop1

    // Exit
    li      a0, 0                           // return status ESP_OK (was returning shift)
    lw      s10, 12(sp)
    lw      s9, 8(sp)
    lw      s8, 4(sp)
    addi    sp, sp, 16
    ret
|
||||
|
||||
#endif //dspm_mult_s16_arp4_enabled
|
||||
@@ -0,0 +1,58 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
.macro dspm_mult_s16_ae32_MxNxN
// Register contents expected on entry:
//   a2 - A    a3 - B    a4 - C
//   a5 - m    a6 - n    a7 - k
//   shift - on the stack (a8)
//
// NOTE(review): as written, n >= 4 returns ESP_OK without computing anything
// and only n < 4 reaches the dot-product code, which contradicts the original
// "n >= 4 then acceleration is possible" remark. The macro looks unfinished -
// confirm before using it.

    movi    a10, 4                      // threshold for n
    blt     a6, a10, do_dotproduct      // n < 4: take the dot-product path

    // n >= 4: nothing is computed here
    movi.n  a2, 0                       // return status ESP_OK
    retw.n

do_dotproduct:
    mov     a12, a2                     // a12 = A
    mov     a13, a3                     // a13 = B

    srli    a9, a6, 2                   // a9 = n / 4 - 1
    addi    a9, a9, -1

    movi.n  a10, 0                      // 0 passed as the increment for the second array
    dotprod_s16_ae32_full a12, a13, a9, a10, a6     // macro defined outside this file

    /* Fetch the accumulator; note that SAR is loaded from n (a6) here */
    ssr     a6
    rsr     a2, acchi
    rsr     a3, acclo
    src     a2, a2, a3

    s16i    a2, a4, 0                   // store a single result at C[0]
    movi.n  a2, 0

    movi.n  a2, 0                       // return status ESP_OK
    retw.n

.endm // dspm_mult_s16_ae32_MxNxN
|
||||
@@ -0,0 +1,105 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
.macro dspm_mult_s16_m_ae32_vector
// k == 1 path of the int16 matrix multiplication: every output element is the
// dot product of one row of A with the vector B.
//
// Register contents expected on entry (set up by the invoking function):
//   a2  - A (current row)       a5  - m, any value > 0
//   a3  - B                     a6  - n (1, 2, 3 or larger)
//   a4  - C (next output)       a7  - k, overwritten below
//   a8  - 0                     a9  - row counter, starts at 0
//   a14 - byte stride between rows of A
//   a15 - rounding value, 0x7fff >> shift
//   SAR - shift amount for the final `src`
//
// Both paths end with their own `retw.n`, returning 0 (ESP_OK) in a2.

    // Rows shorter than 4 elements are handled without the dot-product macro
    movi    a7, 4
    blt     a6, a7, small_process_loop  // n < 4

    srli    a7, a6, 2                   // a7 = n / 4 - 1, passed to the dot-product macro
    addi    a7, a7, -1

mmultv_loop1:                           // for each row of A
    wsr     a8, acchi                   // ACCHI = 0
    wsr     a15, acclo                  // ACCLO = rounding value

    mov     a12, a2                     // a12 = row of A
    mov     a13, a3                     // a13 = B

    dotprod_s16_ae32_full a12, a13, a7, a6      // macro defined outside this file

    /* Scale the accumulator and store the result */
    rsr     a12, acchi
    rsr     a13, acclo
    src     a12, a12, a13               // (ACCHI:ACCLO) >> SAR

    s16i    a12, a4, 0
    addi    a4, a4, 2

    add.n   a2, a2, a14                 // next row of A
    addi    a9, a9, 1                   // row counter
    blt     a9, a5, mmultv_loop1

    movi.n  a2, 0                       // return status ESP_OK
    retw.n

small_process_loop:                     // for each row of A, n < 4
    wsr     a8, acchi                   // ACCHI = 0
    wsr     a15, acclo                  // ACCLO = rounding value

    mov     a12, a2                     // a12 = row of A
    mov     a13, a3                     // a13 = B

    addi    a12, a12, -4                // ldinc adds 4 before it loads
    addi    a13, a13, -4

    bbci    a6, 1, .mod2chk_short       // bit 1 of n clear: no element pair
    ldinc   m0, a12                     // two A elements
    ldinc   m2, a13                     // two B elements
    mula.dd.hh m0, m2                   // ACC += high halves
    mula.dd.ll m0, m2                   // ACC += low halves
.mod2chk_short:
    bbci    a6, 0, .mod1chk_short       // bit 0 of n clear: no single element left
    ldinc   m0, a12
    ldinc   m2, a13
    mula.dd.ll m0, m2                   // only the low halves are used
.mod1chk_short:

    /* Scale the accumulator and store the result */
    rsr     a12, acchi
    rsr     a13, acclo
    src     a12, a12, a13               // (ACCHI:ACCLO) >> SAR

    s16i    a12, a4, 0
    addi    a4, a4, 2

    add.n   a2, a2, a14                 // next row of A
    addi    a9, a9, 1                   // row counter
    blt     a9, a5, small_process_loop

    movi.n  a2, 0                       // return status ESP_OK
    retw.n

.endm // dspm_mult_s16_m_ae32_vector
|
||||
Reference in New Issue
Block a user