add some code

This commit is contained in:
2025-09-05 13:25:11 +08:00
parent 9ff0a99e7a
commit 3cf1229a85
8911 changed files with 2535396 additions and 0 deletions

View File

@@ -0,0 +1,174 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspm_mult_platform.h"
#if (dspm_mult_s16_ae32_enabled == 1)
#include "dsps_dotprod_s16_m_ae32.S"
#include "dspm_mult_s16_m_ae32_vector.S"
// esp_err_t dspm_mult_s16_ae32(const int16_t* A, const int16_t* B, int16_t* C, int m, int n, int k, int shift);
//
// Matrix multiplication function for the ESP32 processor (Xtensa, MAC16 accumulator):
//   C(m,k) = A(m,n) * B(n,k), int16 data, row-major.
// Each output element is computed as
//   ((0x7fff >> shift) + sum(A[i][s] * B[s][j])) >> (15 - shift)
// and its low 16 bits are stored to C.
//
// Three code paths are selected at run time:
//   k == 1      -> vector_mult        (expansion of the dspm_mult_s16_m_ae32_vector macro)
//   n is even   -> m_loop_mmult       (two products per inner-loop iteration)
//   n is odd    -> even_N_samples     (one product per inner-loop iteration;
//                                      the label name is historical and misleading)
// Always returns 0 (ESP_OK) in a2.
.text
.align 4
.global dspm_mult_s16_ae32
.global .dspm_mult_s16_ae32_body
.type dspm_mult_s16_ae32,@function
dspm_mult_s16_ae32:
// A - a2
// B - a3
// C - a4
// m - a5 - any > 0 (the row loops count down to zero and do not test m before the first pass)
// n - a6 - 1,2,3, any
// k - a7 - 1, any
// shift - stack (loaded into a8)
// a14 - n*2 - byte increment from one row of A to the next
//
entry a1, 80
// Shared body entry point: exported so that another function which has already
// executed its own "entry a1, 80" can jump here.
.dspm_mult_s16_ae32_body:
l32i.n a8, a1, 80 // Load shift (first stack argument, just above the 80-byte frame) to the a8 register
// Prepare and load round value
ssr a8 // SAR = shift
movi a15, 0x7fff
srl a15, a15 // a15 = 0x7fff >> shift (rounding term preloaded into the accumulator)
neg a8, a8
addi a8, a8, 15 // a8 = 15 - shift
ssr a8 // SAR = 15 - shift, used by every "src" that extracts a result
movi a8, 0 // Clear a8 (written to acchi before each dot product)
slli a14, a6, 1 // a14 = n * sizeof(int16_t): row stride of A in bytes
movi.n a10, 2 // a10 = 2; the k > 1 paths below overwrite a10 before reading it
movi.n a9, 0 // a9 = 0: row counter consumed by the k == 1 path (vector_mult)
movi a12, 1
beq a7, a12, vector_mult // k == 1: B is a column vector, use the dedicated path
// We have normal path with k > 1
// a2, a3, a4 - A,B,C
// a5 - m (counts down to 0)
// a6 - n (n/2 in the even-n path)
// a7 - k*2 (row stride of B in bytes, also the end value of the column offset)
// a8 - temp
// a9 - temp
// a10- byte offset of the current column of B: 0, 2, ... k*2
// a12- pointer into the current row of A
// a13- pointer into the current column of B
// a14 - pointer increment for n
// a15 - round value
bbsi a6, 0, even_N_samples // bit 0 of n set => n is odd => one-product-per-iteration path
// ---------------- for even N: two products per inner-loop iteration
srli a6, a6, 1 // counter a6 = a6/2. We have to do it only once
slli a7, a7, 1 // counter a7 = a7*2. We have to do it only once
// loop for M
m_loop_mmult:
movi a10, 0 // reset k loop counter
mov a13, a3 // set pointer to the first column
// loop for K
k_loop_mmult:
addi a12, a2, -4 // every loop the same start position; -4 because ldinc adds 4 before it loads
movi a8, 0
wsr a8, acchi
wsr a15, acclo // initialize acc with shifted round value
loopnez a6, .loop_end_mmult // loop for N (n/2 iterations, skipped when the count is 0)
.loop_mmult:
ldinc m3, a12 // m3 = two consecutive int16 elements of the A row
l16si a8, a13, 0 // a8 = B[s][j]
add a13, a13, a7 // step down one row of B
mula.ad.ll a8, m3 // acc += a8.low * m3.low
l16si a8, a13, 0 // a8 = B[s+1][j]
add a13, a13, a7 // step down one row of B
mula.ad.lh a8, m3 // acc += a8.low * m3.high
.loop_end_mmult:
rsr a8, acchi
rsr a9, acclo
src a8, a8, a9 // a8 = (acchi:acclo) >> (15 - shift), low 32 bits
s16i a8, a4, 0 // store low 16 bits as C[i][j]
addi a4, a4, 2 // C is written sequentially
// check and increment for K
addi a10, a10, 2
add a13, a3, a10 // move to the next column of B
bne a10, a7, k_loop_mmult
// Check and increment for M
add a2, a2, a14 // move to the next row of A
addi a5, a5, -1
bnez.n a5, m_loop_mmult
movi.n a2, 0 // return status ESP_OK
retw.n
even_N_samples:
// ---------------- for odd N (despite the label name): one product per inner-loop iteration
slli a7, a7, 1 // counter a7 = a7*2. We have to do it only once
// loop for M
m_loop_mmult_even:
movi a10, 0 // reset k loop counter
mov a13, a3 // set pointer to the first column
// loop for K
k_loop_mmult_even:
mov a12, a2 // every loop the same start position
movi a8, 0
wsr a8, acchi
wsr a15, acclo // initialize acc with shifted round value
loopnez a6, .loop_end_mmult_even // loop for N (n iterations)
.loop_mmult_even:
l16si a9, a12, 0 // a9 = A[i][s]
l16si a8, a13, 0 // a8 = B[s][j]
addi a12, a12, 2 // next element of the A row
add a13, a13, a7 // step down one row of B
mula.aa.ll a8, a9 // acc += a8.low * a9.low
.loop_end_mmult_even:
rsr a8, acchi
rsr a9, acclo
src a8, a8, a9 // a8 = (acchi:acclo) >> (15 - shift), low 32 bits
s16i a8, a4, 0 // store low 16 bits as C[i][j]
addi a4, a4, 2
// check and increment for K
addi a10, a10, 2
add a13, a3, a10 // move to the next column of B
bne a10, a7, k_loop_mmult_even
// Check and increment for M
add a2, a2, a14 // move to the next row of A
addi a5, a5, -1
bnez.n a5, m_loop_mmult_even
movi.n a2, 0 // return status ESP_OK
retw.n
// The path where k == 1 (B is a column vector). The macro contains its own retw.n.
// It relies on a8 = 0, a9 = 0, a14 = n*2 and a15 = round value set up above.
vector_mult:
dspm_mult_s16_m_ae32_vector;
#endif // dspm_mult_s16_ae32_enabled

View File

@@ -0,0 +1,142 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspm_mult_platform.h"
#if (dspm_mult_s16_aes3_enabled == 1)
#include "dsps_dotprod_s16_m_ae32.S"
#include "dspm_mult_s16_m_ae32_vector.S"
// esp_err_t dspm_mult_s16_aes3(const int16_t* A, const int16_t* B, int16_t* C, int m, int n, int k, int shift);
//
// Matrix multiplication C(m,k) = A(m,n) * B(n,k) on int16 data using the ee.*
// 128-bit vector instructions (ESP32-S3). Eight output columns are produced per
// pass, so this path is only taken when k is a multiple of 8; any other k is
// forwarded to the ESP32 implementation through .dspm_mult_s16_ae32_body.
//
// Incoming registers (windowed ABI): a2 = A, a3 = B, a4 = C, a5 = m, a6 = n,
// a7 = k, shift = first stack argument.
//
// Stack frame usage (80 bytes):
//   sp+0  .. sp+15 : round_data, eight copies of the 16-bit rounding value
//   sp+32          : saved B pointer            (gra_spill_temp_0)
//   sp+36          : n*2, row stride of A       (gra_spill_temp_1)
//   sp+40          : k/8, column blocks per row (gra_spill_temp_2)
//   sp+44          : row counter                (gra_spill_temp_3)
//
// NOTE(review): the 128-bit loads/stores presumably expect 16-byte aligned B and C - confirm.
// NOTE(review): the software-pipelined inner loop loads one B row and one A element
// beyond the last ones it multiplies, i.e. it reads past the data it needs.
.text
.align 4
.literal_position
.literal .LC0_1_38, 32767
.literal .LC1_1_39, 16383
.global dspm_mult_s16_aes3
.global .dspm_mult_s16_ae32_body
.type dspm_mult_s16_aes3,@function
dspm_mult_s16_aes3:
entry a1,80 # same frame size as dspm_mult_s16_ae32, so its body can be entered directly
movi.n a10, 7
and a10, a10, a7 // a10 = k & 7
beqz a10, .dspm_mult_s16_aes3_body // k multiple of 8: vector path
// Otherwise fall back to the ESP32 function body (it expects entry to have run already)
J .dspm_mult_s16_ae32_body
.dspm_mult_s16_aes3_body:
mov.n a10,a4 # [0] a10 = C write pointer
mov.n a11,a5 # [1] a11 = m
l32i a5,a1,80 # [2] id:77 shift+0x0  a5 = shift
s32i.n a3,a1,32 # [3] gra_spill_temp_0  save B
bltz a5,.Lt_0_6146 # [4] negative shift: different rounding constant
#.LBB3_dspm_mult_s16_aes3: # 0x13
l32r a9,.LC0_1_38 # [0] a9 = 32767
ssr a5 # [1] SAR = shift
sra a9,a9 # [2] a9 = 32767 >> shift
.LBB23_dspm_mult_s16_aes3: # 0x1c
// Replicate the rounding value into eight 16-bit lanes at sp+0 .. sp+15
s16i a9,a1,0 # [0] id:78 round_data_64+0x0
s16i a9,a1,2 # [1] id:78 round_data_64+0x0
s16i a9,a1,4 # [2] id:78 round_data_64+0x0
s16i a9,a1,6 # [3] id:78 round_data_64+0x0
s16i a9,a1,8 # [4] id:78 round_data_64+0x0
s16i a9,a1,10 # [5] id:78 round_data_64+0x0
s16i a9,a1,12 # [6] id:78 round_data_64+0x0
s16i a9,a1,14 # [7] id:78 round_data_64+0x0
blti a11,1,.Lt_0_7426 # [0] m < 1: nothing to do, return ESP_OK
mov.n a13,a2 # [0] a13 = start of the current row of A
slli a4,a7,1 # [1] a4 = k*2, row stride of B in bytes
mov.n a12,a1 # [2] a12 = address of round_data
l32i.n a14,a1,32 # [3] gra_spill_temp_0  a14 = B
movi.n a15,15 # [4]
movi.n a8,0 # [5]
slli a9,a6,1 # [6] a9 = n*2, row stride of A in bytes
s32i.n a9,a1,36 # [7] gra_spill_temp_1
s32i.n a8,a1,44 # [8] gra_spill_temp_3  row counter = 0
sub a15,a15,a5 # [9] a15 = 15 - shift, final right-shift amount
addi.n a8,a7,7 # [10]
movgez a8,a7,a7 # [11] a8 = k when k >= 0, else k + 7
srai a8,a8,3 # [12] a8 = k / 8 (rounded toward zero)
s32i.n a8,a1,40 # [13] gra_spill_temp_2
slli a8,a8,4 # [14] (k/8) * 16 bytes
add.n a14,a14,a8 # [15] a14 = B + one row's worth of column blocks: end marker for a7
.Lt_0_7938: # 0x5d  ---- loop over rows of A / C
l32i.n a8,a1,40 # [0] gra_spill_temp_2
beqz.n a8,.Lt_0_8194 # [2] no complete 8-column block: skip the column loop
l32i.n a7,a1,32 # [0] gra_spill_temp_0  a7 = B, top of the first column block
mov.n a2,a13 # [1] a2 = current row of A
.Lt_0_8706: # 0x65  ---- loop over 8-column blocks
ee.ldqa.u16.128.ip a12,0 # [0] id:80  preload QACC with the eight rounding values
ee.vldbc.16.ip q1,a2,2 # [1] id:79  q1 = A[i][0] broadcast to all lanes, a2 += 2
mov.n a3,a7 # [2] a3 walks down the rows of B within this column block
ee.vld.128.xp q0,a3,a4 # [3] id:81  q0 = 8 elements of B row 0, a3 += k*2
addi a7,a7,16 # [4] advance to the next column block
blti a6,1,.Lt_0_8962 # [5] n < 1: no products
srai a5,a6,1 # [0] a5 = n / 2 (shift is no longer needed in a5)
bbci a6,0,.LBB68_dspm_mult_s16_aes3 # [1] n even: go straight to the unrolled loop
// n odd: peel one multiply-accumulate so the remaining count is even
ee.vmulas.s16.qacc.ldbc.incp q1,a2,q0,q1 # [0] id:82  QACC += q0*q1, then q1 = next A element, a2 += 2
ee.vld.128.xp q0,a3,a4 # [1] id:83  q0 = next row of B
.LBB68_dspm_mult_s16_aes3: # 0x82
loopgtz a5,.LBB74_dspm_mult_s16_aes3 # [0] n/2 iterations, two multiply-accumulates each
.LBB64_dspm_mult_s16_aes3: # 0x85
ee.vld.128.xp q2,a3,a4 # [0*II+0] id:83  q2 = following row of B
ee.vmulas.s16.qacc.ldbc.incp q1,a2,q0,q1 # [0*II+1] id:82  QACC += q0*q1, reload q1
ee.vld.128.xp q0,a3,a4 # [0*II+2] id:83  q0 = row after that
ee.vmulas.s16.qacc.ldbc.incp q1,a2,q2,q1 # [0*II+3] id:82  QACC += q2*q1, reload q1
.LBB74_dspm_mult_s16_aes3: # 0x91
.Lt_0_8962: # 0x91
mov.n a2,a13 # [0] rewind to the start of the A row for the next block
ee.srcmb.s16.qacc q0,a15,1 # [1] q0 = QACC shifted right by a15 (15 - shift), narrowed to 16-bit lanes
ee.vst.128.ip q0,a10,16 # [2] id:85  store 8 results to C, a10 += 16
bne a7,a14,.Lt_0_8706 # [3] more column blocks in this row
.Lt_0_8194: # 0x9c
l32i.n a8,a1,36 # [0] gra_spill_temp_1  n*2
l32i.n a9,a1,44 # [1] gra_spill_temp_3  row counter
add.n a13,a13,a8 # [2] next row of A
addi.n a9,a9,1 # [3]
s32i.n a9,a1,44 # [4] gra_spill_temp_3
bne a11,a9,.Lt_0_7938 # [5] repeat until m rows are done
.Lt_0_7426: # 0xa9
movi.n a2,0 # [0] return status ESP_OK
retw.n # [1]
.Lt_0_6146: # 0xad  ---- shift < 0: rounding value derived from 16383 instead of 32767
l32r a9,.LC1_1_39 # [0] a9 = 16383
ssr a5 # [1] NOTE(review): ssr presumably uses only the low bits of the negative shift - confirm
sra a9,a9 # [2]
j .LBB23_dspm_mult_s16_aes3 # [3]
#endif // dspm_mult_s16_aes3_enabled

View File

@@ -0,0 +1,40 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dsps_dotprod.h"
#include "dspm_mult.h"
// Reference (portable C) fixed-point matrix multiplication.
//
// A is an m x n matrix, B is n x k and C is m x k; all are stored row-major
// as int16_t (m/n are the row/column counts of A).
//
//   c(i,j) = sum(a(i,s) * b(s,j)), s = 0..n-1
//
// The sum is accumulated in 64 bits, starting from the rounding term
// (0x7fff >> shift), and is then scaled by (shift - 15): shifted left when
// that value is positive, otherwise shifted right by (15 - shift). The result
// is truncated to int16_t on store.
//
// Always returns ESP_OK.
esp_err_t dspm_mult_s16_ansi(const int16_t *A, const int16_t *B, int16_t *C, int m, int n, int k, int shift)
{
    const int final_shift = shift - 15;

    for (int row = 0; row < m; row++) {
        const int a_base = row * n;   // index of A[row][0]
        const int c_base = row * k;   // index of C[row][0]

        for (int col = 0; col < k; col++) {
            // Equivalent to a strided dot product of row `row` of A with
            // column `col` of B (stride k).
            long long acc = 0x7fff >> shift;

            for (int s = 0; s < n; s++) {
                acc += (int32_t)A[a_base + s] * (int32_t)B[s * k + col];
            }

            C[c_base + col] = (final_shift > 0) ? (acc << final_shift)
                                                : (acc >> (-final_shift));
        }
    }

    return ESP_OK;
}

View File

@@ -0,0 +1,121 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspm_mult_platform.h"
#if (dspm_mult_s16_arp4_enabled == 1)
// Matrix multiplication function for the RISC-V processor core with the
// esp.* 128-bit vector extension.
//
// esp_err_t dspm_mult_s16_arp4(const int16_t *A, const int16_t *B, int16_t *C,
//                              int m, int n, int k, int shift);
//
// Computes C(m,k) = A(m,n) * B(n,k) on row-major int16 data, eight output
// columns per pass. The accelerated body is used only when m, n and k are all
// positive multiples of 8; every other case is tail-called into
// dspm_mult_s16_ansi with the original arguments.
//
// Returns ESP_OK (0) in a0.
    .text
    .align 4
    .global dspm_mult_s16_arp4
    .global dspm_mult_s16_ansi
    .global .dspm_mult_s16_arp4_body
    .type   dspm_mult_s16_arp4,@function
// The fallback implements the following C code:
// esp_err_t dspm_mult_s16_ansi(const int16_t *A, const int16_t *B, int16_t *C, int m, int n, int k, int shift)
// {
//     int final_shift = shift - 15;
//     for (int i = 0 ; i < m ; i++) {
//         for (int j = 0 ; j < k ; j++) {
//             // This code also could be used
//             //dsps_dotprode_f32_ae32(&A[i*n],&B[j],&C[i*k + j],n,1,n);
//             long long acc = 0x7fff >> shift;
//             for (int s = 0; s < n ; s++) {
//                 acc += (int32_t)A[i * n + s] * (int32_t)B[s * k + j];
//             }
//             if (final_shift > 0) {
//                 C[i * k + j] = (acc << final_shift);
//             } else {
//                 C[i * k + j] = (acc >> (-final_shift));
//             }
//         }
//     }
//     return ESP_OK;
// }
// NOTE(review): the vector body starts from a zeroed accumulator (no
// 0x7fff >> shift rounding term) and hands (shift - 15) to esp.srcmb.s16.qacc,
// whereas the C code shifts right by (15 - shift). Confirm the shift-amount
// sign convention of esp.srcmb.s16.qacc and whether the rounding difference
// is intended.
dspm_mult_s16_arp4:
// A       - a0
// B       - a1
// C       - a2
// m       - a3
// n       - a4
// k       - a5
// shift   - a6
// a7      - counter loop1: 0..m
// t1      - counter loop2: 0..k (step 8)
// t0      - hardware loop count: n
// x25(s9) - matrix step for input2 (k*2 bytes, one row of B)
// x24(s8) - pointer to current B
// x29(t4) - pointer to the top of the current 8-column block of B
// x30(t5) - pointer to A
// x31(t6) = 2 for increment....
// x26(s10)- final_shift

    // The loops below are do-while loops, so a zero or negative dimension
    // would still store at least one vector to C. Such sizes also pass the
    // "& 7" test (0 & 7 == 0), therefore they are routed to the C
    // implementation, which handles them without touching memory it should not.
    blez    a3, .dspm_mult_s16_arp4_fallback
    blez    a4, .dspm_mult_s16_arp4_fallback
    blez    a5, .dspm_mult_s16_arp4_fallback

    or      t0, a3, a4
    or      t0, t0, a5
    andi    t0, t0, 0x7                 // any of m, n, k not a multiple of 8?
    beqz    t0, .dspm_mult_s16_arp4_body
.dspm_mult_s16_arp4_fallback:
    j       dspm_mult_s16_ansi          // tail call: a0..a6 and ra are untouched

.dspm_mult_s16_arp4_body:
    add     sp, sp, -16                 // room for the three callee-saved registers used below
    sw      s8, 4(sp)
    sw      s9, 8(sp)
    sw      s10, 12(sp)

    mv      t0, a4                      // t0 = n, inner (hardware) loop count
    li      a7, 0                       // counter loop1
    slli    x25, a5, 1                  // step = k * sizeof(int16_t)
    li      x31, 2
    // final_shift = shift - 15
    add     x26, a6, -15

.dpf_loop1:                             // loop for m
    li      t1, 0                       // reset counter for loop2
    mv      x29, a1                     // back to the first column block of B

.dpf_loop2:                             // loop for k, 8 columns at a time
    mv      x30, a0                     // start of the current row of A
    mv      x24, x29                    // load B
    // Calculating dotproduct...
    esp.zero.qacc                       // qacc = 0;
    esp.vldbc.16.xp q0, x30, x31        // q0 = a[mx..mx], x30 += 2
    esp.vld.128.xp  q1, x24, x25        // q1 = b[x0..x7], x24 += k*2
    // Hardware loop 0: presumably repeats the instructions up to and including
    // the one labelled .matrix_mul_loop t0 (= n) times. Each pass accumulates
    // one product and preloads the operands for the next pass, so one A element
    // and one B row beyond the last multiplied ones are also read.
    esp.lp.setup    0, t0, .matrix_mul_loop
    esp.vmulas.s16.qacc.ldbc.incp q0,x30, q0,q1
.matrix_mul_loop: esp.vld.128.xp q1,x24,x25

    esp.srcmb.s16.qacc q2, x26, 0       // q2 = qacc >> shift
    esp.vst.128.ip  q2, a2, 16          // save k0..k7, C pointer += 16
    add     x29, x29, 16                // next 8 columns of B
    // check loop 2
    addi    t1, t1, 8                   // Increment loop2 counter
    blt     t1, a5, .dpf_loop2

    // x30 was advanced n+1 elements (initial load + n preloads); step back one
    // element so that a0 points at the next row of A.
    add     x30, x30, -2
    mv      a0, x30
    // check loop 1
    add     a7, a7, 1                   // Increment loop1 counter
    blt     a7, a3, .dpf_loop1

    // Exit
    li      a0, 0                       // return status ESP_OK (previously returned the shift argument)
    lw      s10, 12(sp)
    lw      s9, 8(sp)
    lw      s8, 4(sp)
    add     sp, sp, 16
    ret
#endif //dspm_mult_s16_arp4_enabled

View File

@@ -0,0 +1,58 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Macro body for an Xtensa matrix/vector product built on the
// dotprod_s16_ae32_full macro (defined elsewhere). It ends the enclosing
// windowed function itself: both exits execute retw.n with a2 = 0 (ESP_OK).
//
// The label do_dotproduct is a plain (non-unique) label, so expanding this
// macro more than once in one assembly unit would define it twice.
//
// NOTE(review): this macro looks unfinished - see the notes on the branch
// sense, the shift amount and the single stored element below.
.macro dspm_mult_s16_ae32_MxNxN
// A - a2
// B - a3
// C - a4
// m - a5
// n - a6
// k - a7
// shift - stack (a8); not loaded or used inside this macro
movi a10, 4 // load 4 as a constant
// NOTE(review): the branch is taken when n < 4; for n >= 4 the code falls
// through and returns ESP_OK without computing anything. This looks inverted
// relative to the intent "n >= 4 allows acceleration" - confirm.
blt a6, a10, do_dotproduct
// Here we make operations one by one...
movi.n a2, 0 // return status ESP_OK
retw.n
do_dotproduct:
mov a12, a2 // a12 = A
mov a13, a3 // a13 = B
srli a9, a6, 2 // a9 - count/4 - 1
addi a9, a9, -1 // (equals -1 on this path, because n < 4 here)
movi.n a10, 0 // load 0 to the a10 to increment second array
dotprod_s16_ae32_full a12, a13, a9, a10, a6
/* Get accumulator */
ssr a6 // NOTE(review): uses n, not the shift argument, as the shift amount - confirm
rsr a2, acchi
rsr a3, acclo
src a2, a2, a3 // a2 = (acchi:acclo) >> SAR, low 32 bits
s16i a2, a4, 0 // store a single 16-bit result at C[0]
movi.n a2, 0 // redundant: a2 is set to 0 again on the next line
movi.n a2, 0 // return status ESP_OK
retw.n
.endm // dspm_mult_s16_ae32_MxNxN

View File

@@ -0,0 +1,105 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Matrix (m x n) by column vector (n x 1) product for Xtensa with the MAC16
// accumulator. Produces m 16-bit results, one per row of A, written
// sequentially to C. The macro finishes the enclosing windowed function
// itself: both paths end with a2 = 0 (ESP_OK) and retw.n.
//
// Expected on entry (set up by the code that expands the macro):
//   a2  - A (current row pointer)      a3  - B (the vector)
//   a4  - C (output pointer)           a5  - m
//   a6  - n                            a8  - 0 (written to acchi)
//   a9  - row counter, starting value  a14 - byte stride between rows of A
//   a15 - rounding value preloaded into acclo
//   SAR - right-shift amount applied by "src" to the accumulator
// a7 (k on function entry) is overwritten and reused as a scratch/count register.
//
// The labels are plain (non-unique) labels, so the macro can be expanded only
// once per assembly unit.
.macro dspm_mult_s16_m_ae32_vector
// m - a5 - any > 0
// n - a6 - 1,2,3, any
// k - a7 - not read here; a7 is overwritten on the next instruction
// Define path for n < 4
movi a7, 4
blt a6, a7, small_process_loop // jump for n < 4
srli a7, a6, 2
addi a7, a7, -1 // a7 = n/4 - 1, count argument for dotprod_s16_ae32_full
mmultv_loop1:
wsr a8, acchi
wsr a15, acclo // initialize acc with shifted round value
// Register state at this point:
// a2 - A
// a3 - B
// a4 - C
// a6 - n
// a7 - n/4 - 1
// a8 - 0
// a15- 0x7fff>>shift
mov a12, a2 // load A
mov a13, a3 // Load B
dotprod_s16_ae32_full a12, a13, a7, a6 // dot product macro defined elsewhere; result is left in acchi:acclo
// check loop 1
/* Get accumulator */
rsr a12, acchi
rsr a13, acclo
src a12, a12, a13 // a12 = (acchi:acclo) >> SAR, low 32 bits
s16i a12, a4, 0 // store low 16 bits as the result for this row
addi a4, a4, 2
add.n a2, a2, a14 // Increment A, A = A[i*n]
addi a9, a9, 1 // Increment loop1 counter
blt a9, a5, mmultv_loop1
movi.n a2, 0 // return status ESP_OK
retw.n
// ---- n < 4: handle n = 1, 2 or 3 by testing bits 1 and 0 of n
small_process_loop:
wsr a8, acchi
wsr a15, acclo // initialize acc with shifted round value
mov a12, a2 // load A
mov a13, a3 // Load B
addi a12, a12, -4 // To arrange first pointer: ldinc adds 4 before it loads
addi a13, a13, -4 // To arrange first pointer: ldinc adds 4 before it loads
bbci a6, 1, .mod2chk_short // bit 1 of n clear (n == 1): no pair to process
ldinc m0, a12 // m0 = two int16 elements of A
ldinc m2, a13 // m2 = two int16 elements of B
mula.dd.hh m0, m2 // acc += m0.high * m2.high
mula.dd.ll m0, m2 // acc += m0.low * m2.low
.mod2chk_short:
bbci a6, 0, .mod1chk_short // bit 0 of n clear (n == 2): no single element left
ldinc m0, a12
ldinc m2, a13
mula.dd.ll m0, m2 // acc += m0.low * m2.low (only the low halves are used)
.mod1chk_short:
// check loop 1
/* Get accumulator */
rsr a12, acchi
rsr a13, acclo
src a12, a12, a13 // a12 = (acchi:acclo) >> SAR, low 32 bits
s16i a12, a4, 0 // store low 16 bits as the result for this row
addi a4, a4, 2
add.n a2, a2, a14 // Increment A, A = A[i*n]
addi a9, a9, 1 // Increment loop1 counter
blt a9, a5, small_process_loop
movi.n a2, 0 // return status ESP_OK
retw.n
.endm // dspm_mult_s16_m_ae32_vector