add some code

This commit is contained in:
2025-09-05 13:25:11 +08:00
parent 9ff0a99e7a
commit 3cf1229a85
8911 changed files with 2535396 additions and 0 deletions

View File

@@ -0,0 +1,398 @@
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_aes3_enabled == 1)
// dspi_dotprod_off_s16_aes3
//
// 2-D dot product of signed 16-bit data with a constant added to every filter
// value, using the ee.* vector instructions. Compiler-generated listing; the
// bracketed numbers in the trailing comments are scheduler slots.
//
// All seven arguments are forwarded unchanged to dspi_dotprod_off_s16_ansi on
// the fallback paths, so the prototype is presumably the same:
//   esp_err_t f(image2d_t *in_image, image2d_t *filter, int16_t *out_value,
//               int count_x, int count_y, int shift, int16_t offset);
//
// Windowed call, 128-byte frame:
//   a2 = in_image, a3 = filter, a4 = out_value,
//   a5 = count_x,  a6 = count_y, a7 = shift,
//   offset = 16-bit value read from a1+128 (first stack argument).
// Words read from each image2d_t: +0 data, +4 step_x, +8 step_y, +12 stride_x,
// +16 stride_y (names taken from the equivalent checks in the ANSI C version;
// the struct definition is not visible here - confirm against the header).
//
// Return value in a2:
//   458755 (0x70003) when a range check fails (the ANSI version returns
//   ESP_ERR_DSP_PARAM_OUTOFRANGE for the same checks),
//   the result of dspi_dotprod_off_s16_ansi on a fallback path,
//   0 after the vector path has stored the result to *out_value.
//
// Frame slots: a1+0 outgoing 7th argument, a1+16..a1+47 offset buffer,
// a1+48 pointer into that buffer, a1+80 input row pitch in bytes,
// a1+84 filter data pointer, a1+88 filter step_y.
.text
.align 4
.literal .LC0_1_61, 458755
# Program Unit: dspi_dotprod_off_s16_aes3
.type dspi_dotprod_off_s16_aes3, @function
.align 4
.global dspi_dotprod_off_s16_aes3
dspi_dotprod_off_s16_aes3: # 0x4
.LBB1_dspi_dotprod_off_s16_aes3: # 0x4
entry a1,128 #
// Range checks: fail when step * count is greater than the stride, for x and
// y of in_image (a2) and then of filter (a3).
l32i.n a10,a2,4 # [0] id:760
l32i.n a12,a2,12 # [1] id:759
mull a8,a10,a5 # [2]
blt a12,a8,.LBB83_dspi_dotprod_off_s16_aes3 # [4]
l32i.n a13,a2,8 # [0] id:761
l32i.n a9,a2,16 # [1] id:762
mull a11,a13,a6 # [2]
blt a9,a11,.LBB83_dspi_dotprod_off_s16_aes3 # [4]
l32i.n a15,a3,4 # [0] id:764
l32i.n a14,a3,12 # [1] id:763
mull a11,a15,a5 # [2]
blt a14,a11,.LBB83_dspi_dotprod_off_s16_aes3 # [4]
l32i.n a8,a3,16 # [0] id:766
l32i.n a9,a3,8 # [1] id:765
s32i a9,a1,88 # [2] gra_spill_temp_2
mull a9,a9,a6 # [3]
blt a8,a9,.LBB83_dspi_dotprod_off_s16_aes3 # [5]
// Filter preconditions for the vector path: bit 0 of the data pointer clear,
// stride_x == step_x * count_x, step_x == 1 and step_y == 1. Anything else
// falls through to the ANSI fallback below.
l32i.n a8,a3,0 # [0] id:767
s32i a8,a1,84 # [1] gra_spill_temp_1
bbsi a8,0,.Lt_0_36354 # [2]
bne a14,a11,.Lt_0_36354 # [0]
bnei a15,1,.Lt_0_36354 # [0]
l32i a9,a1,88 # [0] gra_spill_temp_2
beqi a9,1,.Lt_0_19458 # [2]
.Lt_0_36354: # 0x46
.Lt_0_19714: # 0x46
// Fallback 1: pass the six register arguments through a10..a15, copy offset
// from the incoming stack slot (a1+128) to the outgoing one (a1+0), and
// return whatever dspi_dotprod_off_s16_ansi returns.
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
l16si a8,a1,128 # [6] id:768 offset+0x0
s32i.n a8,a1,0 # [7] id:875
.type dspi_dotprod_off_s16_ansi, @function
call8 dspi_dotprod_off_s16_ansi # [8] dspi_dotprod_off_s16_ansi
mov.n a2,a10 # [0]
retw.n # [1]
.LBB83_dspi_dotprod_off_s16_aes3: # 0x5e
// Range-check failure: return the literal 458755.
l32r a2,.LC0_1_61 # [0]
retw.n # [1]
.Lt_0_19458: # 0x63
// Input-image and size preconditions for the vector path: step_x == 1,
// step_y == 1, count_x a multiple of 8, count_y >= 4.
addi.n a9,a10,-1 # [0]
bnez a9,.Lt_0_37122 # [1]
addi.n a10,a13,-1 # [0]
bnez a10,.Lt_0_37122 # [1]
extui a11,a5,0,3 # [0]
bnez.n a11,.Lt_0_37122 # [1]
blti a6,4,.Lt_0_37122 # [0]
// count_x > 32 goes through an extra test of bit 0 of count_x at .LBB27; that
// bit is already known to be clear because the low three bits were tested.
movi.n a14,32 # [0]
blt a14,a5,.LBB27_dspi_dotprod_off_s16_aes3 # [1]
.Lt_0_37634: # 0x7a
.Lt_0_21506: # 0x7a
// Vector path setup: a15 = filter data, a2 = in_image data, a9 = offset,
// a1+80 = in_image stride_x * step_y * 2 (row pitch in bytes), a8 = a1+16.
l32i a15,a1,84 # [0] gra_spill_temp_1
l32i.n a2,a2,0 # [1] id:769
l16si a9,a1,128 # [2] id:768 offset+0x0
mull a10,a12,a13 # [3]
addi a8,a1,16 # [4] temp_offset
slli a10,a10,1 # [5]
s32i a10,a1,80 # [6] gra_spill_temp_0
movi.n a10,2 # [7]
# loop-count fixed at 2
// Two iterations of eight stores: 16 halfword copies of offset at a1+16..a1+47.
loop a10,.LBB137_dspi_dotprod_off_s16_aes3 # [8]
.LBB132_dspi_dotprod_off_s16_aes3: # 0x93
s16i a9,a8,0 # [0*II+0] id:770 temp_offset+0x0
s16i a9,a8,2 # [0*II+1] id:770 temp_offset+0x0
s16i a9,a8,4 # [0*II+2] id:770 temp_offset+0x0
s16i a9,a8,6 # [0*II+3] id:770 temp_offset+0x0
s16i a9,a8,8 # [0*II+4] id:770 temp_offset+0x0
s16i a9,a8,10 # [0*II+5] id:770 temp_offset+0x0
s16i a9,a8,12 # [0*II+6] id:770 temp_offset+0x0
s16i a9,a8,14 # [0*II+7] id:770 temp_offset+0x0
addi a8,a8,16 # [0*II+8]
.LBB137_dspi_dotprod_off_s16_aes3: # 0xae
// a3 = count_y (row counter for the loops below). Clear SAR_BYTE and both
// ACCX halves, then load q6 from the offset buffer, so every lane of q6
// holds offset (the whole buffer was filled with the same value).
mov.n a3,a6 # [0]
addi a11,a5,-24 # [1]
addi a12,a1,24 # [3] temp_offset+8
movi.n a13,0 # [4]
wur.sar_byte a13 # [5]
wur.accx_0 a13 # [6]
wur.accx_1 a13 # [7]
ee.vld.128.ip q6,a12,0 # [8] id:771
s32i.n a12,a1,48 # [9] offset_data_ptr
// Dispatch on count_x. Each specialised loop jumps back into this chain when
// it is done and ends up at the common finish (.Lt_0_33026).
//   24 -> .LBB34, 16 -> .LBB40, 8 -> .LBB46, 32 -> .LBB52, 64 -> .LBB58,
//   more than 64 -> generic loop below.
beqz a11,.LBB34_dspi_dotprod_off_s16_aes3 # [10]
.Lt_0_25602: # 0xc8
.Lt_0_25090: # 0xc8
ee.vld.128.ip q0,a15,16 # [0] id:786
addi a14,a5,-16 # [1]
beqz a14,.LBB40_dspi_dotprod_off_s16_aes3 # [2]
.Lt_0_27138: # 0xd1
.Lt_0_26626: # 0xd1
addi a8,a5,-8 # [0]
beqz a8,.LBB46_dspi_dotprod_off_s16_aes3 # [1]
.Lt_0_28674: # 0xd7
.Lt_0_28162: # 0xd7
addi a9,a5,-32 # [0]
beqz a9,.LBB52_dspi_dotprod_off_s16_aes3 # [1]
.Lt_0_30210: # 0xdd
.Lt_0_29698: # 0xdd
addi a10,a5,-64 # [0]
beqz a10,.LBB58_dspi_dotprod_off_s16_aes3 # [1]
movi.n a11,64 # [0]
bge a11,a5,.Lt_0_33026 # [1]
// Generic path, count_x > 64. a12 counts rows up to a3 (count_y).
// a13 = count_x / 32 - 1 (signed divide) is the trip count of the inner
// hardware loop; a14 = row pitch in bytes - 2 * count_x + 16 is the pointer
// step used at the end of each row.
// NOTE(review): each inner iteration and the tail consume four filter vectors
// (32 halfwords), so a count_x that is not a multiple of 32 looks like it
// is only partly processed - confirm against the reference.
movi.n a12,0 # [0]
ee.ld.128.usar.ip q1,a2,16 # [1] id:848
ee.ld.128.usar.ip q2,a2,16 # [2] id:849
ee.src.q.ld.ip q3,a2,16,q1,q2 # [4] id:850
beqz.n a3,.Lt_0_33026 # [5]
slli a8,a5,1 # [0]
l32i a14,a1,80 # [1] gra_spill_temp_0
addi a13,a5,31 # [2]
movgez a13,a5,a5 # [3]
srai a13,a13,5 # [4]
sub a14,a14,a8 # [5]
addi a14,a14,16 # [6]
addi.n a13,a13,-1 # [7]
.Lt_0_33794: # 0x10c
beqz.n a13,.Lt_0_34050 # [0]
loopnez a13,.LBB273_dspi_dotprod_off_s16_aes3 # [0]
.LBB271_dspi_dotprod_off_s16_aes3: # 0x111
// Pattern used throughout: an instruction reading from a2 (input data) and
// multiplying by a filter register is paired with one reading from a15
// (filter data) and multiplying the same input register by q6 (offset),
// which together accumulate input * (filter + offset) into ACCX.
ee.vmulas.s16.accx.ld.ip.qup q0,a2,16,q0,q1,q2,q3 # [0*II+0] id:851
ee.vmulas.s16.accx.ld.ip q1,a15,16,q1,q6 # [0*II+1] id:852
ee.vmulas.s16.accx.ld.ip.qup q1,a2,16,q1,q2,q3,q0 # [0*II+3] id:853
ee.vmulas.s16.accx.ld.ip q4,a15,16,q2,q6 # [0*II+4] id:854
ee.vmulas.s16.accx.ld.ip.qup q2,a2,16,q4,q3,q0,q1 # [0*II+6] id:855
ee.vmulas.s16.accx.ld.ip q4,a15,16,q3,q6 # [0*II+7] id:856
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q4,q0,q1,q2 # [0*II+9] id:857
ee.vmulas.s16.accx.ld.ip q0,a15,16,q0,q6 # [0*II+10] id:858
.LBB273_dspi_dotprod_off_s16_aes3: # 0x131
.Lt_0_34050: # 0x131
// Row tail: same pattern, but the input pointer is stepped by a14, -16 and
// 32 to move on to the next row before the row counter is compared.
ee.vmulas.s16.accx.ld.ip.qup q0,a2,16,q0,q1,q2,q3 # [0] id:859
ee.vmulas.s16.accx.ld.ip q1,a15,16,q1,q6 # [1] id:860
movi.n a9,32 # [2]
ee.vmulas.s16.accx.ld.xp.qup q7,a2,a14,q1,q2,q3,q0 # [3] id:861
ee.vmulas.s16.accx.ld.ip q5,a15,16,q2,q6 # [4] id:862
movi.n a10,-16 # [5]
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a10,q5,q3,q0,q7 # [6] id:863
ee.vmulas.s16.accx.ld.ip q4,a15,16,q3,q6 # [7] id:865
ee.ld.128.usar.xp q1,a2,a9 # [8] id:864
addi.n a12,a12,1 # [9]
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q4,q0,q1,q2 # [10] id:866
ee.vmulas.s16.accx.ld.ip q0,a15,16,q0,q6 # [11] id:867
bne a12,a3,.Lt_0_33794 # [12]
.Lt_0_33026: # 0x15d
.Lt_0_32770: # 0x15d
// Common finish. a9 = ACCX_0, a10 = ACCX_1.
// shift < 1: store the low 16 bits of ACCX_0 unshifted (.Lt_0_35586).
// Otherwise: a11 = accumulator >> (shift - 1), taken from the a10:a9 pair,
// or from a10 alone when shift >= 33; then (a11 + 1) >> 1 is stored as
// 16 bits, i.e. a rounded right shift by shift. Returns 0.
rur.accx_0 a9 # [0]
rur.accx_1 a10 # [1]
blti a7,1,.Lt_0_35586 # [2]
movi.n a2,0 # [0]
addi a13,a7,-33 # [1]
addi.n a14,a7,-1 # [2]
ssr a14 # [3]
sra a12,a10 # [4]
src a11,a10,a9 # [5]
movgez a11,a12,a13 # [6]
addi.n a11,a11,1 # [7]
srai a11,a11,1 # [8]
s16i a11,a4,0 # [9] id:873
retw.n # [10]
.Lt_0_37122: # 0x183
.Lt_0_20738: # 0x183
// Fallback 2 (input-image or size precondition failed): same forwarding to
// dspi_dotprod_off_s16_ansi as fallback 1.
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
l16si a8,a1,128 # [6] id:768 offset+0x0
s32i.n a8,a1,0 # [7] id:876
call8 dspi_dotprod_off_s16_ansi # [8] dspi_dotprod_off_s16_ansi
mov.n a2,a10 # [0]
retw.n # [1]
.LBB27_dspi_dotprod_off_s16_aes3: # 0x19b
// count_x > 32: continue on the vector path when bit 0 of count_x is clear,
// otherwise fallback 3 (same forwarding as above).
extui a9,a5,0,1 # [0]
beqz a9,.Lt_0_37634 # [1]
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
l16si a8,a1,128 # [6] id:768 offset+0x0
s32i.n a8,a1,0 # [7] id:877
call8 dspi_dotprod_off_s16_ansi # [8] dspi_dotprod_off_s16_ansi
mov.n a2,a10 # [0]
retw.n # [1]
.LBB34_dspi_dotprod_off_s16_aes3: # 0x1b9
// count_x == 24: hardware loop runs count_y (a6) times; a12 = row pitch in
// bytes - 32 steps the input pointer to the next row.
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
l32i a12,a1,80 # [2] gra_spill_temp_0
ee.ld.128.usar.ip q0,a2,16 # [3] id:776
ee.ld.128.usar.ip q2,a2,16 # [4] id:777
addi a12,a12,-32 # [5]
ee.src.q.ld.ip q3,a2,16,q0,q2 # [6] id:778
loopgtz a6,.LBB159_dspi_dotprod_off_s16_aes3 # [7]
.LBB157_dspi_dotprod_off_s16_aes3: # 0x1cf
ee.vmulas.s16.accx.ld.ip q1,a15,16,q0,q6 # [0*II+0] id:779
ee.vmulas.s16.accx.ld.xp.qup q1,a2,a12,q1,q0,q2,q3 # [0*II+2] id:780
ee.vmulas.s16.accx.ld.ip q0,a15,16,q2,q6 # [0*II+3] id:781
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q0,q2,q3,q1 # [0*II+5] id:782
ee.vmulas.s16.accx.ld.ip q1,a15,16,q3,q6 # [0*II+6] id:784
ee.ld.128.usar.xp q0,a2,a10 # [0*II+7] id:783
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q1,q3,q0,q2 # [0*II+9] id:785
.LBB159_dspi_dotprod_off_s16_aes3: # 0x1ea
j .Lt_0_25602 # [0]
.LBB40_dspi_dotprod_off_s16_aes3: # 0x1ed
// count_x == 16: a3 = count_y >> 1 iterations; each iteration consumes four
// filter vectors (two rows of 16).
// NOTE(review): with an odd count_y the last row does not appear to be
// processed - confirm.
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
srli a3,a6,1 # [2]
l32i a12,a1,80 # [3] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [4] id:787
ee.ld.128.usar.ip q2,a2,16 # [5] id:788
addi a12,a12,-16 # [7]
ee.src.q.ld.xp q3,a2,a12,q1,q2 # [8] id:789
loopnez a3,.LBB182_dspi_dotprod_off_s16_aes3 # [9]
.LBB180_dspi_dotprod_off_s16_aes3: # 0x206
ee.vmulas.s16.accx.ld.xp.qup q0,a2,a11,q0,q1,q2,q3 # [0*II+0] id:790
ee.vmulas.s16.accx.ld.ip q3,a15,16,q1,q6 # [0*II+1] id:791
ee.ld.128.usar.xp q1,a2,a10 # [0*II+2] id:792
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a12,q3,q2,q1,q0 # [0*II+4] id:793
ee.vmulas.s16.accx.ld.ip q4,a15,16,q2,q6 # [0*II+5] id:794
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q4,q1,q0,q3 # [0*II+7] id:795
ee.vmulas.s16.accx.ld.ip q3,a15,16,q1,q6 # [0*II+8] id:796
ee.ld.128.usar.xp q1,a2,a10 # [0*II+9] id:797
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a12,q3,q0,q1,q2 # [0*II+11] id:798
ee.vmulas.s16.accx.ld.ip q0,a15,16,q0,q6 # [0*II+12] id:799
.LBB182_dspi_dotprod_off_s16_aes3: # 0x22c
j .Lt_0_27138 # [0]
.LBB46_dspi_dotprod_off_s16_aes3: # 0x22f
// count_x == 8: a3 = count_y >> 2 iterations; each iteration consumes four
// filter vectors (four rows of 8).
// NOTE(review): count_y values that are not a multiple of 4 look like they
// leave trailing rows unprocessed - confirm.
movi.n a10,-16 # [0]
l32i a11,a1,80 # [1] gra_spill_temp_0
addi a8,a2,16 # [2]
addi a11,a11,16 # [3]
ee.ld.128.usar.xp q2,a8,a10 # [4] id:800
ee.ld.128.usar.xp q1,a8,a11 # [5] id:801
ee.src.q.ld.xp q3,a8,a10,q1,q2 # [7] id:802
ee.ld.128.usar.xp q2,a8,a11 # [8] id:803
srli a3,a3,2 # [9]
mov.n a2,a8 # [10]
loopnez a3,.LBB205_dspi_dotprod_off_s16_aes3 # [11]
.LBB203_dspi_dotprod_off_s16_aes3: # 0x24e
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a10,q0,q1,q2,q3 # [0*II+0] id:804
ee.vmulas.s16.accx.ld.ip q0,a15,16,q1,q6 # [0*II+1] id:805
ee.ld.128.usar.xp q1,a2,a11 # [0*II+2] id:806
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a10,q0,q2,q1,q3 # [0*II+4] id:807
ee.vmulas.s16.accx.ld.ip q0,a15,16,q2,q6 # [0*II+5] id:808
ee.ld.128.usar.xp q4,a2,a11 # [0*II+6] id:809
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a10,q0,q1,q4,q3 # [0*II+8] id:810
ee.vmulas.s16.accx.ld.ip q0,a15,16,q1,q6 # [0*II+9] id:811
ee.ld.128.usar.xp q1,a2,a11 # [0*II+10] id:812
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a10,q0,q4,q1,q3 # [0*II+12] id:813
ee.vmulas.s16.accx.ld.ip q0,a15,16,q4,q6 # [0*II+13] id:814
ee.ld.128.usar.xp q2,a2,a11 # [0*II+14] id:815
.LBB205_dspi_dotprod_off_s16_aes3: # 0x27a
j .Lt_0_28674 # [0]
.LBB52_dspi_dotprod_off_s16_aes3: # 0x27d
// count_x == 32: a3 (count_y) iterations, one row each;
// a12 = row pitch in bytes - 2 * count_x + 16.
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
slli a13,a5,1 # [2]
l32i a12,a1,80 # [3] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [4] id:816
ee.ld.128.usar.ip q2,a2,16 # [5] id:817
sub a12,a12,a13 # [6]
ee.src.q.ld.ip q3,a2,16,q1,q2 # [8] id:818
addi a12,a12,16 # [9]
loopnez a3,.LBB228_dspi_dotprod_off_s16_aes3 # [10]
.LBB226_dspi_dotprod_off_s16_aes3: # 0x299
ee.vmulas.s16.accx.ld.ip.qup q0,a2,16,q0,q1,q2,q3 # [0*II+0] id:819
ee.vmulas.s16.accx.ld.ip q4,a15,16,q1,q6 # [0*II+1] id:820
ee.vmulas.s16.accx.ld.xp.qup q4,a2,a12,q4,q2,q3,q0 # [0*II+3] id:821
ee.vmulas.s16.accx.ld.ip q1,a15,16,q2,q6 # [0*II+4] id:822
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q1,q3,q0,q4 # [0*II+6] id:823
ee.vmulas.s16.accx.ld.ip q4,a15,16,q3,q6 # [0*II+7] id:825
ee.ld.128.usar.xp q1,a2,a10 # [0*II+8] id:824
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q4,q0,q1,q2 # [0*II+10] id:826
ee.vmulas.s16.accx.ld.ip q0,a15,16,q0,q6 # [0*II+11] id:827
.LBB228_dspi_dotprod_off_s16_aes3: # 0x2bc
j .Lt_0_30210 # [0]
.LBB58_dspi_dotprod_off_s16_aes3: # 0x2bf
// count_x == 64: a3 (count_y) iterations, one row each, with the input
// pointer kept in a8; jumps straight to the common finish afterwards.
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
slli a13,a5,1 # [2]
l32i a12,a1,80 # [3] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [4] id:828
ee.ld.128.usar.ip q2,a2,16 # [5] id:829
sub a12,a12,a13 # [7]
addi a12,a12,16 # [8]
ee.src.q.ld.ip q3,a2,16,q1,q2 # [9] id:830
mov.n a8,a2 # [10]
loopnez a3,.LBB250_dspi_dotprod_off_s16_aes3 # [11]
.LBB248_dspi_dotprod_off_s16_aes3: # 0x2dd
ee.vmulas.s16.accx.ld.ip.qup q0,a8,16,q0,q1,q2,q3 # [0*II+0] id:831
ee.vmulas.s16.accx.ld.ip q4,a15,16,q1,q6 # [0*II+1] id:832
ee.vmulas.s16.accx.ld.ip.qup q4,a8,16,q4,q2,q3,q0 # [0*II+3] id:833
ee.vmulas.s16.accx.ld.ip q1,a15,16,q2,q6 # [0*II+4] id:834
ee.vmulas.s16.accx.ld.ip.qup q1,a8,16,q1,q3,q0,q4 # [0*II+6] id:835
ee.vmulas.s16.accx.ld.ip q5,a15,16,q3,q6 # [0*II+7] id:836
ee.vmulas.s16.accx.ld.ip.qup q5,a8,16,q5,q0,q4,q1 # [0*II+9] id:837
ee.vmulas.s16.accx.ld.ip q0,a15,16,q0,q6 # [0*II+10] id:838
ee.vmulas.s16.accx.ld.ip.qup q0,a8,16,q0,q4,q1,q5 # [0*II+12] id:839
ee.vmulas.s16.accx.ld.ip q4,a15,16,q4,q6 # [0*II+13] id:840
ee.vmulas.s16.accx.ld.xp.qup q4,a8,a12,q4,q1,q5,q0 # [0*II+15] id:841
ee.vmulas.s16.accx.ld.ip q1,a15,16,q1,q6 # [0*II+16] id:842
ee.vmulas.s16.accx.ld.xp.qup q2,a8,a11,q1,q5,q0,q4 # [0*II+18] id:843
ee.vmulas.s16.accx.ld.ip q4,a15,16,q5,q6 # [0*II+19] id:845
ee.ld.128.usar.xp q1,a8,a10 # [0*II+20] id:844
ee.vmulas.s16.accx.ld.ip.qup q3,a8,16,q4,q0,q1,q2 # [0*II+22] id:846
ee.vmulas.s16.accx.ld.ip q0,a15,16,q0,q6 # [0*II+23] id:847
.LBB250_dspi_dotprod_off_s16_aes3: # 0x320
j .Lt_0_33026 # [0]
.Lt_0_35586: # 0x323
// shift < 1: no rounding and no shift; store the low 16 bits of ACCX_0 and
// return 0.
movi.n a2,0 # [0]
sext a14,a9,15 # [1]
s16i a14,a4,0 # [2] id:874
retw.n # [3]
#endif // dspi_dotprod_aes3_enabled

View File

@@ -0,0 +1,49 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod.h"
// Reference (plain C) 2-D dot product of signed 16-bit data with an offset
// added to every filter value:
//
//   acc = sum over y < count_y, x < count_x of
//             in_image[y][x] * (filter[y][x] + offset)
//
// in_image, filter - image descriptors; data is read with step_x between
//                    elements and stride_x * step_y elements between rows
// out_value        - receives the rounded, right-shifted accumulator,
//                    truncated to 16 bits
// count_x, count_y - number of elements per row / number of rows
// shift            - right shift applied to the 64-bit accumulator; rounding
//                    to nearest is applied only when shift > 0
// offset           - constant added to each filter value
//
// Returns ESP_ERR_DSP_PARAM_OUTOFRANGE when step * count exceeds the stride
// on either axis of either image, ESP_OK otherwise.
esp_err_t dspi_dotprod_off_s16_ansi(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift, int16_t offset)
{
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    int16_t *i_data = (int16_t *)in_image->data;
    int16_t *f_data = (int16_t *)filter->data;
    // Distance between consecutive rows, in elements.
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;

    int64_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            acc += (int32_t)i_data[in_image->step_x * x] * ((int32_t)f_data[filter->step_x * x] + (int32_t)offset);
        }
        i_data += i_step;
        f_data += f_step;
    }

    // Round to nearest and scale. The guard avoids the undefined
    // "1 << -1" for shift == 0, and the 64-bit constant keeps the rounding
    // term from overflowing int once shift reaches 32.
    if (shift > 0) {
        acc += (int64_t)1 << (shift - 1);
        acc >>= shift;
    }
    *out_value = (int16_t)acc;
    return ESP_OK;
}

View File

@@ -0,0 +1,104 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_arp4_enabled == 1)
#include "dsp_err_codes.h"
.text
.align 4
.global dspi_dotprod_off_s16_arp4
.global dspi_dotprod_off_s16_ansi
.type dspi_dotprod_off_s16_arp4,@function
// esp_err_t dspi_dotprod_off_s16_arp4(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift, int16_t offset);
//
// 2-D dot product of signed 16-bit data with a constant offset added to every
// filter value, using the esp.* vector instructions.
// Vector path is taken only when both step_x values are 1 and count_x is a
// multiple of 8; otherwise control tail-jumps to dspi_dotprod_off_s16_ansi
// with all arguments untouched.
// Returns 0 in a0 from the vector path; the result is stored as 16 bits to
// *out_value.
// NOTE(review): the vector path performs none of the "step * count > stride"
// range checks that dspi_dotprod_off_s16_ansi does - confirm callers
// validate the descriptors.
// NOTE(review): the row loop is bottom-tested, so one row is processed even
// when count_y <= 0 - confirm this cannot happen.
dspi_dotprod_off_s16_arp4:
// in_image - a0
// filter - a1
// out_value - a2
// count_x - a3
// count_y - a4
// shift - a5
// offset - a6
// i_data - t0
// f_data - t1
// i_step - t2
// f_step - t3
// current i_data - t4
// current f_data - t5
// vector iterations per row (count_x / 8) - t6
lw t1, 4(a0) // load in_image->step_x
lw t2, 4(a1) // load filter->step_x
or t1, t1, t2
addi t1, t1, -1 // 0 only when (step_x | step_x) == 1
andi t2, a3, 7 // count_x % 8
or t1, t1, t2
beqz t1, .dspi_dotprod_off_s16_arp4_body
j dspi_dotprod_off_s16_ansi // tail call: arguments are still in a0..a6
.dspi_dotprod_off_s16_arp4_body:
add sp, sp, -16
sw a6, 0(sp) // spill offset so it can be loaded into a vector register
mv t6, sp
esp.vldbc.16.ip q2, t6, 0 // q2 = offset in every 16-bit lane (broadcast load)
lw t0, 0(a0) // i_data
lw t1, 0(a1) // f_data
lw t2, 8(a0) // step_y
lw t4, 12(a0) // stride_x
mul t2, t4, t2
slli t2, t2, 1 // i_step in bytes (2 bytes per element)
lw t3, 8(a1) // step_y
lw t5, 12(a1) // stride_x
mul t3, t5, t3
slli t3, t3, 1 // f_step in bytes (2 bytes per element)
srli t6, a3, 3 // t6 = count_x / 8 (vector iterations per row)
// Preload the accumulator with the rounding term 1 << (shift - 1).
// NOTE(review): for shift == 0, a7 is -1 and RV32 sll uses only the low five
// bits of the shift amount, so t4 becomes 1 << 31 - confirm intended.
addi a7, a5, -1
li t4, 1
sll t4, t4, a7
esp.zero.xacc
esp.movx.w.xacc.l t4
.loop_count_y:
mv t4, t0
mv t5, t1
esp.vld.128.ip q1, t5, 16 // q1 = first 8 filter values of the row (t5 = current f_data)
esp.lp.setup 0, t6, .loop_count_x // hardware loop: t6 iterations, last instruction at .loop_count_x
esp.vld.128.ip q0, t4, 16 // q0 = next 8 input values (t4 = current i_data)
esp.vadd.s16 q3, q2, q1 // q3 = filter + offset
.loop_count_x: esp.vmulas.s16.xacc.ld.ip q1, t5, 16, q0, q3 // xacc += q0 * q3; q1 = next 8 filter values
add t0, t0, t2 // next input row
add t1, t1, t3 // next filter row
add a4,a4, -1
bgtz a4, .loop_count_y
esp.srs.s.xacc t5, a5 // shift accx register by final_shift amount (a5), save the lower 32bits to t5
sh t5, 0(a2) // store the low 16 bits of the result to *out_value
li a0,0 // return 0
add sp,sp,16
ret
#endif // dspi_dotprod_arp4_enabled

View File

@@ -0,0 +1,408 @@
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_aes3_enabled == 1)
// dspi_dotprod_off_s8_aes3
//
// 2-D dot product of signed 8-bit data with a constant added to every filter
// value, using the ee.* vector instructions. Compiler-generated listing; the
// bracketed numbers in the trailing comments are scheduler slots.
//
// Argument list is the one of dspi_dotprod_off_s8_ansi:
//   esp_err_t f(image2d_t *in_image, image2d_t *filter, int8_t *out_value,
//               int count_x, int count_y, int shift, int8_t offset);
//
// Windowed call, 112-byte frame:
//   a2 = in_image, a3 = filter, a4 = out_value,
//   a5 = count_x,  a6 = count_y, a7 = shift,
//   offset = byte read from a1+112 (first stack argument).
// Words read from each image2d_t: +0 data, +4 step_x, +8 step_y, +12 stride_x,
// +16 stride_y (names taken from the equivalent checks in the ANSI C version;
// the struct definition is not visible here - confirm against the header).
//
// Return value in a2:
//   458755 (0x70003) when a range check fails (the ANSI version returns
//   ESP_ERR_DSP_PARAM_OUTOFRANGE for the same checks),
//   the result of dspi_dotprod_off_s8_ansi on a fallback path,
//   0 after the vector path has stored the result to *out_value.
//
// Fallback paths forward offset on the stack and call
// dspi_dotprod_off_s8_ansi. They previously called dspi_dotprod_s8_ansi with
// six arguments, which silently dropped offset from the result.
//
// Frame slots: a1+0..a1+31 offset buffer (vector path) or outgoing 7th
// argument (fallback paths), a1+32 pointer into the buffer, a1+48 saved q0,
// a1+64 input row pitch, a1+68 filter data pointer, a1+72 filter step_y.
.text
.align 4
.literal .LC0_1_57, 458755
# Program Unit: dspi_dotprod_off_s8_aes3
.type dspi_dotprod_off_s8_aes3, @function
.align 4
.global dspi_dotprod_off_s8_aes3
dspi_dotprod_off_s8_aes3: # 0x4
.LBB1_dspi_dotprod_off_s8_aes3: # 0x4
entry a1,112 #
// Range checks: fail when step * count is greater than the stride, for x and
// y of in_image (a2) and then of filter (a3).
l32i.n a10,a2,4 # [0] id:745
l32i.n a12,a2,12 # [1] id:744
mull a8,a10,a5 # [2]
blt a12,a8,.LBB86_dspi_dotprod_off_s8_aes3 # [4]
l32i.n a13,a2,8 # [0] id:746
l32i.n a9,a2,16 # [1] id:747
mull a11,a13,a6 # [2]
blt a9,a11,.LBB86_dspi_dotprod_off_s8_aes3 # [4]
l32i.n a15,a3,4 # [0] id:749
l32i.n a14,a3,12 # [1] id:748
mull a11,a15,a5 # [2]
blt a14,a11,.LBB86_dspi_dotprod_off_s8_aes3 # [4]
l32i.n a8,a3,16 # [0] id:751
l32i.n a9,a3,8 # [1] id:750
s32i a9,a1,72 # [2] gra_spill_temp_2
mull a9,a9,a6 # [3]
blt a8,a9,.LBB86_dspi_dotprod_off_s8_aes3 # [5]
// Filter preconditions for the vector path: bit 0 of the data pointer clear,
// stride_x == step_x * count_x, step_x == 1 and step_y == 1. Anything else
// falls through to the ANSI fallback below.
l32i.n a8,a3,0 # [0] id:752
s32i a8,a1,68 # [1] gra_spill_temp_1
bbsi a8,0,.Lt_0_35330 # [2]
bne a14,a11,.Lt_0_35330 # [0]
bnei a15,1,.Lt_0_35330 # [0]
l32i a11,a1,72 # [0] gra_spill_temp_2
beqi a11,1,.Lt_0_18946 # [2]
.Lt_0_35330: # 0x46
.Lt_0_19202: # 0x46
// Fallback 1: pass the six register arguments through a10..a15, copy the
// sign-extended offset from the incoming stack slot (a1+112) to the outgoing
// one (a1+0; the offset buffer there is not in use on this path), and return
// whatever dspi_dotprod_off_s8_ansi returns.
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
l8ui a8,a1,112 # [6] offset+0x0
sext a8,a8,7 # [7]
s32i.n a8,a1,0 # [8] outgoing stack argument: offset
.type dspi_dotprod_off_s8_ansi, @function
call8 dspi_dotprod_off_s8_ansi # [9] dspi_dotprod_off_s8_ansi
mov.n a2,a10 # [0]
retw.n # [1]
.LBB86_dspi_dotprod_off_s8_aes3: # 0x59
// Range-check failure: return the literal 458755.
l32r a2,.LC0_1_57 # [0]
retw.n # [1]
.Lt_0_18946: # 0x5e
// Input-image and size preconditions for the vector path: step_x == 1,
// step_y == 1, count_x a multiple of 16, count_y >= 4.
addi.n a14,a10,-1 # [0]
bnez a14,.Lt_0_36098 # [1]
addi.n a15,a13,-1 # [0]
bnez a15,.Lt_0_36098 # [1]
extui a8,a5,0,4 # [0]
bnez.n a8,.Lt_0_36098 # [1]
blti a6,4,.Lt_0_36098 # [0]
// count_x > 64 goes through an extra test of bit 0 of count_x at .LBB27; that
// bit is already known to be clear because the low four bits were tested.
movi.n a9,64 # [0]
blt a9,a5,.LBB27_dspi_dotprod_off_s8_aes3 # [1]
.Lt_0_36610: # 0x75
.Lt_0_20994: # 0x75
// Vector path setup: a8 = a1 (offset buffer), a9 = sign-extended offset,
// a15 = in_image data, a2 = filter data,
// a1+64 = in_image stride_x * step_y (row pitch in bytes).
mov.n a8,a1 # [0]
l8ui a9,a1,112 # [1] id:754 offset+0x0
l32i.n a15,a2,0 # [2] id:753
mull a10,a12,a13 # [3]
l32i a2,a1,68 # [4] gra_spill_temp_1
s32i a10,a1,64 # [5] gra_spill_temp_0
sext a9,a9,7 # [6]
movi.n a10,4 # [7]
# loop-count fixed at 4
// Four iterations of eight stores: 32 byte copies of offset at a1+0..a1+31.
loop a10,.LBB140_dspi_dotprod_off_s8_aes3 # [8]
.LBB135_dspi_dotprod_off_s8_aes3: # 0x8d
s8i a9,a8,0 # [0*II+0] id:755 temp_offset+0x0
s8i a9,a8,1 # [0*II+1] id:755 temp_offset+0x0
s8i a9,a8,2 # [0*II+2] id:755 temp_offset+0x0
s8i a9,a8,3 # [0*II+3] id:755 temp_offset+0x0
s8i a9,a8,4 # [0*II+4] id:755 temp_offset+0x0
s8i a9,a8,5 # [0*II+5] id:755 temp_offset+0x0
s8i a9,a8,6 # [0*II+6] id:755 temp_offset+0x0
s8i a9,a8,7 # [0*II+7] id:755 temp_offset+0x0
addi.n a8,a8,8 # [0*II+8]
.LBB140_dspi_dotprod_off_s8_aes3: # 0xa7
// a3 = count_y (row counter for the loops below). Clear both ACCX halves and
// load q6 from the offset buffer, so every lane of q6 holds offset (the whole
// buffer was filled with the same value).
mov.n a3,a6 # [0]
addi a11,a5,-48 # [1]
addi.n a12,a1,8 # [3] temp_offset+8
movi.n a13,0 # [4]
wur.accx_0 a13 # [5]
wur.accx_1 a13 # [6]
ee.vld.128.ip q6,a12,0 # [7] id:756
s32i.n a12,a1,32 # [8] offset_data_ptr
// Dispatch on count_x. Each specialised loop saves its live filter vector to
// a1+48 and jumps back into this chain.
//   48 -> .LBB34, 32 -> .LBB43, 16 -> .LBB50, 64 -> .LBB57, 128 -> .LBB64,
//   more than 128 -> generic loop below.
beqz a11,.LBB34_dspi_dotprod_off_s8_aes3 # [9]
l32i a2,a1,68 # [0] gra_spill_temp_1
ee.vld.128.ip q0,a2,16 # [2] id:771
st.qr q0,a1,48 # [3] q0
.Lt_0_24578: # 0xc6
addi a14,a5,-32 # [0]
beqz a14,.LBB43_dspi_dotprod_off_s8_aes3 # [1]
.Lt_0_26626: # 0xcc
.Lt_0_26114: # 0xcc
addi a8,a5,-16 # [0]
beqz a8,.LBB50_dspi_dotprod_off_s8_aes3 # [1]
.Lt_0_28162: # 0xd2
.Lt_0_27650: # 0xd2
addi a9,a5,-64 # [0]
beqz a9,.LBB57_dspi_dotprod_off_s8_aes3 # [1]
.Lt_0_29698: # 0xd8
.Lt_0_29186: # 0xd8
addi a10,a5,-128 # [0]
beqz a10,.LBB64_dspi_dotprod_off_s8_aes3 # [1]
movi a11,128 # [0]
bge a11,a5,.Lt_0_32514 # [1]
// Generic path, count_x > 128. a12 counts rows up to a3 (count_y).
// a13 = count_x / 32 - 1 (signed divide) is the trip count of the inner
// hardware loop; a14 = row pitch - count_x + 16 is the pointer step used at
// the end of each row.
// NOTE(review): each inner iteration and the tail consume four 16-byte
// filter vectors (64 elements) while the trip count divides by 32 - confirm
// this matches the reference for every count_x > 128.
movi.n a12,0 # [0]
ee.ld.128.usar.ip q1,a15,16 # [1] id:833
ee.ld.128.usar.ip q2,a15,16 # [2] id:834
ee.src.q.ld.ip q3,a15,16,q1,q2 # [4] id:835
beqz.n a3,.Lt_0_32514 # [5]
ld.qr q0,a1,48 # [0] q0
l32i a14,a1,64 # [1] gra_spill_temp_0
addi a13,a5,31 # [2]
movgez a13,a5,a5 # [3]
srai a13,a13,5 # [4]
sub a14,a14,a5 # [5]
addi a14,a14,16 # [6]
addi.n a13,a13,-1 # [7]
.Lt_0_33282: # 0x108
beqz.n a13,.Lt_0_33538 # [0]
loopnez a13,.LBB277_dspi_dotprod_off_s8_aes3 # [0]
.LBB275_dspi_dotprod_off_s8_aes3: # 0x10d
// Pattern used throughout: an instruction reading from a15 (input data) and
// multiplying by a filter register is paired with one reading from a2
// (filter data) and multiplying the same input register by q6 (offset),
// which together accumulate input * (filter + offset) into ACCX.
ee.vmulas.s8.accx.ld.ip.qup q0,a15,16,q0,q1,q2,q3 # [0*II+0] id:836
ee.vmulas.s8.accx.ld.ip q1,a2,16,q1,q6 # [0*II+1] id:837
ee.vmulas.s8.accx.ld.ip.qup q1,a15,16,q1,q2,q3,q0 # [0*II+3] id:838
ee.vmulas.s8.accx.ld.ip q4,a2,16,q2,q6 # [0*II+4] id:839
ee.vmulas.s8.accx.ld.ip.qup q2,a15,16,q4,q3,q0,q1 # [0*II+6] id:840
ee.vmulas.s8.accx.ld.ip q4,a2,16,q3,q6 # [0*II+7] id:841
ee.vmulas.s8.accx.ld.ip.qup q3,a15,16,q4,q0,q1,q2 # [0*II+9] id:842
ee.vmulas.s8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+10] id:843
.LBB277_dspi_dotprod_off_s8_aes3: # 0x12d
.Lt_0_33538: # 0x12d
// Row tail: same pattern, but the input pointer is stepped by a14, -16 and
// 32 to move on to the next row before the row counter is compared.
ee.vmulas.s8.accx.ld.ip.qup q4,a15,16,q0,q1,q2,q3 # [0] id:844
ee.vmulas.s8.accx.ld.ip q1,a2,16,q1,q6 # [1] id:845
movi.n a8,32 # [2]
ee.vmulas.s8.accx.ld.xp.qup q0,a15,a14,q1,q2,q3,q4 # [3] id:846
ee.vmulas.s8.accx.ld.ip q7,a2,16,q2,q6 # [4] id:847
movi.n a9,-16 # [5]
ee.vmulas.s8.accx.ld.xp.qup q2,a15,a9,q7,q3,q4,q0 # [6] id:848
ee.vmulas.s8.accx.ld.ip q5,a2,16,q3,q6 # [7] id:850
ee.ld.128.usar.xp q1,a15,a8 # [8] id:849
addi.n a12,a12,1 # [9]
ee.vmulas.s8.accx.ld.ip.qup q3,a15,16,q5,q4,q1,q2 # [10] id:851
ee.vmulas.s8.accx.ld.ip q0,a2,16,q4,q6 # [11] id:852
bne a12,a3,.Lt_0_33282 # [12]
.Lt_0_32514: # 0x159
.Lt_0_32258: # 0x159
// Common finish: a10 = ACCX_0 (low 32 bits of the accumulator),
// a11 = 1 << (shift - 1); the low byte of (a10 + a11) >> shift (arithmetic)
// is stored to *out_value and 0 is returned.
// NOTE(review): unlike the s16 variant there is no separate path for
// shift < 1 - confirm callers always pass shift >= 1.
movi.n a2,0 # [0]
rur.accx_0 a10 # [1]
addi.n a12,a7,-1 # [2]
movi.n a11,1 # [3]
ssl a12 # [4]
sll a11,a11 # [5]
ssr a7 # [6]
add.n a10,a10,a11 # [7]
sra a10,a10 # [8]
s8i a10,a4,0 # [9] id:854
retw.n # [10]
.Lt_0_36098: # 0x175
.Lt_0_20226: # 0x175
// Fallback 2 (input-image or size precondition failed): same forwarding to
// dspi_dotprod_off_s8_ansi as fallback 1, including offset.
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
l8ui a8,a1,112 # [6] offset+0x0
sext a8,a8,7 # [7]
s32i.n a8,a1,0 # [8] outgoing stack argument: offset
call8 dspi_dotprod_off_s8_ansi # [9] dspi_dotprod_off_s8_ansi
mov.n a2,a10 # [0]
retw.n # [1]
.LBB27_dspi_dotprod_off_s8_aes3: # 0x188
// count_x > 64: continue on the vector path when bit 0 of count_x is clear,
// otherwise fallback 3 (same forwarding as above).
extui a14,a5,0,1 # [0]
beqz a14,.Lt_0_36610 # [1]
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
l8ui a8,a1,112 # [6] offset+0x0
sext a8,a8,7 # [7]
s32i.n a8,a1,0 # [8] outgoing stack argument: offset
call8 dspi_dotprod_off_s8_ansi # [9] dspi_dotprod_off_s8_ansi
mov.n a2,a10 # [0]
retw.n # [1]
.LBB34_dspi_dotprod_off_s8_aes3: # 0x1a1
// count_x == 48: hardware loop runs count_y (a6) times; a12 = row pitch - 32
// steps the input pointer to the next row.
ee.ld.128.usar.ip q0,a15,16 # [0] id:760
ee.ld.128.usar.ip q2,a15,16 # [1] id:761
ee.src.q.ld.ip q3,a15,16,q0,q2 # [3] id:762
beqz.n a6,.Lt_0_24578 # [4]
movi.n a10,32 # [0]
l32i a12,a1,64 # [1] gra_spill_temp_0
movi.n a11,-16 # [2]
addi a12,a12,-32 # [3]
loopgtz a6,.LBB163_dspi_dotprod_off_s8_aes3 # [4]
.LBB161_dspi_dotprod_off_s8_aes3: # 0x1b9
ee.vmulas.s8.accx.ld.ip q1,a2,16,q0,q6 # [0*II+0] id:763
ee.vmulas.s8.accx.ld.xp.qup q1,a15,a12,q1,q0,q2,q3 # [0*II+2] id:764
ee.vmulas.s8.accx.ld.ip q0,a2,16,q2,q6 # [0*II+3] id:765
ee.vmulas.s8.accx.ld.xp.qup q2,a15,a11,q0,q2,q3,q1 # [0*II+5] id:766
ee.vmulas.s8.accx.ld.ip q1,a2,16,q3,q6 # [0*II+6] id:768
ee.ld.128.usar.xp q0,a15,a10 # [0*II+7] id:767
ee.vmulas.s8.accx.ld.ip.qup q3,a15,16,q1,q3,q0,q2 # [0*II+9] id:769
.LBB163_dspi_dotprod_off_s8_aes3: # 0x1d4
st.qr q1,a1,48 # [0] q0
j .Lt_0_24578 # [1]
.LBB43_dspi_dotprod_off_s8_aes3: # 0x1da
// count_x == 32: a3 = count_y >> 1 iterations; each iteration consumes four
// filter vectors.
srli a3,a6,1 # [0]
l32i a12,a1,64 # [1] gra_spill_temp_0
ee.ld.128.usar.ip q1,a15,16 # [2] id:772
ee.ld.128.usar.ip q2,a15,16 # [3] id:773
addi a12,a12,-16 # [5]
ee.src.q.ld.xp q3,a15,a12,q1,q2 # [6] id:774
beqz.n a3,.Lt_0_26626 # [7]
ld.qr q0,a1,48 # [0] q0
movi.n a10,32 # [1]
movi.n a11,-16 # [2]
loopnez a3,.LBB186_dspi_dotprod_off_s8_aes3 # [3]
.LBB184_dspi_dotprod_off_s8_aes3: # 0x1f8
ee.vmulas.s8.accx.ld.xp.qup q0,a15,a11,q0,q1,q2,q3 # [0*II+0] id:775
ee.vmulas.s8.accx.ld.ip q3,a2,16,q1,q6 # [0*II+1] id:776
ee.ld.128.usar.xp q1,a15,a10 # [0*II+2] id:777
ee.vmulas.s8.accx.ld.xp.qup q3,a15,a12,q3,q2,q1,q0 # [0*II+4] id:778
ee.vmulas.s8.accx.ld.ip q4,a2,16,q2,q6 # [0*II+5] id:779
ee.vmulas.s8.accx.ld.xp.qup q2,a15,a11,q4,q1,q0,q3 # [0*II+7] id:780
ee.vmulas.s8.accx.ld.ip q3,a2,16,q1,q6 # [0*II+8] id:781
ee.ld.128.usar.xp q1,a15,a10 # [0*II+9] id:782
ee.vmulas.s8.accx.ld.xp.qup q3,a15,a12,q3,q0,q1,q2 # [0*II+11] id:783
ee.vmulas.s8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+12] id:784
.LBB186_dspi_dotprod_off_s8_aes3: # 0x21e
st.qr q0,a1,48 # [0] q0
j .Lt_0_26626 # [1]
.LBB50_dspi_dotprod_off_s8_aes3: # 0x224
// count_x == 16: a3 = a3 >> 2 iterations; each iteration consumes four
// filter vectors.
srli a3,a3,2 # [0]
movi.n a13,-16 # [1]
l32i a11,a1,64 # [2] gra_spill_temp_0
addi a15,a15,16 # [3]
addi a11,a11,16 # [4]
ee.ld.128.usar.xp q2,a15,a13 # [5] id:785
ee.ld.128.usar.xp q1,a15,a11 # [6] id:786
ee.src.q.ld.xp q3,a15,a13,q1,q2 # [8] id:787
ee.ld.128.usar.xp q2,a15,a11 # [9] id:788
beqz.n a3,.Lt_0_28162 # [10]
ld.qr q0,a1,48 # [0] q0
movi.n a10,-16 # [1]
loopnez a3,.LBB209_dspi_dotprod_off_s8_aes3 # [2]
.LBB207_dspi_dotprod_off_s8_aes3: # 0x248
ee.vmulas.s8.accx.ld.xp.qup q3,a15,a10,q0,q1,q2,q3 # [0*II+0] id:789
ee.vmulas.s8.accx.ld.ip q0,a2,16,q1,q6 # [0*II+1] id:790
ee.ld.128.usar.xp q1,a15,a11 # [0*II+2] id:791
ee.vmulas.s8.accx.ld.xp.qup q3,a15,a10,q0,q2,q1,q3 # [0*II+4] id:792
ee.vmulas.s8.accx.ld.ip q0,a2,16,q2,q6 # [0*II+5] id:793
ee.ld.128.usar.xp q4,a15,a11 # [0*II+6] id:794
ee.vmulas.s8.accx.ld.xp.qup q3,a15,a10,q0,q1,q4,q3 # [0*II+8] id:795
ee.vmulas.s8.accx.ld.ip q0,a2,16,q1,q6 # [0*II+9] id:796
ee.ld.128.usar.xp q1,a15,a11 # [0*II+10] id:797
ee.vmulas.s8.accx.ld.xp.qup q3,a15,a10,q0,q4,q1,q3 # [0*II+12] id:798
ee.vmulas.s8.accx.ld.ip q0,a2,16,q4,q6 # [0*II+13] id:799
ee.ld.128.usar.xp q2,a15,a11 # [0*II+14] id:800
.LBB209_dspi_dotprod_off_s8_aes3: # 0x274
st.qr q0,a1,48 # [0] q0
j .Lt_0_28162 # [1]
.LBB57_dspi_dotprod_off_s8_aes3: # 0x27a
// count_x == 64: a3 iterations; a12 = row pitch - count_x + 16.
ee.ld.128.usar.ip q1,a15,16 # [0] id:801
ee.ld.128.usar.ip q2,a15,16 # [1] id:802
ee.src.q.ld.ip q3,a15,16,q1,q2 # [3] id:803
beqz.n a3,.Lt_0_29698 # [4]
ld.qr q0,a1,48 # [0] q0
movi.n a10,32 # [1]
l32i a12,a1,64 # [2] gra_spill_temp_0
movi.n a11,-16 # [3]
sub a12,a12,a5 # [4]
addi a12,a12,16 # [5]
loopnez a3,.LBB232_dspi_dotprod_off_s8_aes3 # [6]
.LBB230_dspi_dotprod_off_s8_aes3: # 0x298
ee.vmulas.s8.accx.ld.ip.qup q0,a15,16,q0,q1,q2,q3 # [0*II+0] id:804
ee.vmulas.s8.accx.ld.ip q4,a2,16,q1,q6 # [0*II+1] id:805
ee.vmulas.s8.accx.ld.xp.qup q4,a15,a12,q4,q2,q3,q0 # [0*II+3] id:806
ee.vmulas.s8.accx.ld.ip q1,a2,16,q2,q6 # [0*II+4] id:807
ee.vmulas.s8.accx.ld.xp.qup q2,a15,a11,q1,q3,q0,q4 # [0*II+6] id:808
ee.vmulas.s8.accx.ld.ip q4,a2,16,q3,q6 # [0*II+7] id:809
ee.ld.128.usar.xp q1,a15,a10 # [0*II+8] id:810
ee.vmulas.s8.accx.ld.ip.qup q3,a15,16,q4,q0,q1,q2 # [0*II+10] id:811
ee.vmulas.s8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+11] id:812
.LBB232_dspi_dotprod_off_s8_aes3: # 0x2bb
st.qr q0,a1,48 # [0] q0
j .Lt_0_29698 # [1]
.LBB64_dspi_dotprod_off_s8_aes3: # 0x2c1
// count_x == 128: a3 iterations with the input pointer kept in a8; this path
// has its own copy of the finish sequence after the loop.
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
l32i a12,a1,64 # [2] gra_spill_temp_0
ee.ld.128.usar.ip q1,a15,16 # [3] id:813
ee.ld.128.usar.ip q2,a15,16 # [4] id:814
sub a12,a12,a5 # [6]
addi a12,a12,16 # [7]
ld.qr q0,a1,48 # [8] q0
ee.src.q.ld.ip q3,a15,16,q1,q2 # [9] id:815
mov.n a8,a15 # [10]
loopnez a3,.LBB254_dspi_dotprod_off_s8_aes3 # [11]
.LBB252_dspi_dotprod_off_s8_aes3: # 0x2df
ee.vmulas.s8.accx.ld.ip.qup q0,a8,16,q0,q1,q2,q3 # [0*II+0] id:816
ee.vmulas.s8.accx.ld.ip q4,a2,16,q1,q6 # [0*II+1] id:817
ee.vmulas.s8.accx.ld.ip.qup q4,a8,16,q4,q2,q3,q0 # [0*II+3] id:818
ee.vmulas.s8.accx.ld.ip q1,a2,16,q2,q6 # [0*II+4] id:819
ee.vmulas.s8.accx.ld.ip.qup q1,a8,16,q1,q3,q0,q4 # [0*II+6] id:820
ee.vmulas.s8.accx.ld.ip q5,a2,16,q3,q6 # [0*II+7] id:821
ee.vmulas.s8.accx.ld.ip.qup q5,a8,16,q5,q0,q4,q1 # [0*II+9] id:822
ee.vmulas.s8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+10] id:823
ee.vmulas.s8.accx.ld.ip.qup q0,a8,16,q0,q4,q1,q5 # [0*II+12] id:824
ee.vmulas.s8.accx.ld.ip q4,a2,16,q4,q6 # [0*II+13] id:825
ee.vmulas.s8.accx.ld.xp.qup q4,a8,a12,q4,q1,q5,q0 # [0*II+15] id:826
ee.vmulas.s8.accx.ld.ip q1,a2,16,q1,q6 # [0*II+16] id:827
ee.vmulas.s8.accx.ld.xp.qup q2,a8,a11,q1,q5,q0,q4 # [0*II+18] id:828
ee.vmulas.s8.accx.ld.ip q4,a2,16,q5,q6 # [0*II+19] id:829
ee.ld.128.usar.xp q1,a8,a10 # [0*II+20] id:830
ee.vmulas.s8.accx.ld.ip.qup q3,a8,16,q4,q0,q1,q2 # [0*II+22] id:831
ee.vmulas.s8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+23] id:832
.LBB254_dspi_dotprod_off_s8_aes3: # 0x322
// Finish for the count_x == 128 path: same rounding, shift and byte store as
// the common finish above.
movi.n a2,0 # [0]
movi.n a11,1 # [1]
addi.n a12,a7,-1 # [2]
rur.accx_0 a10 # [3]
ssl a12 # [4]
sll a11,a11 # [5]
ssr a7 # [6]
add.n a10,a10,a11 # [7]
sra a10,a10 # [8]
s8i a10,a4,0 # [9] id:854
retw.n # [10]
#endif // dspi_dotprod_aes3_enabled

View File

@@ -0,0 +1,49 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod.h"
// Reference (plain C) 2-D dot product of signed 8-bit data with an offset
// added to every filter value:
//
//   acc = sum over y < count_y, x < count_x of
//             in_image[y][x] * (filter[y][x] + offset)
//
// in_image, filter - image descriptors; data is read with step_x between
//                    elements and stride_x * step_y elements between rows
// out_value        - receives the rounded, right-shifted accumulator,
//                    truncated to 8 bits
// count_x, count_y - number of elements per row / number of rows
// shift            - right shift applied to the 32-bit accumulator; rounding
//                    to nearest is applied only when shift > 0
// offset           - constant added to each filter value
//
// Returns ESP_ERR_DSP_PARAM_OUTOFRANGE when step * count exceeds the stride
// on either axis of either image, ESP_OK otherwise.
esp_err_t dspi_dotprod_off_s8_ansi(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift, int8_t offset)
{
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    int8_t *i_data = (int8_t *)in_image->data;
    int8_t *f_data = (int8_t *)filter->data;
    // Distance between consecutive rows, in elements.
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;

    int32_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            acc += (int16_t)i_data[in_image->step_x * x] * ((int16_t)f_data[filter->step_x * x] + (int16_t)offset);
        }
        i_data += i_step;
        f_data += f_step;
    }

    // Round to nearest and scale. The guard avoids the undefined
    // "1 << -1" that the unguarded expression evaluates for shift == 0.
    if (shift > 0) {
        acc += (int32_t)1 << (shift - 1);
        acc >>= shift;
    }
    *out_value = (int8_t)acc;
    return ESP_OK;
}

View File

@@ -0,0 +1,102 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_arp4_enabled == 1)
#include "dsp_err_codes.h"
.text
.align 4
.global dspi_dotprod_off_s8_arp4
.global dspi_dotprod_off_s8_ansi
.type dspi_dotprod_off_s8_arp4,@function
// esp_err_t dspi_dotprod_off_s8_arp4(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift, int8_t offset);
//
// 2-D dot product of signed 8-bit data with a constant offset added to every
// filter value, using the esp.* vector instructions.
// Vector path is taken only when both step_x values are 1 and count_x is a
// multiple of 16; otherwise control tail-jumps to dspi_dotprod_off_s8_ansi
// with all arguments untouched.
// Returns 0 in a0 from the vector path; the result is stored as a single
// byte to *out_value, matching the int8_t output of dspi_dotprod_off_s8_ansi
// (a halfword store would write one byte past the output).
// NOTE(review): the vector path performs none of the "step * count > stride"
// range checks that dspi_dotprod_off_s8_ansi does - confirm callers
// validate the descriptors.
// NOTE(review): the row loop is bottom-tested, so one row is processed even
// when count_y <= 0 - confirm this cannot happen.
dspi_dotprod_off_s8_arp4:
// in_image - a0
// filter - a1
// out_value - a2
// count_x - a3
// count_y - a4
// shift - a5
// offset - a6
// i_data - t0
// f_data - t1
// i_step - t2
// f_step - t3
// t4 - current i_data
// t5 - current f_data
// t6 - vector iterations per row (count_x / 16)
lw t1, 4(a0) // load in_image->step_x
lw t2, 4(a1) // load filter->step_x
or t1, t1, t2
addi t1, t1, -1 // 0 only when (step_x | step_x) == 1
andi t2, a3, 15 // count_x % 16
or t1, t1, t2
beqz t1, .dspi_dotprod_off_s8_arp4_body
j dspi_dotprod_off_s8_ansi // tail call: arguments are still in a0..a6
.dspi_dotprod_off_s8_arp4_body:
add sp, sp, -16
sw a6, 0(sp) // spill offset so it can be loaded into a vector register
mv t6, sp
esp.vldbc.8.ip q2, t6, 0 // q2 = offset in every 8-bit lane (broadcast load)
lw t0, 0(a0) // i_data
lw t1, 0(a1) // f_data
lw t2, 8(a0) // step_y
lw t4, 12(a0) // stride_x
mul t2, t4, t2 // i_step in bytes (1 byte per element)
lw t3, 8(a1) // step_y
lw t5, 12(a1) // stride_x
mul t3, t5, t3 // f_step in bytes (1 byte per element)
srli t6, a3, 4 // t6 = count_x / 16 (vector iterations per row)
// Preload the accumulator with the rounding term 1 << (shift - 1).
// NOTE(review): for shift == 0, a7 is -1 and RV32 sll uses only the low five
// bits of the shift amount, so t4 becomes 1 << 31 - confirm intended.
addi a7, a5, -1
li t4, 1
sll t4, t4, a7
esp.zero.xacc
esp.movx.w.xacc.l t4
.loop_count_y:
mv t4, t0
mv t5, t1
esp.vld.128.ip q1, t5, 16 // q1 = first 16 filter values of the row (t5 = current f_data)
esp.lp.setup 0, t6, .loop_count_x // hardware loop: t6 iterations, last instruction at .loop_count_x
esp.vld.128.ip q0, t4, 16 // q0 = next 16 input values (t4 = current i_data)
esp.vadd.s8 q3, q2, q1 // q3 = filter + offset
.loop_count_x: esp.vmulas.s8.xacc.ld.ip q1, t5, 16, q0, q3 // xacc += q0 * q3; q1 = next 16 filter values
add t0, t0, t2 // next input row
add t1, t1, t3 // next filter row
add a4,a4, -1
bgtz a4, .loop_count_y
esp.srs.s.xacc t5, a5 // shift accx register by final_shift amount (a5), save the lower 32bits to t5
sb t5, 0(a2) // store the low byte of the result to *out_value (int8_t)
li a0,0 // return 0
add sp,sp,16
ret
#endif // dspi_dotprod_arp4_enabled

View File

@@ -0,0 +1,417 @@
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_aes3_enabled == 1)
.text
.align 4
// Value returned when one of the range checks below fails
// (458755 = 0x70003; presumably ESP_ERR_DSP_PARAM_OUTOFRANGE - TODO confirm).
.literal .LC0_1_61, 458755
# Program Unit: dspi_dotprod_off_u16_aes3
// dspi_dotprod_off_u16_aes3: uint16 2D dot product with offset using the ee.*
// vector instructions. Compiler-generated code; it takes the same arguments as
// dspi_dotprod_off_u16_ansi and forwards them to that function whenever the
// vector path cannot be used.
//
// Registers after `entry`:
//   a2 - in_image    a3 - filter     a4 - out_value
//   a5 - count_x     a6 - count_y    a7 - shift
//   offset is the 7th argument and is read from the stack at a1+144.
// Image descriptor fields as accessed here: +0 data, +4 step_x, +8 step_y,
// +12 stride_x, +16 stride_y.
// Frame slots: a1+0 outgoing stack argument, a1+16.. temp_offset buffer,
// a1+48 offset_data_ptr, a1+64 spilled q0, a1+96/100/104 gra_spill_temp_0/1/2.
.type dspi_dotprod_off_u16_aes3, @function
.align 4
.global dspi_dotprod_off_u16_aes3
dspi_dotprod_off_u16_aes3: # 0x4
.LBB1_dspi_dotprod_off_u16_aes3: # 0x4
entry a1,144 #
// Range checks: step * count must not exceed the stride, for both images and
// both axes; otherwise return .LC0_1_61.
l32i.n a10,a2,4 # [0] id:760
l32i.n a12,a2,12 # [1] id:759
mull a8,a10,a5 # [2]
blt a12,a8,.LBB89_dspi_dotprod_off_u16_aes3 # [4]
l32i.n a13,a2,8 # [0] id:761
l32i.n a9,a2,16 # [1] id:762
mull a11,a13,a6 # [2]
blt a9,a11,.LBB89_dspi_dotprod_off_u16_aes3 # [4]
l32i.n a15,a3,4 # [0] id:764
l32i.n a14,a3,12 # [1] id:763
mull a11,a15,a5 # [2]
blt a14,a11,.LBB89_dspi_dotprod_off_u16_aes3 # [4]
l32i.n a8,a3,16 # [0] id:766
l32i.n a9,a3,8 # [1] id:765
s32i a9,a1,104 # [2] gra_spill_temp_2
mull a9,a9,a6 # [3]
blt a8,a9,.LBB89_dspi_dotprod_off_u16_aes3 # [5]
// Fast-path conditions on the filter: data pointer has bit 0 clear,
// stride_x == step_x * count_x, step_x == 1 and step_y == 1.
l32i.n a8,a3,0 # [0] id:767
s32i a8,a1,100 # [1] gra_spill_temp_1
bbsi a8,0,.Lt_0_36354 # [2]
bne a14,a11,.Lt_0_36354 # [0]
bnei a15,1,.Lt_0_36354 # [0]
l32i a9,a1,104 # [0] gra_spill_temp_2
beqi a9,1,.Lt_0_19458 # [2]
// Fallback: forward all seven arguments to the ANSI implementation
// (a10..a15 are the callee's a2..a7 for call8; offset goes to a1+0).
.Lt_0_36354: # 0x46
.Lt_0_19714: # 0x46
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
l16ui a8,a1,144 # [6] id:768 offset+0x0
s32i.n a8,a1,0 # [7] id:876
.type dspi_dotprod_off_u16_ansi, @function
call8 dspi_dotprod_off_u16_ansi # [8] dspi_dotprod_off_u16_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// Range check failed: return the error literal.
.LBB89_dspi_dotprod_off_u16_aes3: # 0x5e
l32r a2,.LC0_1_61 # [0]
retw.n # [1]
// Remaining fast-path conditions: in_image step_x == 1 and step_y == 1,
// count_x a multiple of 8, count_y >= 4.
.Lt_0_19458: # 0x63
addi.n a9,a10,-1 # [0]
bnez a9,.Lt_0_37122 # [1]
addi.n a10,a13,-1 # [0]
bnez a10,.Lt_0_37122 # [1]
extui a11,a5,0,3 # [0]
bnez.n a11,.Lt_0_37122 # [1]
blti a6,4,.Lt_0_37122 # [0]
movi.n a14,32 # [0]
blt a14,a5,.LBB27_dspi_dotprod_off_u16_aes3 # [1]
// Vector path setup. a15 = in_image->data, a2 = filter->data,
// gra_spill_temp_0 = in_image stride_x * step_y * 2 (row distance in bytes).
.Lt_0_37634: # 0x7a
.Lt_0_21506: # 0x7a
l16ui a9,a1,144 # [0] id:768 offset+0x0
addi a8,a1,16 # [1] temp_offset
l32i.n a15,a2,0 # [2] id:769
mull a10,a12,a13 # [3]
l32i a2,a1,100 # [4] gra_spill_temp_1
slli a10,a10,1 # [5]
s32i a10,a1,96 # [6] gra_spill_temp_0
movi.n a10,2 # [7]
# loop-count fixed at 2
// Fill temp_offset (16 halfwords at a1+16) with copies of offset.
loop a10,.LBB143_dspi_dotprod_off_u16_aes3 # [8]
.LBB138_dspi_dotprod_off_u16_aes3: # 0x93
s16i a9,a8,0 # [0*II+0] id:770 temp_offset+0x0
s16i a9,a8,2 # [0*II+1] id:770 temp_offset+0x0
s16i a9,a8,4 # [0*II+2] id:770 temp_offset+0x0
s16i a9,a8,6 # [0*II+3] id:770 temp_offset+0x0
s16i a9,a8,8 # [0*II+4] id:770 temp_offset+0x0
s16i a9,a8,10 # [0*II+5] id:770 temp_offset+0x0
s16i a9,a8,12 # [0*II+6] id:770 temp_offset+0x0
s16i a9,a8,14 # [0*II+7] id:770 temp_offset+0x0
addi a8,a8,16 # [0*II+8]
.LBB143_dspi_dotprod_off_u16_aes3: # 0xae
// a3 = row counter (count_y); clear SAR_BYTE and the accumulator;
// q6 = vector of offset values loaded from temp_offset+8.
mov.n a3,a6 # [0]
addi a11,a5,-24 # [1]
addi a12,a1,24 # [3] temp_offset+8
movi.n a13,0 # [4]
wur.sar_byte a13 # [5]
wur.accx_0 a13 # [6]
wur.accx_1 a13 # [7]
ee.vld.128.ip q6,a12,0 # [8] id:771
s32i.n a12,a1,48 # [9] offset_data_ptr
// Dispatch on count_x: 24, 16, 8, 32 and 64 have dedicated loops,
// anything above 64 uses the generic loop further down.
beqz a11,.LBB34_dspi_dotprod_off_u16_aes3 # [10]
// Not 24: preload the first filter vector and keep it in the q0 spill slot.
l32i a2,a1,100 # [0] gra_spill_temp_1
ee.vld.128.ip q0,a2,16 # [2] id:787
st.qr q0,a1,64 # [3] q0
.Lt_0_25090: # 0xd1
addi a14,a5,-16 # [0]
beqz a14,.LBB43_dspi_dotprod_off_u16_aes3 # [1]
.Lt_0_27138: # 0xd7
.Lt_0_26626: # 0xd7
addi a8,a5,-8 # [0]
beqz a8,.LBB50_dspi_dotprod_off_u16_aes3 # [1]
.Lt_0_28674: # 0xdd
.Lt_0_28162: # 0xdd
addi a9,a5,-32 # [0]
beqz a9,.LBB57_dspi_dotprod_off_u16_aes3 # [1]
.Lt_0_30210: # 0xe3
.Lt_0_29698: # 0xe3
addi a10,a5,-64 # [0]
beqz a10,.LBB64_dspi_dotprod_off_u16_aes3 # [1]
movi.n a11,64 # [0]
bge a11,a5,.Lt_0_33026 # [1]
// Generic path, count_x > 64. a12 counts processed rows up to a3;
// a13 = count_x / 32 - 1 inner iterations; a14 = row advance for a15.
movi.n a12,0 # [0]
ee.ld.128.usar.ip q1,a15,16 # [1] id:849
ee.ld.128.usar.ip q2,a15,16 # [2] id:850
ee.src.q.ld.ip q3,a15,16,q1,q2 # [4] id:851
beqz.n a3,.Lt_0_33026 # [5]
ld.qr q0,a1,64 # [0] q0
slli a8,a5,1 # [1]
l32i a14,a1,96 # [2] gra_spill_temp_0
addi a13,a5,31 # [3]
movgez a13,a5,a5 # [4]
srai a13,a13,5 # [5]
sub a14,a14,a8 # [6]
addi a14,a14,16 # [7]
addi.n a13,a13,-1 # [8]
.Lt_0_33794: # 0x115
beqz.n a13,.Lt_0_34050 # [0]
loopnez a13,.LBB280_dspi_dotprod_off_u16_aes3 # [0]
// Each instruction pair accumulates one product against the filter vector and
// one against the offset vector q6, while loading the next operands.
.LBB278_dspi_dotprod_off_u16_aes3: # 0x11a
ee.vmulas.u16.accx.ld.ip.qup q0,a15,16,q0,q1,q2,q3 # [0*II+0] id:852
ee.vmulas.u16.accx.ld.ip q1,a2,16,q1,q6 # [0*II+1] id:853
ee.vmulas.u16.accx.ld.ip.qup q1,a15,16,q1,q2,q3,q0 # [0*II+3] id:854
ee.vmulas.u16.accx.ld.ip q4,a2,16,q2,q6 # [0*II+4] id:855
ee.vmulas.u16.accx.ld.ip.qup q2,a15,16,q4,q3,q0,q1 # [0*II+6] id:856
ee.vmulas.u16.accx.ld.ip q4,a2,16,q3,q6 # [0*II+7] id:857
ee.vmulas.u16.accx.ld.ip.qup q3,a15,16,q4,q0,q1,q2 # [0*II+9] id:858
ee.vmulas.u16.accx.ld.ip q0,a2,16,q0,q6 # [0*II+10] id:859
.LBB280_dspi_dotprod_off_u16_aes3: # 0x13a
// Row tail: same pattern, but the image pointer a15 is advanced with a14 to
// move to the next row.
.Lt_0_34050: # 0x13a
ee.vmulas.u16.accx.ld.ip.qup q4,a15,16,q0,q1,q2,q3 # [0] id:860
ee.vmulas.u16.accx.ld.ip q1,a2,16,q1,q6 # [1] id:861
movi.n a9,32 # [2]
ee.vmulas.u16.accx.ld.xp.qup q0,a15,a14,q1,q2,q3,q4 # [3] id:862
ee.vmulas.u16.accx.ld.ip q7,a2,16,q2,q6 # [4] id:863
movi.n a10,-16 # [5]
ee.vmulas.u16.accx.ld.xp.qup q2,a15,a10,q7,q3,q4,q0 # [6] id:864
ee.vmulas.u16.accx.ld.ip q5,a2,16,q3,q6 # [7] id:866
ee.ld.128.usar.xp q1,a15,a9 # [8] id:865
addi.n a12,a12,1 # [9]
ee.vmulas.u16.accx.ld.ip.qup q3,a15,16,q5,q4,q1,q2 # [10] id:867
ee.vmulas.u16.accx.ld.ip q0,a2,16,q4,q6 # [11] id:868
bne a12,a3,.Lt_0_33794 # [12]
// Finalisation: a10:a9 = accumulator. For shift >= 1 compute
// ((acc >> (shift - 1)) + 1) >> 1, i.e. a rounded right shift, and store the
// low 16 bits. shift < 1 is handled at .Lt_0_35586.
.Lt_0_33026: # 0x166
.Lt_0_32770: # 0x166
rur.accx_0 a9 # [0]
rur.accx_1 a10 # [1]
blti a7,1,.Lt_0_35586 # [2]
movi.n a2,0 # [0]
addi a13,a7,-33 # [1]
addi.n a14,a7,-1 # [2]
ssr a14 # [3]
sra a12,a10 # [4]
src a11,a10,a9 # [5]
movgez a11,a12,a13 # [6]
addi.n a11,a11,1 # [7]
srli a11,a11,1 # [8]
s16i a11,a4,0 # [9] id:874
retw.n # [10]
// Fallback (in_image layout, count_x or count_y not suitable): call the ANSI
// implementation with the original arguments.
.Lt_0_37122: # 0x18c
.Lt_0_20738: # 0x18c
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
l16ui a8,a1,144 # [6] id:768 offset+0x0
s32i.n a8,a1,0 # [7] id:877
call8 dspi_dotprod_off_u16_ansi # [8] dspi_dotprod_off_u16_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// count_x > 32: continue on the vector path when bit 0 of count_x is clear,
// otherwise fall back to the ANSI implementation.
.LBB27_dspi_dotprod_off_u16_aes3: # 0x1a4
extui a9,a5,0,1 # [0]
beqz a9,.Lt_0_37634 # [1]
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
l16ui a8,a1,144 # [6] id:768 offset+0x0
s32i.n a8,a1,0 # [7] id:878
call8 dspi_dotprod_off_u16_ansi # [8] dspi_dotprod_off_u16_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// count_x == 24: one loop pass per row, a6 (count_y) passes.
.LBB34_dspi_dotprod_off_u16_aes3: # 0x1c2
ee.ld.128.usar.ip q0,a15,16 # [0] id:776
ee.ld.128.usar.ip q2,a15,16 # [1] id:777
ee.src.q.ld.ip q3,a15,16,q0,q2 # [3] id:778
beqz.n a6,.Lt_0_25090 # [4]
movi.n a10,32 # [0]
l32i a12,a1,96 # [1] gra_spill_temp_0
movi.n a11,-16 # [2]
addi a12,a12,-32 # [3]
loopgtz a6,.LBB166_dspi_dotprod_off_u16_aes3 # [4]
.LBB164_dspi_dotprod_off_u16_aes3: # 0x1da
ee.vmulas.u16.accx.ld.ip q1,a2,16,q0,q6 # [0*II+0] id:779
ee.vmulas.u16.accx.ld.xp.qup q1,a15,a12,q1,q0,q2,q3 # [0*II+2] id:780
ee.vmulas.u16.accx.ld.ip q0,a2,16,q2,q6 # [0*II+3] id:781
ee.vmulas.u16.accx.ld.xp.qup q2,a15,a11,q0,q2,q3,q1 # [0*II+5] id:782
ee.vmulas.u16.accx.ld.ip q1,a2,16,q3,q6 # [0*II+6] id:784
ee.ld.128.usar.xp q0,a15,a10 # [0*II+7] id:783
ee.vmulas.u16.accx.ld.ip.qup q3,a15,16,q1,q3,q0,q2 # [0*II+9] id:785
.LBB166_dspi_dotprod_off_u16_aes3: # 0x1f5
st.qr q1,a1,64 # [0] q0
j .Lt_0_25090 # [1]
// count_x == 16: a3 = count_y / 2 loop passes.
.LBB43_dspi_dotprod_off_u16_aes3: # 0x1fb
srli a3,a6,1 # [0]
l32i a12,a1,96 # [1] gra_spill_temp_0
ee.ld.128.usar.ip q1,a15,16 # [2] id:788
ee.ld.128.usar.ip q2,a15,16 # [3] id:789
addi a12,a12,-16 # [5]
ee.src.q.ld.xp q3,a15,a12,q1,q2 # [6] id:790
beqz.n a3,.Lt_0_27138 # [7]
ld.qr q0,a1,64 # [0] q0
movi.n a10,32 # [1]
movi.n a11,-16 # [2]
loopnez a3,.LBB189_dspi_dotprod_off_u16_aes3 # [3]
.LBB187_dspi_dotprod_off_u16_aes3: # 0x219
ee.vmulas.u16.accx.ld.xp.qup q0,a15,a11,q0,q1,q2,q3 # [0*II+0] id:791
ee.vmulas.u16.accx.ld.ip q3,a2,16,q1,q6 # [0*II+1] id:792
ee.ld.128.usar.xp q1,a15,a10 # [0*II+2] id:793
ee.vmulas.u16.accx.ld.xp.qup q3,a15,a12,q3,q2,q1,q0 # [0*II+4] id:794
ee.vmulas.u16.accx.ld.ip q4,a2,16,q2,q6 # [0*II+5] id:795
ee.vmulas.u16.accx.ld.xp.qup q2,a15,a11,q4,q1,q0,q3 # [0*II+7] id:796
ee.vmulas.u16.accx.ld.ip q3,a2,16,q1,q6 # [0*II+8] id:797
ee.ld.128.usar.xp q1,a15,a10 # [0*II+9] id:798
ee.vmulas.u16.accx.ld.xp.qup q3,a15,a12,q3,q0,q1,q2 # [0*II+11] id:799
ee.vmulas.u16.accx.ld.ip q0,a2,16,q0,q6 # [0*II+12] id:800
.LBB189_dspi_dotprod_off_u16_aes3: # 0x23f
st.qr q0,a1,64 # [0] q0
j .Lt_0_27138 # [1]
// count_x == 8: a3 = a3 / 4 loop passes.
.LBB50_dspi_dotprod_off_u16_aes3: # 0x245
srli a3,a3,2 # [0]
movi.n a13,-16 # [1]
l32i a11,a1,96 # [2] gra_spill_temp_0
addi a15,a15,16 # [3]
addi a11,a11,16 # [4]
ee.ld.128.usar.xp q2,a15,a13 # [5] id:801
ee.ld.128.usar.xp q1,a15,a11 # [6] id:802
ee.src.q.ld.xp q3,a15,a13,q1,q2 # [8] id:803
ee.ld.128.usar.xp q2,a15,a11 # [9] id:804
beqz.n a3,.Lt_0_28674 # [10]
ld.qr q0,a1,64 # [0] q0
movi.n a10,-16 # [1]
loopnez a3,.LBB212_dspi_dotprod_off_u16_aes3 # [2]
.LBB210_dspi_dotprod_off_u16_aes3: # 0x269
ee.vmulas.u16.accx.ld.xp.qup q3,a15,a10,q0,q1,q2,q3 # [0*II+0] id:805
ee.vmulas.u16.accx.ld.ip q0,a2,16,q1,q6 # [0*II+1] id:806
ee.ld.128.usar.xp q1,a15,a11 # [0*II+2] id:807
ee.vmulas.u16.accx.ld.xp.qup q3,a15,a10,q0,q2,q1,q3 # [0*II+4] id:808
ee.vmulas.u16.accx.ld.ip q0,a2,16,q2,q6 # [0*II+5] id:809
ee.ld.128.usar.xp q4,a15,a11 # [0*II+6] id:810
ee.vmulas.u16.accx.ld.xp.qup q3,a15,a10,q0,q1,q4,q3 # [0*II+8] id:811
ee.vmulas.u16.accx.ld.ip q0,a2,16,q1,q6 # [0*II+9] id:812
ee.ld.128.usar.xp q1,a15,a11 # [0*II+10] id:813
ee.vmulas.u16.accx.ld.xp.qup q3,a15,a10,q0,q4,q1,q3 # [0*II+12] id:814
ee.vmulas.u16.accx.ld.ip q0,a2,16,q4,q6 # [0*II+13] id:815
ee.ld.128.usar.xp q2,a15,a11 # [0*II+14] id:816
.LBB212_dspi_dotprod_off_u16_aes3: # 0x295
st.qr q0,a1,64 # [0] q0
j .Lt_0_28674 # [1]
// count_x == 32: a3 loop passes; a12 = row advance for a15.
.LBB57_dspi_dotprod_off_u16_aes3: # 0x29b
ee.ld.128.usar.ip q1,a15,16 # [0] id:817
ee.ld.128.usar.ip q2,a15,16 # [1] id:818
ee.src.q.ld.ip q3,a15,16,q1,q2 # [3] id:819
beqz.n a3,.Lt_0_30210 # [4]
ld.qr q0,a1,64 # [0] q0
movi.n a10,32 # [1]
movi.n a11,-16 # [2]
l32i a12,a1,96 # [3] gra_spill_temp_0
slli a13,a5,1 # [4]
sub a12,a12,a13 # [5]
addi a12,a12,16 # [6]
loopnez a3,.LBB235_dspi_dotprod_off_u16_aes3 # [7]
.LBB233_dspi_dotprod_off_u16_aes3: # 0x2bc
ee.vmulas.u16.accx.ld.ip.qup q0,a15,16,q0,q1,q2,q3 # [0*II+0] id:820
ee.vmulas.u16.accx.ld.ip q4,a2,16,q1,q6 # [0*II+1] id:821
ee.vmulas.u16.accx.ld.xp.qup q4,a15,a12,q4,q2,q3,q0 # [0*II+3] id:822
ee.vmulas.u16.accx.ld.ip q1,a2,16,q2,q6 # [0*II+4] id:823
ee.vmulas.u16.accx.ld.xp.qup q2,a15,a11,q1,q3,q0,q4 # [0*II+6] id:824
ee.vmulas.u16.accx.ld.ip q4,a2,16,q3,q6 # [0*II+7] id:826
ee.ld.128.usar.xp q1,a15,a10 # [0*II+8] id:825
ee.vmulas.u16.accx.ld.ip.qup q3,a15,16,q4,q0,q1,q2 # [0*II+10] id:827
ee.vmulas.u16.accx.ld.ip q0,a2,16,q0,q6 # [0*II+11] id:828
.LBB235_dspi_dotprod_off_u16_aes3: # 0x2df
st.qr q0,a1,64 # [0] q0
j .Lt_0_30210 # [1]
// count_x == 64: a3 loop passes using a8 as the image pointer, then jump to
// the common finalisation.
.LBB64_dspi_dotprod_off_u16_aes3: # 0x2e5
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
slli a13,a5,1 # [2]
l32i a12,a1,96 # [3] gra_spill_temp_0
ee.ld.128.usar.ip q1,a15,16 # [4] id:829
ee.ld.128.usar.ip q2,a15,16 # [5] id:830
sub a12,a12,a13 # [7]
addi a12,a12,16 # [8]
ld.qr q0,a1,64 # [9] q0
ee.src.q.ld.ip q3,a15,16,q1,q2 # [10] id:831
mov.n a8,a15 # [11]
loopnez a3,.LBB257_dspi_dotprod_off_u16_aes3 # [12]
.LBB255_dspi_dotprod_off_u16_aes3: # 0x306
ee.vmulas.u16.accx.ld.ip.qup q0,a8,16,q0,q1,q2,q3 # [0*II+0] id:832
ee.vmulas.u16.accx.ld.ip q4,a2,16,q1,q6 # [0*II+1] id:833
ee.vmulas.u16.accx.ld.ip.qup q4,a8,16,q4,q2,q3,q0 # [0*II+3] id:834
ee.vmulas.u16.accx.ld.ip q1,a2,16,q2,q6 # [0*II+4] id:835
ee.vmulas.u16.accx.ld.ip.qup q1,a8,16,q1,q3,q0,q4 # [0*II+6] id:836
ee.vmulas.u16.accx.ld.ip q5,a2,16,q3,q6 # [0*II+7] id:837
ee.vmulas.u16.accx.ld.ip.qup q5,a8,16,q5,q0,q4,q1 # [0*II+9] id:838
ee.vmulas.u16.accx.ld.ip q0,a2,16,q0,q6 # [0*II+10] id:839
ee.vmulas.u16.accx.ld.ip.qup q0,a8,16,q0,q4,q1,q5 # [0*II+12] id:840
ee.vmulas.u16.accx.ld.ip q4,a2,16,q4,q6 # [0*II+13] id:841
ee.vmulas.u16.accx.ld.xp.qup q4,a8,a12,q4,q1,q5,q0 # [0*II+15] id:842
ee.vmulas.u16.accx.ld.ip q1,a2,16,q1,q6 # [0*II+16] id:843
ee.vmulas.u16.accx.ld.xp.qup q2,a8,a11,q1,q5,q0,q4 # [0*II+18] id:844
ee.vmulas.u16.accx.ld.ip q4,a2,16,q5,q6 # [0*II+19] id:846
ee.ld.128.usar.xp q1,a8,a10 # [0*II+20] id:845
ee.vmulas.u16.accx.ld.ip.qup q3,a8,16,q4,q0,q1,q2 # [0*II+22] id:847
ee.vmulas.u16.accx.ld.ip q0,a2,16,q0,q6 # [0*II+23] id:848
.LBB257_dspi_dotprod_off_u16_aes3: # 0x349
j .Lt_0_33026 # [0]
// shift < 1: store the low 16 bits of the accumulator without rounding or
// shifting, return 0.
.Lt_0_35586: # 0x34c
movi.n a2,0 # [0]
sext a14,a9,15 # [1]
s16i a14,a4,0 # [2] id:875
retw.n # [3]
#endif // dspi_dotprod_aes3_enabled

View File

@@ -0,0 +1,49 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod.h"
/**
 * Reference (portable C) 2D dot product with offset for uint16 data.
 *
 * Computes sum over y < count_y, x < count_x of
 *     in_image[y][x] * (filter[y][x] + offset)
 * in a 64-bit accumulator, then rounds, shifts right by `shift` and stores the
 * low 16 bits of the result.
 *
 * @param in_image   input image descriptor (data, step_x/y, stride_x/y)
 * @param filter     filter image descriptor, same layout as in_image
 * @param out_value  receives the (truncated) result
 * @param count_x    number of elements to process per row
 * @param count_y    number of rows to process
 * @param shift      right shift applied to the accumulator; values <= 0 mean
 *                   "no rounding, no shift"
 * @param offset     value added to every filter element before multiplying
 *
 * @return ESP_OK on success, ESP_ERR_DSP_PARAM_OUTOFRANGE when
 *         step * count exceeds the corresponding stride of either image.
 */
esp_err_t dspi_dotprod_off_u16_ansi(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift, uint16_t offset)
{
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    uint16_t *i_data = (uint16_t *)in_image->data;
    uint16_t *f_data = (uint16_t *)filter->data;
    // Distance (in elements) between two processed rows of each image.
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;
    int64_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            acc += (int32_t)i_data[in_image->step_x * x] * ((int32_t)f_data[filter->step_x * x] + (int32_t)offset);
        }
        i_data += i_step;
        f_data += f_step;
    }
    // Round and shift only for a positive shift: 1 << (shift - 1) is undefined
    // for shift == 0. The rounding constant is built in 64 bits so that
    // shift values above 31 do not overflow a plain int.
    if (shift > 0) {
        acc += (int64_t)1 << (shift - 1); // round operation
        acc >>= shift;
    }
    *out_value = acc;
    return ESP_OK;
}

View File

@@ -0,0 +1,104 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_arp4_enabled == 1)
#include "dsp_err_codes.h"
.text
.align 4
.global dspi_dotprod_off_u16_arp4
.global dspi_dotprod_off_u16_ansi
.type dspi_dotprod_off_u16_arp4,@function
// esp_err_t dspi_dotprod_off_u16_arp4(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift, uint16_t offset);
//
// Vectorised uint16 dot product with offset. The fast path below is used only
// when (in_image->step_x | filter->step_x) == 1 and count_x is a multiple of 8;
// every other case tail-jumps to dspi_dotprod_off_u16_ansi with the arguments
// untouched.
// NOTE(review): the fast path does not repeat the step*count > stride range
// checks that the ANSI version performs - confirm callers validate the images.
dspi_dotprod_off_u16_arp4:
// in_image - a0
// filter - a1
// out_value - a2
// count_x - a3
// count_y - a4
// shift - a5
// offset - a6
// i_data - t0
// f_data - t1
// i_step - t2
// f_step - t3
// t4 - current i_data
// t5 - current f_data
// t6 - count_x / 8 (vector iterations per row)
lw t1, 4(a0) // load in_image->step_x
lw t2, 4(a1) // load filter->step_x
or t1, t1, t2
addi t1, t1, -1 // should be 0 now
andi t2, a3, 7 // count_x % 8
or t1, t1, t2
beqz t1, .dspi_dotprod_off_u16_arp4_body
j dspi_dotprod_off_u16_ansi // unsupported layout: let the C version handle it
.dspi_dotprod_off_u16_arp4_body:
// Spill offset to the stack so it can be loaded back as a vector (q2).
add sp, sp, -16
sw a6, 0(sp)
mv t6, sp
esp.vldbc.16.ip q2, t6, 0 // q2 = offset halfword broadcast to every lane
lw t0, 0(a0) // i_data
lw t1, 0(a1) // f_data
lw t2, 8(a0) // step_y
lw t4, 12(a0) // stride_x
mul t2, t4, t2
slli t2, t2, 1 // i_step = i_step<<1
lw t3, 8(a1) // step_y
lw t5, 12(a1) // stride_x
mul t3, t5, t3
slli t3, t3, 1 // f_step = f_step<<1
srli t6, a3, 3 // t6 = count_x/8
// Preload the accumulator with the rounding constant 1 << (shift - 1).
addi a7, a5, -1
li t4, 1
sll t4, t4, a7
esp.zero.xacc
esp.movx.w.xacc.l t4
.loop_count_y:
mv t4, t0
mv t5, t1
esp.vld.128.ip q1, t5, 16 // q1 - f_data
esp.lp.setup 0, t6, .loop_count_x // hardware loop 0: t6 passes, last instruction at .loop_count_x
esp.vld.128.ip q0, t4, 16 // q0 - i_data
// NOTE(review): the sum is formed in 16-bit lanes while the ANSI version adds in int32 - confirm overflow behaviour matches.
esp.vadd.u16 q3, q2, q1 // q3 = f_data + offset
.loop_count_x: esp.vmulas.u16.xacc.ld.ip q1, t5, 16, q0, q3 // xacc += q0 * q3; q1 - next f_data
add t0, t0, t2 // next input row
add t1, t1, t3 // next filter row
add a4,a4, -1
bgtz a4, .loop_count_y
esp.srs.u.xacc t5, a5 // shift accx register by final_shift amount (a5), save the lower 32bits to t5
sh t5, 0(a2) // store result to output buffer
li a0,0 // return 0, the value this routine reports on success
add sp,sp,16
ret
#endif // dspi_dotprod_arp4_enabled

View File

@@ -0,0 +1,407 @@
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_aes3_enabled == 1)
.text
.align 4
// Value returned when one of the range checks below fails
// (458755 = 0x70003; presumably ESP_ERR_DSP_PARAM_OUTOFRANGE - TODO confirm).
.literal .LC0_1_57, 458755
# Program Unit: dspi_dotprod_off_u8_aes3
// dspi_dotprod_off_u8_aes3: uint8 2D dot product with offset using the ee.*
// vector instructions. Compiler-generated code; it takes the same arguments as
// dspi_dotprod_off_u8_ansi and forwards them to that function whenever the
// vector path cannot be used.
//
// Registers after `entry`:
//   a2 - in_image    a3 - filter     a4 - out_value
//   a5 - count_x     a6 - count_y    a7 - shift
//   offset is the 7th argument and is read from the stack at a1+112.
// Image descriptor fields as accessed here: +0 data, +4 step_x, +8 step_y,
// +12 stride_x, +16 stride_y.
// Frame slots: a1+0.. temp_offset buffer (vector path only), a1+32
// offset_data_ptr, a1+48 spilled q0, a1+64/68/72 gra_spill_temp_0/1/2.
// The fallback paths reuse a1+0 as the outgoing stack argument for offset;
// they return right after the call and never touch temp_offset, so the two
// uses do not overlap in time.
.type dspi_dotprod_off_u8_aes3, @function
.align 4
.global dspi_dotprod_off_u8_aes3
dspi_dotprod_off_u8_aes3: # 0x4
.LBB1_dspi_dotprod_off_u8_aes3: # 0x4
entry a1,112 #
// Range checks: step * count must not exceed the stride, for both images and
// both axes; otherwise return .LC0_1_57.
l32i.n a10,a2,4 # [0] id:745
l32i.n a12,a2,12 # [1] id:744
mull a8,a10,a5 # [2]
blt a12,a8,.LBB86_dspi_dotprod_off_u8_aes3 # [4]
l32i.n a13,a2,8 # [0] id:746
l32i.n a9,a2,16 # [1] id:747
mull a11,a13,a6 # [2]
blt a9,a11,.LBB86_dspi_dotprod_off_u8_aes3 # [4]
l32i.n a15,a3,4 # [0] id:749
l32i.n a14,a3,12 # [1] id:748
mull a11,a15,a5 # [2]
blt a14,a11,.LBB86_dspi_dotprod_off_u8_aes3 # [4]
l32i.n a8,a3,16 # [0] id:751
l32i.n a9,a3,8 # [1] id:750
s32i a9,a1,72 # [2] gra_spill_temp_2
mull a9,a9,a6 # [3]
blt a8,a9,.LBB86_dspi_dotprod_off_u8_aes3 # [5]
// Fast-path conditions on the filter: data pointer has bit 0 clear,
// stride_x == step_x * count_x, step_x == 1 and step_y == 1.
l32i.n a8,a3,0 # [0] id:752
s32i a8,a1,68 # [1] gra_spill_temp_1
bbsi a8,0,.Lt_0_35330 # [2]
bne a14,a11,.Lt_0_35330 # [0]
bnei a15,1,.Lt_0_35330 # [0]
l32i a11,a1,72 # [0] gra_spill_temp_2
beqi a11,1,.Lt_0_18946 # [2]
// Fallback: forward all seven arguments to the ANSI *offset* implementation
// (a10..a15 are the callee's a2..a7 for call8; offset goes to a1+0).
.Lt_0_35330: # 0x46
.Lt_0_19202: # 0x46
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
l8ui a8,a1,112 # [6] offset+0x0
s32i.n a8,a1,0 # [7] outgoing 7th argument (offset)
.type dspi_dotprod_off_u8_ansi, @function
call8 dspi_dotprod_off_u8_ansi # [8] dspi_dotprod_off_u8_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// Range check failed: return the error literal.
.LBB86_dspi_dotprod_off_u8_aes3: # 0x59
l32r a2,.LC0_1_57 # [0]
retw.n # [1]
// Remaining fast-path conditions: in_image step_x == 1 and step_y == 1,
// count_x a multiple of 16, count_y >= 4.
.Lt_0_18946: # 0x5e
addi.n a14,a10,-1 # [0]
bnez a14,.Lt_0_36098 # [1]
addi.n a15,a13,-1 # [0]
bnez a15,.Lt_0_36098 # [1]
extui a8,a5,0,4 # [0]
bnez.n a8,.Lt_0_36098 # [1]
blti a6,4,.Lt_0_36098 # [0]
movi.n a9,64 # [0]
blt a9,a5,.LBB27_dspi_dotprod_off_u8_aes3 # [1]
// Vector path setup. a15 = in_image->data, a2 = filter->data,
// gra_spill_temp_0 = in_image stride_x * step_y (row distance in bytes).
.Lt_0_36610: # 0x75
.Lt_0_20994: # 0x75
l8ui a9,a1,112 # [0] id:754 offset+0x0
mov.n a8,a1 # [1]
l32i.n a15,a2,0 # [2] id:753
mull a10,a12,a13 # [3]
l32i a2,a1,68 # [4] gra_spill_temp_1
s32i a10,a1,64 # [5] gra_spill_temp_0
movi.n a10,4 # [6]
# loop-count fixed at 4
// Fill temp_offset (32 bytes at a1+0) with copies of offset.
loop a10,.LBB140_dspi_dotprod_off_u8_aes3 # [7]
.LBB135_dspi_dotprod_off_u8_aes3: # 0x8a
s8i a9,a8,0 # [0*II+0] id:755 temp_offset+0x0
s8i a9,a8,1 # [0*II+1] id:755 temp_offset+0x0
s8i a9,a8,2 # [0*II+2] id:755 temp_offset+0x0
s8i a9,a8,3 # [0*II+3] id:755 temp_offset+0x0
s8i a9,a8,4 # [0*II+4] id:755 temp_offset+0x0
s8i a9,a8,5 # [0*II+5] id:755 temp_offset+0x0
s8i a9,a8,6 # [0*II+6] id:755 temp_offset+0x0
s8i a9,a8,7 # [0*II+7] id:755 temp_offset+0x0
addi.n a8,a8,8 # [0*II+8]
.LBB140_dspi_dotprod_off_u8_aes3: # 0xa4
// a3 = row counter (count_y); clear the accumulator;
// q6 = vector of offset values loaded from temp_offset+8.
mov.n a3,a6 # [0]
addi a11,a5,-48 # [1]
addi.n a12,a1,8 # [3] temp_offset+8
movi.n a13,0 # [4]
wur.accx_0 a13 # [5]
wur.accx_1 a13 # [6]
ee.vld.128.ip q6,a12,0 # [7] id:756
s32i.n a12,a1,32 # [8] offset_data_ptr
// Dispatch on count_x: 48, 32, 16, 64 and 128 have dedicated loops,
// anything above 128 uses the generic loop further down.
beqz a11,.LBB34_dspi_dotprod_off_u8_aes3 # [9]
// Not 48: preload the first filter vector and keep it in the q0 spill slot.
l32i a2,a1,68 # [0] gra_spill_temp_1
ee.vld.128.ip q0,a2,16 # [2] id:771
st.qr q0,a1,48 # [3] q0
.Lt_0_24578: # 0xc3
addi a14,a5,-32 # [0]
beqz a14,.LBB43_dspi_dotprod_off_u8_aes3 # [1]
.Lt_0_26626: # 0xc9
.Lt_0_26114: # 0xc9
addi a8,a5,-16 # [0]
beqz a8,.LBB50_dspi_dotprod_off_u8_aes3 # [1]
.Lt_0_28162: # 0xcf
.Lt_0_27650: # 0xcf
addi a9,a5,-64 # [0]
beqz a9,.LBB57_dspi_dotprod_off_u8_aes3 # [1]
.Lt_0_29698: # 0xd5
.Lt_0_29186: # 0xd5
addi a10,a5,-128 # [0]
beqz a10,.LBB64_dspi_dotprod_off_u8_aes3 # [1]
movi a11,128 # [0]
bge a11,a5,.Lt_0_32514 # [1]
// Generic path, count_x > 128. a12 counts processed rows up to a3;
// a13 = count_x / 32 - 1 inner iterations; a14 = row advance for a15.
movi.n a12,0 # [0]
ee.ld.128.usar.ip q1,a15,16 # [1] id:833
ee.ld.128.usar.ip q2,a15,16 # [2] id:834
ee.src.q.ld.ip q3,a15,16,q1,q2 # [4] id:835
beqz.n a3,.Lt_0_32514 # [5]
ld.qr q0,a1,48 # [0] q0
l32i a14,a1,64 # [1] gra_spill_temp_0
addi a13,a5,31 # [2]
movgez a13,a5,a5 # [3]
srai a13,a13,5 # [4]
sub a14,a14,a5 # [5]
addi a14,a14,16 # [6]
addi.n a13,a13,-1 # [7]
.Lt_0_33282: # 0x105
beqz.n a13,.Lt_0_33538 # [0]
loopnez a13,.LBB277_dspi_dotprod_off_u8_aes3 # [0]
// Each instruction pair accumulates one product against the filter vector and
// one against the offset vector q6, while loading the next operands.
.LBB275_dspi_dotprod_off_u8_aes3: # 0x10a
ee.vmulas.u8.accx.ld.ip.qup q0,a15,16,q0,q1,q2,q3 # [0*II+0] id:836
ee.vmulas.u8.accx.ld.ip q1,a2,16,q1,q6 # [0*II+1] id:837
ee.vmulas.u8.accx.ld.ip.qup q1,a15,16,q1,q2,q3,q0 # [0*II+3] id:838
ee.vmulas.u8.accx.ld.ip q4,a2,16,q2,q6 # [0*II+4] id:839
ee.vmulas.u8.accx.ld.ip.qup q2,a15,16,q4,q3,q0,q1 # [0*II+6] id:840
ee.vmulas.u8.accx.ld.ip q4,a2,16,q3,q6 # [0*II+7] id:841
ee.vmulas.u8.accx.ld.ip.qup q3,a15,16,q4,q0,q1,q2 # [0*II+9] id:842
ee.vmulas.u8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+10] id:843
.LBB277_dspi_dotprod_off_u8_aes3: # 0x12a
// Row tail: same pattern, but the image pointer a15 is advanced with a14 to
// move to the next row.
.Lt_0_33538: # 0x12a
ee.vmulas.u8.accx.ld.ip.qup q4,a15,16,q0,q1,q2,q3 # [0] id:844
ee.vmulas.u8.accx.ld.ip q1,a2,16,q1,q6 # [1] id:845
movi.n a8,32 # [2]
ee.vmulas.u8.accx.ld.xp.qup q0,a15,a14,q1,q2,q3,q4 # [3] id:846
ee.vmulas.u8.accx.ld.ip q7,a2,16,q2,q6 # [4] id:847
movi.n a9,-16 # [5]
ee.vmulas.u8.accx.ld.xp.qup q2,a15,a9,q7,q3,q4,q0 # [6] id:848
ee.vmulas.u8.accx.ld.ip q5,a2,16,q3,q6 # [7] id:850
ee.ld.128.usar.xp q1,a15,a8 # [8] id:849
addi.n a12,a12,1 # [9]
ee.vmulas.u8.accx.ld.ip.qup q3,a15,16,q5,q4,q1,q2 # [10] id:851
ee.vmulas.u8.accx.ld.ip q0,a2,16,q4,q6 # [11] id:852
bne a12,a3,.Lt_0_33282 # [12]
// Finalisation: take the low 32 bits of the accumulator, add
// 1 << (shift - 1), shift right arithmetically by shift, store one byte.
.Lt_0_32514: # 0x156
.Lt_0_32258: # 0x156
movi.n a2,0 # [0]
rur.accx_0 a10 # [1]
addi.n a12,a7,-1 # [2]
movi.n a11,1 # [3]
ssl a12 # [4]
sll a11,a11 # [5]
ssr a7 # [6]
add.n a10,a10,a11 # [7]
sra a10,a10 # [8]
s8i a10,a4,0 # [9] id:854
retw.n # [10]
// Fallback (in_image layout, count_x or count_y not suitable): call the ANSI
// offset implementation with the original arguments, including offset.
.Lt_0_36098: # 0x172
.Lt_0_20226: # 0x172
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
l8ui a8,a1,112 # [6] offset+0x0
s32i.n a8,a1,0 # [7] outgoing 7th argument (offset)
call8 dspi_dotprod_off_u8_ansi # [8] dspi_dotprod_off_u8_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// count_x > 64: continue on the vector path when bit 0 of count_x is clear,
// otherwise fall back to the ANSI offset implementation.
.LBB27_dspi_dotprod_off_u8_aes3: # 0x185
extui a14,a5,0,1 # [0]
beqz a14,.Lt_0_36610 # [1]
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
l8ui a8,a1,112 # [6] offset+0x0
s32i.n a8,a1,0 # [7] outgoing 7th argument (offset)
call8 dspi_dotprod_off_u8_ansi # [8] dspi_dotprod_off_u8_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// count_x == 48: one loop pass per row, a6 (count_y) passes.
.LBB34_dspi_dotprod_off_u8_aes3: # 0x19e
ee.ld.128.usar.ip q0,a15,16 # [0] id:760
ee.ld.128.usar.ip q2,a15,16 # [1] id:761
ee.src.q.ld.ip q3,a15,16,q0,q2 # [3] id:762
beqz.n a6,.Lt_0_24578 # [4]
movi.n a10,32 # [0]
l32i a12,a1,64 # [1] gra_spill_temp_0
movi.n a11,-16 # [2]
addi a12,a12,-32 # [3]
loopgtz a6,.LBB163_dspi_dotprod_off_u8_aes3 # [4]
.LBB161_dspi_dotprod_off_u8_aes3: # 0x1b6
ee.vmulas.u8.accx.ld.ip q1,a2,16,q0,q6 # [0*II+0] id:763
ee.vmulas.u8.accx.ld.xp.qup q1,a15,a12,q1,q0,q2,q3 # [0*II+2] id:764
ee.vmulas.u8.accx.ld.ip q0,a2,16,q2,q6 # [0*II+3] id:765
ee.vmulas.u8.accx.ld.xp.qup q2,a15,a11,q0,q2,q3,q1 # [0*II+5] id:766
ee.vmulas.u8.accx.ld.ip q1,a2,16,q3,q6 # [0*II+6] id:768
ee.ld.128.usar.xp q0,a15,a10 # [0*II+7] id:767
ee.vmulas.u8.accx.ld.ip.qup q3,a15,16,q1,q3,q0,q2 # [0*II+9] id:769
.LBB163_dspi_dotprod_off_u8_aes3: # 0x1d1
st.qr q1,a1,48 # [0] q0
j .Lt_0_24578 # [1]
// count_x == 32: a3 = count_y / 2 loop passes.
.LBB43_dspi_dotprod_off_u8_aes3: # 0x1d7
srli a3,a6,1 # [0]
l32i a12,a1,64 # [1] gra_spill_temp_0
ee.ld.128.usar.ip q1,a15,16 # [2] id:772
ee.ld.128.usar.ip q2,a15,16 # [3] id:773
addi a12,a12,-16 # [5]
ee.src.q.ld.xp q3,a15,a12,q1,q2 # [6] id:774
beqz.n a3,.Lt_0_26626 # [7]
ld.qr q0,a1,48 # [0] q0
movi.n a10,32 # [1]
movi.n a11,-16 # [2]
loopnez a3,.LBB186_dspi_dotprod_off_u8_aes3 # [3]
.LBB184_dspi_dotprod_off_u8_aes3: # 0x1f5
ee.vmulas.u8.accx.ld.xp.qup q0,a15,a11,q0,q1,q2,q3 # [0*II+0] id:775
ee.vmulas.u8.accx.ld.ip q3,a2,16,q1,q6 # [0*II+1] id:776
ee.ld.128.usar.xp q1,a15,a10 # [0*II+2] id:777
ee.vmulas.u8.accx.ld.xp.qup q3,a15,a12,q3,q2,q1,q0 # [0*II+4] id:778
ee.vmulas.u8.accx.ld.ip q4,a2,16,q2,q6 # [0*II+5] id:779
ee.vmulas.u8.accx.ld.xp.qup q2,a15,a11,q4,q1,q0,q3 # [0*II+7] id:780
ee.vmulas.u8.accx.ld.ip q3,a2,16,q1,q6 # [0*II+8] id:781
ee.ld.128.usar.xp q1,a15,a10 # [0*II+9] id:782
ee.vmulas.u8.accx.ld.xp.qup q3,a15,a12,q3,q0,q1,q2 # [0*II+11] id:783
ee.vmulas.u8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+12] id:784
.LBB186_dspi_dotprod_off_u8_aes3: # 0x21b
st.qr q0,a1,48 # [0] q0
j .Lt_0_26626 # [1]
// count_x == 16: a3 = a3 / 4 loop passes.
.LBB50_dspi_dotprod_off_u8_aes3: # 0x221
srli a3,a3,2 # [0]
movi.n a13,-16 # [1]
l32i a11,a1,64 # [2] gra_spill_temp_0
addi a15,a15,16 # [3]
addi a11,a11,16 # [4]
ee.ld.128.usar.xp q2,a15,a13 # [5] id:785
ee.ld.128.usar.xp q1,a15,a11 # [6] id:786
ee.src.q.ld.xp q3,a15,a13,q1,q2 # [8] id:787
ee.ld.128.usar.xp q2,a15,a11 # [9] id:788
beqz.n a3,.Lt_0_28162 # [10]
ld.qr q0,a1,48 # [0] q0
movi.n a10,-16 # [1]
loopnez a3,.LBB209_dspi_dotprod_off_u8_aes3 # [2]
.LBB207_dspi_dotprod_off_u8_aes3: # 0x245
ee.vmulas.u8.accx.ld.xp.qup q3,a15,a10,q0,q1,q2,q3 # [0*II+0] id:789
ee.vmulas.u8.accx.ld.ip q0,a2,16,q1,q6 # [0*II+1] id:790
ee.ld.128.usar.xp q1,a15,a11 # [0*II+2] id:791
ee.vmulas.u8.accx.ld.xp.qup q3,a15,a10,q0,q2,q1,q3 # [0*II+4] id:792
ee.vmulas.u8.accx.ld.ip q0,a2,16,q2,q6 # [0*II+5] id:793
ee.ld.128.usar.xp q4,a15,a11 # [0*II+6] id:794
ee.vmulas.u8.accx.ld.xp.qup q3,a15,a10,q0,q1,q4,q3 # [0*II+8] id:795
ee.vmulas.u8.accx.ld.ip q0,a2,16,q1,q6 # [0*II+9] id:796
ee.ld.128.usar.xp q1,a15,a11 # [0*II+10] id:797
ee.vmulas.u8.accx.ld.xp.qup q3,a15,a10,q0,q4,q1,q3 # [0*II+12] id:798
ee.vmulas.u8.accx.ld.ip q0,a2,16,q4,q6 # [0*II+13] id:799
ee.ld.128.usar.xp q2,a15,a11 # [0*II+14] id:800
.LBB209_dspi_dotprod_off_u8_aes3: # 0x271
st.qr q0,a1,48 # [0] q0
j .Lt_0_28162 # [1]
// count_x == 64: a3 loop passes; a12 = row advance for a15.
.LBB57_dspi_dotprod_off_u8_aes3: # 0x277
ee.ld.128.usar.ip q1,a15,16 # [0] id:801
ee.ld.128.usar.ip q2,a15,16 # [1] id:802
ee.src.q.ld.ip q3,a15,16,q1,q2 # [3] id:803
beqz.n a3,.Lt_0_29698 # [4]
ld.qr q0,a1,48 # [0] q0
movi.n a10,32 # [1]
l32i a12,a1,64 # [2] gra_spill_temp_0
movi.n a11,-16 # [3]
sub a12,a12,a5 # [4]
addi a12,a12,16 # [5]
loopnez a3,.LBB232_dspi_dotprod_off_u8_aes3 # [6]
.LBB230_dspi_dotprod_off_u8_aes3: # 0x295
ee.vmulas.u8.accx.ld.ip.qup q0,a15,16,q0,q1,q2,q3 # [0*II+0] id:804
ee.vmulas.u8.accx.ld.ip q4,a2,16,q1,q6 # [0*II+1] id:805
ee.vmulas.u8.accx.ld.xp.qup q4,a15,a12,q4,q2,q3,q0 # [0*II+3] id:806
ee.vmulas.u8.accx.ld.ip q1,a2,16,q2,q6 # [0*II+4] id:807
ee.vmulas.u8.accx.ld.xp.qup q2,a15,a11,q1,q3,q0,q4 # [0*II+6] id:808
ee.vmulas.u8.accx.ld.ip q4,a2,16,q3,q6 # [0*II+7] id:809
ee.ld.128.usar.xp q1,a15,a10 # [0*II+8] id:810
ee.vmulas.u8.accx.ld.ip.qup q3,a15,16,q4,q0,q1,q2 # [0*II+10] id:811
ee.vmulas.u8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+11] id:812
.LBB232_dspi_dotprod_off_u8_aes3: # 0x2b8
st.qr q0,a1,48 # [0] q0
j .Lt_0_29698 # [1]
// count_x == 128: a3 loop passes using a8 as the image pointer, followed by
// an inlined copy of the finalisation sequence.
.LBB64_dspi_dotprod_off_u8_aes3: # 0x2be
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
l32i a12,a1,64 # [2] gra_spill_temp_0
ee.ld.128.usar.ip q1,a15,16 # [3] id:813
ee.ld.128.usar.ip q2,a15,16 # [4] id:814
sub a12,a12,a5 # [6]
addi a12,a12,16 # [7]
ld.qr q0,a1,48 # [8] q0
ee.src.q.ld.ip q3,a15,16,q1,q2 # [9] id:815
mov.n a8,a15 # [10]
loopnez a3,.LBB254_dspi_dotprod_off_u8_aes3 # [11]
.LBB252_dspi_dotprod_off_u8_aes3: # 0x2dc
ee.vmulas.u8.accx.ld.ip.qup q0,a8,16,q0,q1,q2,q3 # [0*II+0] id:816
ee.vmulas.u8.accx.ld.ip q4,a2,16,q1,q6 # [0*II+1] id:817
ee.vmulas.u8.accx.ld.ip.qup q4,a8,16,q4,q2,q3,q0 # [0*II+3] id:818
ee.vmulas.u8.accx.ld.ip q1,a2,16,q2,q6 # [0*II+4] id:819
ee.vmulas.u8.accx.ld.ip.qup q1,a8,16,q1,q3,q0,q4 # [0*II+6] id:820
ee.vmulas.u8.accx.ld.ip q5,a2,16,q3,q6 # [0*II+7] id:821
ee.vmulas.u8.accx.ld.ip.qup q5,a8,16,q5,q0,q4,q1 # [0*II+9] id:822
ee.vmulas.u8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+10] id:823
ee.vmulas.u8.accx.ld.ip.qup q0,a8,16,q0,q4,q1,q5 # [0*II+12] id:824
ee.vmulas.u8.accx.ld.ip q4,a2,16,q4,q6 # [0*II+13] id:825
ee.vmulas.u8.accx.ld.xp.qup q4,a8,a12,q4,q1,q5,q0 # [0*II+15] id:826
ee.vmulas.u8.accx.ld.ip q1,a2,16,q1,q6 # [0*II+16] id:827
ee.vmulas.u8.accx.ld.xp.qup q2,a8,a11,q1,q5,q0,q4 # [0*II+18] id:828
ee.vmulas.u8.accx.ld.ip q4,a2,16,q5,q6 # [0*II+19] id:829
ee.ld.128.usar.xp q1,a8,a10 # [0*II+20] id:830
ee.vmulas.u8.accx.ld.ip.qup q3,a8,16,q4,q0,q1,q2 # [0*II+22] id:831
ee.vmulas.u8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+23] id:832
.LBB254_dspi_dotprod_off_u8_aes3: # 0x31f
movi.n a2,0 # [0]
movi.n a11,1 # [1]
addi.n a12,a7,-1 # [2]
rur.accx_0 a10 # [3]
ssl a12 # [4]
sll a11,a11 # [5]
ssr a7 # [6]
add.n a10,a10,a11 # [7]
sra a10,a10 # [8]
s8i a10,a4,0 # [9] id:854
retw.n # [10]
#endif // dspi_dotprod_aes3_enabled

View File

@@ -0,0 +1,49 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod.h"
/**
 * Portable reference implementation: dot product of a 2-D unsigned 8-bit
 * image window with a filter whose every coefficient has `offset` added.
 *
 * Accumulates, over y < count_y and x < count_x,
 *     in[y][x] * (filter[y][x] + offset)
 * then rounds to nearest by adding 2^(shift-1), shifts right by `shift` and
 * stores the result, truncated to 8 bits, in *out_value.
 *
 * @param in_image  input image descriptor (data, step_x/step_y, stride_x/stride_y)
 * @param filter    filter descriptor, same layout as in_image
 * @param out_value destination of the scaled result
 * @param count_x   number of elements processed per row
 * @param count_y   number of rows processed
 * @param shift     right shift applied to the accumulator; for shift <= 0 the
 *                  accumulator is stored without rounding or shifting
 * @param offset    constant added to each filter coefficient before multiplying
 *
 * @return ESP_OK on success, ESP_ERR_DSP_PARAM_OUTOFRANGE when step * count
 *         exceeds the matching stride of the image or of the filter.
 */
esp_err_t dspi_dotprod_off_u8_ansi(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift, uint8_t offset)
{
    // The requested window must fit inside both operands.
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    uint8_t *i_data = (uint8_t *)in_image->data;
    uint8_t *f_data = (uint8_t *)filter->data;

    // Distance, in elements, between two consecutive processed rows.
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;

    int32_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            acc += (int16_t)i_data[in_image->step_x * x] * ((int16_t)f_data[filter->step_x * x] + (int16_t)offset);
        }
        i_data += i_step;
        f_data += f_step;
    }

    // Round to nearest and scale. Shifting by (shift - 1) or by shift is
    // undefined for shift <= 0, so in that case the accumulator is kept as is.
    if (shift > 0) {
        acc += (int32_t)1 << (shift - 1); // round operation
        acc >>= shift;
    }
    *out_value = acc;
    return ESP_OK;
}

View File

@@ -0,0 +1,102 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_arp4_enabled == 1)
#include "dsp_err_codes.h"
.text
.align 4
.global dspi_dotprod_off_u8_arp4
.global dspi_dotprod_off_u8_ansi
.type dspi_dotprod_off_u8_arp4,@function
// esp_err_t dspi_dotprod_off_u8_arp4(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift, uint8_t offset);
//
// Vectorised dot product of an unsigned 8-bit image with (filter + offset).
// The vector path is taken only when (in_image->step_x | filter->step_x) == 1
// and count_x is a multiple of 16; every other case tail-jumps to
// dspi_dotprod_off_u8_ansi with the argument registers untouched.
// Returns 0 in a0 on the vector path.
dspi_dotprod_off_u8_arp4:
// in_image  - a0
// filter    - a1
// out_value - a2 (uint8_t *, same argument the ansi fallback receives)
// count_x   - a3
// count_y   - a4
// shift     - a5
// offset    - a6
// i_data    - t0 (start of the current image row)
// f_data    - t1 (start of the current filter row)
// i_step    - t2 (stride_x * step_y of the image)
// f_step    - t3 (stride_x * step_y of the filter)
// t4        - current i_data
// t5        - current f_data
// t6        - inner loop count (count_x / 16)
// a7        - scratch (shift - 1)
    lw      t1, 4(a0)               // load in_image->step_x
    lw      t2, 4(a1)               // load filter->step_x
    or      t1, t1, t2
    addi    t1, t1, -1              // should be 0 now
    andi    t2, a3, 15              // count_x must be a multiple of 16
    or      t1, t1, t2
    beqz    t1, .dspi_dotprod_off_u8_arp4_body
    j       dspi_dotprod_off_u8_ansi
.dspi_dotprod_off_u8_arp4_body:
    add     sp, sp, -16
    sw      a6, 0(sp)               // spill offset so it can be loaded as a vector
    mv      t6, sp
    esp.vldbc.8.ip q2, t6, 0        // q2 = offset (presumably broadcast to every byte lane)
    lw      t0, 0(a0)               // i_data
    lw      t1, 0(a1)               // f_data
    lw      t2, 8(a0)               // step_y
    lw      t4, 12(a0)              // stride_x
    mul     t2, t4, t2              // i_step
    lw      t3, 8(a1)               // step_y
    lw      t5, 12(a1)              // stride_x
    mul     t3, t5, t3              // f_step
    srli    t6, a3, 4               // t6 = count_x/16
    addi    a7, a5, -1
    li      t4, 1
    sll     t4, t4, a7              // t4 = 1 << (shift - 1), the rounding term
    esp.zero.xacc
    esp.movx.w.xacc.l t4            // accumulator starts at the rounding term
.loop_count_y:
    mv      t4, t0
    mv      t5, t1
    esp.vld.128.ip q1, t5, 16       // q1 - f_data
    esp.lp.setup 0, t6, .loop_count_x
    esp.vld.128.ip q0, t4, 16       // q0 - i_data
    esp.vadd.u8 q3, q2, q1          // q3 = f_data + offset
.loop_count_x: esp.vmulas.u8.xacc.ld.ip q1, t5, 16, q0, q3 // xacc += q0 * q3, q1 - next f_data
    add     t0, t0, t2              // next image row
    add     t1, t1, t3              // next filter row
    add     a4,a4, -1
    bgtz    a4, .loop_count_y
    esp.srs.u.xacc t5, a5           // shift accx register by final_shift amount (a5), save the lower 32bits to t5
    sb      t5, 0(a2)               // out_value is uint8_t *: store one byte only
    li      a0,0
    add     sp,sp,16
    ret
#endif // dspi_dotprod_arp4_enabled

View File

@@ -0,0 +1,372 @@
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_aes3_enabled == 1)
.text
.align 4
// Value returned in a2 when one of the four range checks at function entry fails.
// NOTE(review): 458755 == 0x70003; presumably ESP_ERR_DSP_PARAM_OUTOFRANGE, which the
// C reference returns for the same checks - confirm against dsp_err_codes.h.
.literal .LC0_1_53, 458755
# Program Unit: dspi_dotprod_s16_aes3
//
// dspi_dotprod_s16_aes3 - 16-bit signed 2-D dot product using the ee.* vector
// instructions, with a fallback to dspi_dotprod_s16_ansi.
//
// Arguments arrive in a2..a7 and are forwarded unchanged (as a10..a15) to
// dspi_dotprod_s16_ansi on every fallback path, so they follow that signature:
//   a2 = in_image, a3 = filter, a4 = out_value, a5 = count_x, a6 = count_y, a7 = shift
// Descriptor words read here: +0 data, +4 step_x, +8 step_y, +12 stride_x, +16 stride_y
// (field names taken from the matching checks in the C reference).
//
// Result in a2: 0 on the vector path, the .LC0_1_53 literal on a range failure,
// otherwise whatever dspi_dotprod_s16_ansi returned.
//
// Stack slots: a1+16 = image row pitch in bytes, a1+20 = filter->data,
//              a1+24 = filter->step_y.
//
.type dspi_dotprod_s16_aes3, @function
.align 4
.global dspi_dotprod_s16_aes3
dspi_dotprod_s16_aes3: # 0x4
.LBB1_dspi_dotprod_s16_aes3: # 0x4
entry a1,64 #
// Range check 1: in_image->step_x * count_x must not exceed in_image->stride_x.
l32i.n a10,a2,4 # [0] id:678
l32i.n a11,a2,12 # [1] id:677
mull a8,a10,a5 # [2]
blt a11,a8,.LBB81_dspi_dotprod_s16_aes3 # [4]
// Range check 2: in_image->step_y * count_y must not exceed in_image->stride_y.
l32i.n a12,a2,8 # [0] id:679
l32i.n a9,a2,16 # [1] id:680
mull a13,a12,a6 # [2]
blt a9,a13,.LBB81_dspi_dotprod_s16_aes3 # [4]
// Range check 3: filter->step_x * count_x must not exceed filter->stride_x.
// The product stays in a13 for the "filter row is exactly count_x wide" test below.
l32i.n a15,a3,4 # [0] id:682
l32i.n a14,a3,12 # [1] id:681
mull a13,a15,a5 # [2]
blt a14,a13,.LBB81_dspi_dotprod_s16_aes3 # [4]
// Range check 4: filter->step_y * count_y must not exceed filter->stride_y.
l32i.n a8,a3,16 # [0] id:684
l32i.n a9,a3,8 # [1] id:683
s32i.n a9,a1,24 # [2] gra_spill_temp_2
mull a9,a9,a6 # [3]
blt a8,a9,.LBB81_dspi_dotprod_s16_aes3 # [5]
// Vector-path preconditions on the filter; any failure goes to the C reference:
//   bit 0 of filter->data clear, filter->stride_x == filter->step_x * count_x,
//   filter->step_x == 1 and filter->step_y == 1.
l32i.n a8,a3,0 # [0] id:685
s32i.n a8,a1,20 # [1] gra_spill_temp_1
bbsi a8,0,.Lt_0_34050 # [2]
bne a14,a13,.Lt_0_34050 # [0]
bnei a15,1,.Lt_0_34050 # [0]
l32i.n a9,a1,24 # [0] gra_spill_temp_2
beqi a9,1,.Lt_0_18178 # [2]
// Fallback: forward the six original arguments to the C reference and return its result.
.Lt_0_34050: # 0x43
.Lt_0_18434: # 0x43
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
.type dspi_dotprod_s16_ansi, @function
call8 dspi_dotprod_s16_ansi # [6] dspi_dotprod_s16_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// Range-check failure: return the error literal.
.LBB81_dspi_dotprod_s16_aes3: # 0x56
l32r a2,.LC0_1_53 # [0]
retw.n # [1]
// Vector-path preconditions on the image and the window size:
//   in_image->step_x == 1, in_image->step_y == 1, count_x multiple of 8, count_y >= 4.
.Lt_0_18178: # 0x5b
addi.n a13,a10,-1 # [0]
bnez a13,.Lt_0_34818 # [1]
addi.n a14,a12,-1 # [0]
bnez a14,.Lt_0_34818 # [1]
extui a15,a5,0,3 # [0]
bnez.n a15,.Lt_0_34818 # [1]
blti a6,4,.Lt_0_34818 # [0]
// For count_x > 32 an odd count_x would also go to the C reference; the low three
// bits of count_x are already known to be zero here, so this branch cannot be taken.
movi.n a8,32 # [0]
bge a8,a5,.Lt_0_35330 # [1]
extui a9,a5,0,1 # [0]
bnez a9,.LBB28_dspi_dotprod_s16_aes3 # [1]
// Common setup for every vector loop:
//   a3  = count_y (the filter pointer is no longer needed, its data pointer was spilled)
//   a2  = in_image->data, a15 = filter->data
//   a13 = in_image->stride_x * in_image->step_y * 2 = image row pitch in bytes (spilled to a1+16)
//   SAR_BYTE and ACCX are cleared, q0 is preloaded with the first 16 filter bytes.
.Lt_0_35330: # 0x78
.Lt_0_20226: # 0x78
mov.n a3,a6 # [0]
addi a10,a5,-24 # [1]
mull a13,a11,a12 # [2]
l32i.n a15,a1,20 # [3] gra_spill_temp_1
l32i.n a2,a2,0 # [4] id:686
movi.n a14,0 # [5]
wur.sar_byte a14 # [6]
wur.accx_0 a14 # [8]
wur.accx_1 a14 # [9]
ee.vld.128.ip q0,a15,16 # [10] id:690
slli a13,a13,1 # [11]
s32i.n a13,a1,16 # [12] gra_spill_temp_0
// Dispatch on count_x to a loop unrolled for that width: 24, 16, 8, 32, 64, then > 64.
// Each specialised loop jumps back into this chain behind its own test; count_x is
// unchanged, so the remaining equality tests fail and control reaches the
// "64 >= count_x" branch to the final scaling code.
beqz a10,.LBB32_dspi_dotprod_s16_aes3 # [13]
.Lt_0_23298: # 0x99
.Lt_0_22786: # 0x99
addi a8,a5,-16 # [0]
beqz a8,.LBB38_dspi_dotprod_s16_aes3 # [1]
.Lt_0_24834: # 0x9f
.Lt_0_24322: # 0x9f
addi a9,a5,-8 # [0]
beqz a9,.LBB44_dspi_dotprod_s16_aes3 # [1]
.Lt_0_26370: # 0xa5
.Lt_0_25858: # 0xa5
addi a10,a5,-32 # [0]
beqz a10,.LBB50_dspi_dotprod_s16_aes3 # [1]
.Lt_0_27906: # 0xab
.Lt_0_27394: # 0xab
addi a11,a5,-64 # [0]
beqz a11,.LBB56_dspi_dotprod_s16_aes3 # [1]
movi.n a12,64 # [0]
bge a12,a5,.Lt_0_30722 # [1]
// General path, count_x > 64. a12 counts finished rows up to a3 (count_y).
// q1..q3 are primed with the first image vectors before the row loop starts.
movi.n a12,0 # [0]
ee.ld.128.usar.ip q1,a2,16 # [1] id:762
ee.ld.128.usar.ip q2,a2,16 # [2] id:763
ee.src.q.ld.ip q3,a2,16,q1,q2 # [4] id:764
beqz.n a3,.Lt_0_30722 # [5]
//   a13 = count_x / 32 - 1  (hardware-loop trip count, 4 vector MACs per trip)
//   a14 = row pitch in bytes - 2 * count_x + 16  (pointer adjustment used at the row end)
// NOTE(review): one row executes (count_x / 32) * 4 vector MACs of 8 elements each, so a
// count_x that is a multiple of 8 but not of 32 does not appear to be fully covered on
// this path - confirm against the instruction semantics and the callers.
slli a8,a5,1 # [0]
l32i.n a14,a1,16 # [1] gra_spill_temp_0
addi a13,a5,31 # [2]
movgez a13,a5,a5 # [3]
srai a13,a13,5 # [4]
sub a14,a14,a8 # [5]
addi a14,a14,16 # [6]
addi.n a13,a13,-1 # [7]
// One image row per pass.
.Lt_0_31490: # 0xd9
addi.n a12,a12,1 # [0]
movi.n a9,32 # [1]
beqz.n a13,.Lt_0_31746 # [2]
loopnez a13,.LBB221_dspi_dotprod_s16_aes3 # [0]
// Steady state: each ee.vld.128.ip fetches 16 filter bytes through a15, each
// ee.vmulas.s16.accx.* accumulates into ACCX while loading the next image vector through a2.
.LBB219_dspi_dotprod_s16_aes3: # 0xe2
ee.vld.128.ip q5,a15,16 # [0*II+0] id:766
ee.vmulas.s16.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:765
ee.vld.128.ip q0,a15,16 # [0*II+2] id:768
ee.vmulas.s16.accx.ld.ip.qup q1,a2,16,q5,q2,q3,q4 # [0*II+3] id:767
ee.vld.128.ip q5,a15,16 # [0*II+4] id:770
ee.vmulas.s16.accx.ld.ip.qup q2,a2,16,q0,q3,q4,q1 # [0*II+5] id:769
ee.vld.128.ip q0,a15,16 # [0*II+6] id:772
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+7] id:771
// Row tail: the last four vector MACs of the row. The .xp forms post-modify a2 by
// a14, then -16 (a10), then +32 (a9).
// NOTE(review): presumably this moves a2 to the next image row and re-primes the
// unaligned-load state - exact pointer choreography not verified here.
.LBB221_dspi_dotprod_s16_aes3: # 0xfe
.Lt_0_31746: # 0xfe
ee.vmulas.s16.accx.ld.ip.qup q5,a2,16,q0,q1,q2,q3 # [0] id:773
movi.n a10,-16 # [1]
ee.vld.128.ip q0,a15,16 # [2] id:774
ee.vld.128.ip q6,a15,16 # [3] id:776
ee.vmulas.s16.accx.ld.xp.qup q7,a2,a14,q0,q2,q3,q5 # [4] id:775
ee.vld.128.ip q4,a15,16 # [5] id:779
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a10,q6,q3,q5,q7 # [6] id:777
ee.ld.128.usar.xp q1,a2,a9 # [7] id:778
ee.vld.128.ip q0,a15,16 # [8] id:781
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q4,q5,q1,q2 # [9] id:780
bne a12,a3,.Lt_0_31490 # [10]
// Final scaling. a9 = ACCX_0, a10 = ACCX_1 (used as low and high word of the accumulator).
// shift < 1: store the low 16 bits unshifted (see .Lt_0_33282).
// Otherwise: take the accumulator shifted right by (shift - 1) - via the funnel shift of
// a10:a9, or via the arithmetic shift of the high word when shift - 33 >= 0 - then add 1
// and shift right once more, i.e. round to nearest. The low 16 bits are stored to
// *out_value and 0 is returned.
.Lt_0_30722: # 0x122
.Lt_0_30466: # 0x122
rur.accx_0 a9 # [0]
rur.accx_1 a10 # [1]
blti a7,1,.Lt_0_33282 # [2]
movi.n a2,0 # [0]
addi a13,a7,-33 # [1]
addi.n a14,a7,-1 # [2]
ssr a14 # [3]
sra a12,a10 # [4]
src a11,a10,a9 # [5]
movgez a11,a12,a13 # [6]
addi.n a11,a11,1 # [7]
srai a11,a11,1 # [8]
s16i a11,a4,0 # [9] id:787
retw.n # [10]
// Fallback to the C reference (image/window preconditions not met).
.Lt_0_34818: # 0x148
.Lt_0_19458: # 0x148
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
call8 dspi_dotprod_s16_ansi # [6] dspi_dotprod_s16_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// count_x == 24: three vector MACs per row, loop runs count_y (a6) times.
// a12 = row pitch in bytes - 32.
.LBB32_dspi_dotprod_s16_aes3: # 0x15b
ee.ld.128.usar.ip q1,a2,16 # [0] id:691
ee.ld.128.usar.ip q2,a2,16 # [1] id:692
ee.src.q.ld.ip q3,a2,16,q1,q2 # [3] id:693
beqz.n a6,.Lt_0_23298 # [4]
addi a12,a13,-32 # [0]
movi.n a10,32 # [1]
movi.n a11,-16 # [2]
loopgtz a6,.LBB107_dspi_dotprod_s16_aes3 # [3]
.LBB105_dspi_dotprod_s16_aes3: # 0x170
ee.vld.128.ip q4,a15,16 # [0*II+0] id:695
ee.vmulas.s16.accx.ld.xp.qup q1,a2,a12,q0,q1,q2,q3 # [0*II+1] id:694
ee.vld.128.ip q5,a15,16 # [0*II+2] id:697
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q4,q2,q3,q1 # [0*II+3] id:696
ee.ld.128.usar.xp q1,a2,a10 # [0*II+4] id:698
ee.vld.128.ip q0,a15,16 # [0*II+5] id:700
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q5,q3,q1,q2 # [0*II+6] id:699
.LBB107_dspi_dotprod_s16_aes3: # 0x188
j .Lt_0_23298 # [0]
// count_x == 16: two vector MACs per row, two rows per loop trip (a3 = count_y / 2).
// a12 = row pitch in bytes - 16.
.LBB38_dspi_dotprod_s16_aes3: # 0x18b
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
srli a3,a6,1 # [2]
l32i.n a12,a1,16 # [3] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [4] id:701
ee.ld.128.usar.ip q2,a2,16 # [5] id:702
addi a12,a12,-16 # [7]
ee.src.q.ld.xp q3,a2,a12,q1,q2 # [8] id:703
loopnez a3,.LBB130_dspi_dotprod_s16_aes3 # [9]
.LBB128_dspi_dotprod_s16_aes3: # 0x1a3
ee.vld.128.ip q4,a15,16 # [0*II+0] id:705
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a11,q0,q1,q2,q3 # [0*II+1] id:704
ee.ld.128.usar.xp q1,a2,a10 # [0*II+2] id:706
ee.vld.128.ip q0,a15,16 # [0*II+3] id:708
ee.vmulas.s16.accx.ld.xp.qup q4,a2,a12,q4,q2,q1,q3 # [0*II+4] id:707
ee.vld.128.ip q5,a15,16 # [0*II+5] id:710
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q0,q1,q3,q4 # [0*II+6] id:709
ee.ld.128.usar.xp q1,a2,a10 # [0*II+7] id:711
ee.vld.128.ip q0,a15,16 # [0*II+8] id:713
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a12,q5,q3,q1,q2 # [0*II+9] id:712
.LBB130_dspi_dotprod_s16_aes3: # 0x1c5
j .Lt_0_24834 # [0]
// count_x == 8: one vector MAC per row, four rows per loop trip (a3 = count_y / 4).
// The image is walked through a8 (starting at data + 16) with post-modifiers
// -16 (a10) and row pitch in bytes + 16 (a11); a8 is copied back to a2 afterwards.
.LBB44_dspi_dotprod_s16_aes3: # 0x1c8
srli a3,a3,2 # [0]
movi.n a10,-16 # [1]
l32i.n a11,a1,16 # [2] gra_spill_temp_0
addi a8,a2,16 # [3]
addi a11,a11,16 # [4]
ee.ld.128.usar.xp q2,a8,a10 # [5] id:714
ee.ld.128.usar.xp q1,a8,a11 # [6] id:715
ee.src.q.ld.xp q3,a8,a10,q1,q2 # [8] id:716
ee.ld.128.usar.xp q2,a8,a11 # [9] id:717
loopnez a3,.LBB153_dspi_dotprod_s16_aes3 # [10]
.LBB151_dspi_dotprod_s16_aes3: # 0x1e4
ee.vld.128.ip q4,a15,16 # [0*II+0] id:719
ee.vmulas.s16.accx.ld.xp.qup q3,a8,a10,q0,q1,q2,q3 # [0*II+1] id:718
ee.ld.128.usar.xp q1,a8,a11 # [0*II+2] id:720
ee.vld.128.ip q0,a15,16 # [0*II+3] id:722
ee.vmulas.s16.accx.ld.xp.qup q4,a8,a10,q4,q2,q1,q3 # [0*II+4] id:721
ee.ld.128.usar.xp q3,a8,a11 # [0*II+5] id:723
ee.vld.128.ip q5,a15,16 # [0*II+6] id:725
ee.vmulas.s16.accx.ld.xp.qup q4,a8,a10,q0,q1,q3,q4 # [0*II+7] id:724
ee.ld.128.usar.xp q1,a8,a11 # [0*II+8] id:726
ee.vld.128.ip q0,a15,16 # [0*II+9] id:728
ee.vmulas.s16.accx.ld.xp.qup q3,a8,a10,q5,q3,q1,q4 # [0*II+10] id:727
ee.ld.128.usar.xp q2,a8,a11 # [0*II+11] id:729
.LBB153_dspi_dotprod_s16_aes3: # 0x20c
mov.n a2,a8 # [0]
j .Lt_0_26370 # [1]
// count_x == 32: four vector MACs per row, loop runs a3 (count_y) times.
// a12 = row pitch in bytes - 2 * count_x + 16.
.LBB50_dspi_dotprod_s16_aes3: # 0x211
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
slli a13,a5,1 # [2]
l32i.n a12,a1,16 # [3] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [4] id:730
ee.ld.128.usar.ip q2,a2,16 # [5] id:731
sub a12,a12,a13 # [6]
ee.src.q.ld.ip q3,a2,16,q1,q2 # [8] id:732
addi a12,a12,16 # [9]
loopnez a3,.LBB176_dspi_dotprod_s16_aes3 # [10]
.LBB174_dspi_dotprod_s16_aes3: # 0x22c
ee.vld.128.ip q5,a15,16 # [0*II+0] id:734
ee.vmulas.s16.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:733
ee.vld.128.ip q1,a15,16 # [0*II+2] id:736
ee.vmulas.s16.accx.ld.xp.qup q0,a2,a12,q5,q2,q3,q4 # [0*II+3] id:735
ee.vld.128.ip q5,a15,16 # [0*II+4] id:739
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q1,q3,q4,q0 # [0*II+5] id:737
ee.ld.128.usar.xp q1,a2,a10 # [0*II+6] id:738
ee.vld.128.ip q0,a15,16 # [0*II+7] id:741
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+8] id:740
.LBB176_dspi_dotprod_s16_aes3: # 0x24b
j .Lt_0_27906 # [0]
// count_x == 64: eight vector MACs per row, loop runs a3 (count_y) times, then
// control goes straight to the final scaling code.
// a12 = row pitch in bytes - 2 * count_x + 16.
.LBB56_dspi_dotprod_s16_aes3: # 0x24e
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
slli a13,a5,1 # [2]
l32i.n a12,a1,16 # [3] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [4] id:742
ee.ld.128.usar.ip q2,a2,16 # [5] id:743
sub a12,a12,a13 # [7]
addi a12,a12,16 # [8]
ee.src.q.ld.ip q3,a2,16,q1,q2 # [9] id:744
loopnez a3,.LBB198_dspi_dotprod_s16_aes3 # [10]
.LBB196_dspi_dotprod_s16_aes3: # 0x269
ee.vld.128.ip q4,a15,16 # [0*II+0] id:746
ee.vmulas.s16.accx.ld.ip.qup q1,a2,16,q0,q1,q2,q3 # [0*II+1] id:745
ee.vld.128.ip q0,a15,16 # [0*II+2] id:748
ee.vmulas.s16.accx.ld.ip.qup q4,a2,16,q4,q2,q3,q1 # [0*II+3] id:747
ee.vld.128.ip q5,a15,16 # [0*II+4] id:750
ee.vmulas.s16.accx.ld.ip.qup q0,a2,16,q0,q3,q1,q4 # [0*II+5] id:749
ee.vld.128.ip q6,a15,16 # [0*II+6] id:752
ee.vmulas.s16.accx.ld.ip.qup q1,a2,16,q5,q1,q4,q0 # [0*II+7] id:751
ee.vld.128.ip q5,a15,16 # [0*II+8] id:754
ee.vmulas.s16.accx.ld.ip.qup q4,a2,16,q6,q4,q0,q1 # [0*II+9] id:753
ee.vld.128.ip q6,a15,16 # [0*II+10] id:756
ee.vmulas.s16.accx.ld.xp.qup q0,a2,a12,q5,q0,q1,q4 # [0*II+11] id:755
ee.vld.128.ip q5,a15,16 # [0*II+12] id:759
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q6,q1,q4,q0 # [0*II+13] id:757
ee.ld.128.usar.xp q1,a2,a10 # [0*II+14] id:758
ee.vld.128.ip q0,a15,16 # [0*II+15] id:761
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+16] id:760
.LBB198_dspi_dotprod_s16_aes3: # 0x2a4
j .Lt_0_30722 # [0]
// shift < 1: no rounding and no shift; the sign-extended low 16 bits of ACCX_0 are
// stored to *out_value and 0 is returned.
.Lt_0_33282: # 0x2a7
movi.n a2,0 # [0]
sext a14,a9,15 # [1]
s16i a14,a4,0 # [2] id:788
retw.n # [3]
// Fallback to the C reference for count_x > 32 with bit 0 set (see the note at the
// branch that targets this label).
.LBB28_dspi_dotprod_s16_aes3: # 0x2b1
mov.n a15,a7 # [0]
mov.n a14,a6 # [1]
mov.n a13,a5 # [2]
mov.n a12,a4 # [3]
mov.n a11,a3 # [4]
mov.n a10,a2 # [5]
call8 dspi_dotprod_s16_ansi # [6] dspi_dotprod_s16_ansi
mov.n a2,a10 # [0]
retw.n # [1]
#endif // dspi_dotprod_aes3_enabled

View File

@@ -0,0 +1,49 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod.h"
/**
 * Portable reference implementation: dot product of a 2-D signed 16-bit image
 * window with a signed 16-bit filter.
 *
 * Accumulates in[y][x] * filter[y][x] over y < count_y and x < count_x in a
 * 64-bit accumulator, rounds to nearest by adding 2^(shift-1), shifts right by
 * `shift` and stores the result, truncated to 16 bits, in *out_value.
 *
 * @param in_image  input image descriptor (data, step_x/step_y, stride_x/stride_y)
 * @param filter    filter descriptor, same layout as in_image
 * @param out_value destination of the scaled result
 * @param count_x   number of elements processed per row
 * @param count_y   number of rows processed
 * @param shift     right shift applied to the accumulator; for shift <= 0 the
 *                  accumulator is stored without rounding or shifting
 *
 * @return ESP_OK on success, ESP_ERR_DSP_PARAM_OUTOFRANGE when step * count
 *         exceeds the matching stride of the image or of the filter.
 */
esp_err_t dspi_dotprod_s16_ansi(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift)
{
    // The requested window must fit inside both operands.
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    int16_t *i_data = (int16_t *)in_image->data;
    int16_t *f_data = (int16_t *)filter->data;

    // Distance, in elements, between two consecutive processed rows.
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;

    int64_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            acc += (int32_t)i_data[in_image->step_x * x] * (int32_t)f_data[filter->step_x * x];
        }
        i_data += i_step;
        f_data += f_step;
    }

    // Round to nearest and scale. The rounding term is built in 64 bits so it
    // stays valid for shift >= 32; shift <= 0 would make both shifts
    // undefined, so in that case the accumulator is kept as is.
    if (shift > 0) {
        acc += (int64_t)1 << (shift - 1); // round operation
        acc >>= shift;
    }
    *out_value = acc;
    return ESP_OK;
}

View File

@@ -0,0 +1,95 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_arp4_enabled == 1)
#include "dsp_err_codes.h"
.text
.align 4
.global dspi_dotprod_s16_arp4
.global dspi_dotprod_s16_ansi
.type dspi_dotprod_s16_arp4,@function
// esp_err_t dspi_dotprod_s16_arp4(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift);
//
// Vectorised signed 16-bit dot product. The vector path requires
// (in_image->step_x | filter->step_x) == 1 and count_x to be a multiple of 8;
// anything else is handed to dspi_dotprod_s16_ansi with the arguments untouched.
//
// Arguments:  a0 in_image, a1 filter, a2 out_value, a3 count_x, a4 count_y, a5 shift
// Working set:
//   t0 / t1 - start of the current image / filter row
//   t2 / t3 - image / filter row advance in bytes (stride_x * step_y * 2)
//   t4 / t5 - running image / filter pointers inside a row
//   t6      - vectors per row (count_x / 8)
//   a6      - scratch (shift - 1)
// Returns 0 in a0 on the vector path.
dspi_dotprod_s16_arp4:
    // Guard: leave for the C reference unless both step_x are 1 and count_x % 8 == 0.
    lw      t1, 4(a0)                       // in_image->step_x
    lw      t2, 4(a1)                       // filter->step_x
    or      t1, t1, t2
    addi    t1, t1, -1                      // 0 when the OR of both steps is 1
    andi    t2, a3, 7                       // 0 when count_x is a multiple of 8
    or      t1, t1, t2
    bnez    t1, .Ldspi_dotprod_s16_arp4_ansi

    addi    sp, sp, -16

    // Start the accumulator at the rounding term 1 << (shift - 1).
    esp.zero.xacc
    addi    a6, a5, -1
    li      t4, 1
    sll     t4, t4, a6
    esp.movx.w.xacc.l t4

    // Row advance of the image, in bytes.
    lw      t4, 12(a0)                      // stride_x
    lw      t2, 8(a0)                       // step_y
    mul     t2, t4, t2
    slli    t2, t2, 1                       // 16-bit elements -> bytes

    // Row advance of the filter, in bytes.
    lw      t5, 12(a1)                      // stride_x
    lw      t3, 8(a1)                       // step_y
    mul     t3, t5, t3
    slli    t3, t3, 1                       // 16-bit elements -> bytes

    lw      t0, 0(a0)                       // image data
    lw      t1, 0(a1)                       // filter data
    srli    t6, a3, 3                       // t6 = count_x / 8

.Ldspi_dotprod_s16_arp4_row:
    mv      t4, t0
    mv      t5, t1
    esp.vld.128.ip q0, t4, 16               // q0 = first image vector of the row
    esp.lp.setup 0, t6, .Ldspi_dotprod_s16_arp4_col
    esp.vld.128.ip q1, t5, 16               // q1 = filter vector
.Ldspi_dotprod_s16_arp4_col:
    esp.vmulas.s16.xacc.ld.ip q0, t4, 16, q0, q1 // xacc += q0 * q1, q0 = next image vector
    add     t0, t0, t2                      // next image row
    add     t1, t1, t3                      // next filter row
    addi    a4, a4, -1
    bgtz    a4, .Ldspi_dotprod_s16_arp4_row

    esp.srs.s.xacc t5, a5                   // t5 = accumulator shifted right by shift (a5)
    sh      t5, 0(a2)                       // *out_value (16 bit)
    li      a0, 0
    addi    sp, sp, 16
    ret

.Ldspi_dotprod_s16_arp4_ansi:
    j       dspi_dotprod_s16_ansi           // tail call, a0..a5 still hold the arguments
#endif // dspi_dotprod_arp4_enabled

View File

@@ -0,0 +1,370 @@
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_aes3_enabled == 1)
.text
.align 4
// Value returned in a2 when one of the four range checks at function entry fails.
// NOTE(review): 458755 == 0x70003; presumably ESP_ERR_DSP_PARAM_OUTOFRANGE, which the
// C reference returns for the same checks - confirm against dsp_err_codes.h.
.literal .LC0_1_52, 458755
# Program Unit: dspi_dotprod_s8_aes3
//
// dspi_dotprod_s8_aes3 - 8-bit signed 2-D dot product using the ee.* vector
// instructions, with a fallback to dspi_dotprod_s8_ansi.
//
// Arguments arrive in a2..a7 and are forwarded unchanged (as a10..a15) to
// dspi_dotprod_s8_ansi on every fallback path, so they follow that signature:
//   a2 = in_image, a3 = filter, a4 = out_value, a5 = count_x, a6 = count_y, a7 = shift
// Descriptor words read here: +0 data, +4 step_x, +8 step_y, +12 stride_x, +16 stride_y
// (field names taken from the matching checks in the C reference).
//
// Result in a2: 0 on the vector path, the .LC0_1_52 literal on a range failure,
// otherwise whatever dspi_dotprod_s8_ansi returned.
//
// Stack slots: a1+0 = image row pitch in bytes, a1+4 = filter->data,
//              a1+8 = filter->step_y.
//
.type dspi_dotprod_s8_aes3, @function
.align 4
.global dspi_dotprod_s8_aes3
dspi_dotprod_s8_aes3: # 0x4
.LBB1_dspi_dotprod_s8_aes3: # 0x4
entry a1,48 #
// Range check 1: in_image->step_x * count_x must not exceed in_image->stride_x.
l32i.n a10,a2,4 # [0] id:668
l32i.n a11,a2,12 # [1] id:667
mull a8,a10,a5 # [2]
blt a11,a8,.LBB78_dspi_dotprod_s8_aes3 # [4]
// Range check 2: in_image->step_y * count_y must not exceed in_image->stride_y.
l32i.n a12,a2,8 # [0] id:669
l32i.n a9,a2,16 # [1] id:670
mull a13,a12,a6 # [2]
blt a9,a13,.LBB78_dspi_dotprod_s8_aes3 # [4]
// Range check 3: filter->step_x * count_x must not exceed filter->stride_x.
// The product stays in a13 for the "filter row is exactly count_x wide" test below.
l32i.n a15,a3,4 # [0] id:672
l32i.n a14,a3,12 # [1] id:671
mull a13,a15,a5 # [2]
blt a14,a13,.LBB78_dspi_dotprod_s8_aes3 # [4]
// Range check 4: filter->step_y * count_y must not exceed filter->stride_y.
l32i.n a8,a3,16 # [0] id:674
l32i.n a9,a3,8 # [1] id:673
s32i.n a9,a1,8 # [2] gra_spill_temp_2
mull a9,a9,a6 # [3]
blt a8,a9,.LBB78_dspi_dotprod_s8_aes3 # [5]
// Vector-path preconditions on the filter; any failure goes to the C reference:
//   bit 0 of filter->data clear, filter->stride_x == filter->step_x * count_x,
//   filter->step_x == 1 and filter->step_y == 1.
l32i.n a8,a3,0 # [0] id:675
s32i.n a8,a1,4 # [1] gra_spill_temp_1
bbsi a8,0,.Lt_0_33026 # [2]
bne a14,a13,.Lt_0_33026 # [0]
bnei a15,1,.Lt_0_33026 # [0]
l32i.n a13,a1,8 # [0] gra_spill_temp_2
beqi a13,1,.Lt_0_17666 # [2]
// Fallback: forward the six original arguments to the C reference and return its result.
.Lt_0_33026: # 0x43
.Lt_0_17922: # 0x43
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
.type dspi_dotprod_s8_ansi, @function
call8 dspi_dotprod_s8_ansi # [6] dspi_dotprod_s8_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// Range-check failure: return the error literal.
.LBB78_dspi_dotprod_s8_aes3: # 0x56
l32r a2,.LC0_1_52 # [0]
retw.n # [1]
// Vector-path preconditions on the image and the window size:
//   in_image->step_x == 1, in_image->step_y == 1, count_x multiple of 16, count_y >= 4.
.Lt_0_17666: # 0x5b
addi.n a14,a10,-1 # [0]
bnez a14,.Lt_0_33794 # [1]
addi.n a15,a12,-1 # [0]
bnez a15,.Lt_0_33794 # [1]
extui a8,a5,0,4 # [0]
bnez.n a8,.Lt_0_33794 # [1]
blti a6,4,.Lt_0_33794 # [0]
// For count_x > 64 an odd count_x would also go to the C reference; the low four
// bits of count_x are already known to be zero here, so this branch cannot be taken.
movi.n a9,64 # [0]
bge a9,a5,.Lt_0_34306 # [1]
extui a10,a5,0,1 # [0]
bnez a10,.LBB28_dspi_dotprod_s8_aes3 # [1]
// Common setup for every vector loop:
//   a3  = count_y (the filter pointer is no longer needed, its data pointer was spilled)
//   a2  = in_image->data, a15 = filter->data
//   a1+0 = in_image->stride_x * in_image->step_y = image row pitch in bytes
//   ACCX is cleared, q0 is preloaded with the first 16 filter bytes.
.Lt_0_34306: # 0x78
.Lt_0_19714: # 0x78
mov.n a3,a6 # [0]
addi a13,a5,-48 # [1]
movi.n a14,0 # [2]
mull a15,a11,a12 # [3]
l32i.n a2,a2,0 # [4] id:676
s32i.n a15,a1,0 # [6] gra_spill_temp_0
wur.accx_0 a14 # [7]
l32i.n a15,a1,4 # [8] gra_spill_temp_1
wur.accx_1 a14 # [9]
ee.vld.128.ip q0,a15,16 # [10] id:679
// Dispatch on count_x to a loop unrolled for that width: 48, 32, 16, 64, 128, then > 128.
// The loops for 48/32/16/64 jump back into this chain behind their own test; count_x is
// unchanged, so the remaining equality tests fail and control reaches the
// "128 >= count_x" branch to the final scaling code.
beqz a13,.LBB32_dspi_dotprod_s8_aes3 # [11]
.Lt_0_22786: # 0x93
.Lt_0_22274: # 0x93
addi a8,a5,-32 # [0]
beqz a8,.LBB38_dspi_dotprod_s8_aes3 # [1]
.Lt_0_24322: # 0x99
.Lt_0_23810: # 0x99
addi a9,a5,-16 # [0]
beqz a9,.LBB44_dspi_dotprod_s8_aes3 # [1]
.Lt_0_25858: # 0x9f
.Lt_0_25346: # 0x9f
addi a10,a5,-64 # [0]
beqz a10,.LBB50_dspi_dotprod_s8_aes3 # [1]
.Lt_0_27394: # 0xa5
.Lt_0_26882: # 0xa5
addi a11,a5,-128 # [0]
beqz a11,.LBB56_dspi_dotprod_s8_aes3 # [1]
movi a12,128 # [0]
bge a12,a5,.Lt_0_30210 # [1]
// General path, count_x > 128. a12 counts finished rows up to a3 (count_y).
// q1..q3 are primed with the first image vectors before the row loop starts.
movi.n a12,0 # [0]
ee.ld.128.usar.ip q1,a2,16 # [1] id:751
ee.ld.128.usar.ip q2,a2,16 # [2] id:752
ee.src.q.ld.ip q3,a2,16,q1,q2 # [4] id:753
beqz.n a3,.Lt_0_30210 # [5]
//   a13 = count_x / 64 - 1  (hardware-loop trip count, 4 vector MACs per trip)
//   a14 = row pitch in bytes - count_x + 16  (pointer adjustment used at the row end)
// NOTE(review): one row executes (count_x / 64) * 4 vector MACs of 16 elements each, so a
// count_x that is a multiple of 16 but not of 64 does not appear to be fully covered on
// this path - confirm against the instruction semantics and the callers.
l32i.n a14,a1,0 # [0] gra_spill_temp_0
addi a13,a5,63 # [1]
movgez a13,a5,a5 # [2]
srai a13,a13,6 # [3]
sub a14,a14,a5 # [4]
addi a14,a14,16 # [5]
addi.n a13,a13,-1 # [6]
// One image row per pass.
.Lt_0_30978: # 0xd1
addi.n a12,a12,1 # [0]
movi.n a8,32 # [1]
movi.n a9,-16 # [2]
beqz.n a13,.Lt_0_31234 # [3]
loopnez a13,.LBB218_dspi_dotprod_s8_aes3 # [0]
// Steady state: each ee.vld.128.ip fetches 16 filter bytes through a15, each
// ee.vmulas.s8.accx.* accumulates into ACCX while loading the next image vector through a2.
.LBB216_dspi_dotprod_s8_aes3: # 0xdc
ee.vld.128.ip q5,a15,16 # [0*II+0] id:755
ee.vmulas.s8.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:754
ee.vld.128.ip q0,a15,16 # [0*II+2] id:757
ee.vmulas.s8.accx.ld.ip.qup q1,a2,16,q5,q2,q3,q4 # [0*II+3] id:756
ee.vld.128.ip q5,a15,16 # [0*II+4] id:759
ee.vmulas.s8.accx.ld.ip.qup q2,a2,16,q0,q3,q4,q1 # [0*II+5] id:758
ee.vld.128.ip q0,a15,16 # [0*II+6] id:761
ee.vmulas.s8.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+7] id:760
// Row tail: the last four vector MACs of the row. The .xp forms post-modify a2 by
// a14, then -16 (a9), then +32 (a8).
// NOTE(review): presumably this moves a2 to the next image row and re-primes the
// unaligned-load state - exact pointer choreography not verified here.
.LBB218_dspi_dotprod_s8_aes3: # 0xf8
.Lt_0_31234: # 0xf8
ee.vmulas.s8.accx.ld.ip.qup q5,a2,16,q0,q1,q2,q3 # [0] id:762
ee.vld.128.ip q0,a15,16 # [1] id:763
ee.vld.128.ip q6,a15,16 # [2] id:765
ee.vmulas.s8.accx.ld.xp.qup q7,a2,a14,q0,q2,q3,q5 # [3] id:764
ee.vld.128.ip q4,a15,16 # [4] id:768
ee.vmulas.s8.accx.ld.xp.qup q2,a2,a9,q6,q3,q5,q7 # [5] id:766
ee.ld.128.usar.xp q1,a2,a8 # [6] id:767
ee.vld.128.ip q0,a15,16 # [7] id:770
ee.vmulas.s8.accx.ld.ip.qup q3,a2,16,q4,q5,q1,q2 # [8] id:769
bne a12,a3,.Lt_0_30978 # [9]
// Final scaling, using ACCX_0 only: a11 = 1 shifted left by (shift - 1) is added as the
// rounding term, the sum is shifted right arithmetically by shift, the low 8 bits are
// stored to *out_value and 0 is returned.
.Lt_0_30210: # 0x11a
.Lt_0_29954: # 0x11a
movi.n a2,0 # [0]
rur.accx_0 a10 # [1]
addi.n a12,a7,-1 # [2]
movi.n a11,1 # [3]
ssl a12 # [4]
sll a11,a11 # [5]
ssr a7 # [6]
add.n a10,a10,a11 # [7]
sra a10,a10 # [8]
s8i a10,a4,0 # [9] id:772
retw.n # [10]
// Fallback to the C reference (image/window preconditions not met).
.Lt_0_33794: # 0x136
.Lt_0_18946: # 0x136
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
call8 dspi_dotprod_s8_ansi # [6] dspi_dotprod_s8_ansi
#.LBB25_dspi_dotprod_s8_aes3: # 0x145
mov.n a2,a10 # [0]
retw.n # [1]
// count_x == 48: three vector MACs per row, loop runs count_y (a6) times.
// a12 = row pitch in bytes - 32.
.LBB32_dspi_dotprod_s8_aes3: # 0x149
ee.ld.128.usar.ip q1,a2,16 # [0] id:680
ee.ld.128.usar.ip q2,a2,16 # [1] id:681
ee.src.q.ld.ip q3,a2,16,q1,q2 # [3] id:682
beqz.n a6,.Lt_0_22786 # [4]
movi.n a10,32 # [0]
l32i.n a12,a1,0 # [1] gra_spill_temp_0
movi.n a11,-16 # [2]
addi a12,a12,-32 # [3]
loopgtz a6,.LBB104_dspi_dotprod_s8_aes3 # [4]
.LBB102_dspi_dotprod_s8_aes3: # 0x160
ee.vld.128.ip q4,a15,16 # [0*II+0] id:684
ee.vmulas.s8.accx.ld.xp.qup q1,a2,a12,q0,q1,q2,q3 # [0*II+1] id:683
ee.vld.128.ip q5,a15,16 # [0*II+2] id:686
ee.vmulas.s8.accx.ld.xp.qup q2,a2,a11,q4,q2,q3,q1 # [0*II+3] id:685
ee.ld.128.usar.xp q1,a2,a10 # [0*II+4] id:687
ee.vld.128.ip q0,a15,16 # [0*II+5] id:689
ee.vmulas.s8.accx.ld.ip.qup q3,a2,16,q5,q3,q1,q2 # [0*II+6] id:688
.LBB104_dspi_dotprod_s8_aes3: # 0x178
j .Lt_0_22786 # [0]
// count_x == 32: two vector MACs per row, two rows per loop trip (a3 = count_y / 2).
// a12 = row pitch in bytes - 16.
.LBB38_dspi_dotprod_s8_aes3: # 0x17b
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
srli a3,a6,1 # [2]
l32i.n a12,a1,0 # [3] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [4] id:690
ee.ld.128.usar.ip q2,a2,16 # [5] id:691
addi a12,a12,-16 # [7]
ee.src.q.ld.xp q3,a2,a12,q1,q2 # [8] id:692
loopnez a3,.LBB127_dspi_dotprod_s8_aes3 # [9]
.LBB125_dspi_dotprod_s8_aes3: # 0x193
ee.vld.128.ip q4,a15,16 # [0*II+0] id:694
ee.vmulas.s8.accx.ld.xp.qup q3,a2,a11,q0,q1,q2,q3 # [0*II+1] id:693
ee.ld.128.usar.xp q1,a2,a10 # [0*II+2] id:695
ee.vld.128.ip q0,a15,16 # [0*II+3] id:697
ee.vmulas.s8.accx.ld.xp.qup q4,a2,a12,q4,q2,q1,q3 # [0*II+4] id:696
ee.vld.128.ip q5,a15,16 # [0*II+5] id:699
ee.vmulas.s8.accx.ld.xp.qup q2,a2,a11,q0,q1,q3,q4 # [0*II+6] id:698
ee.ld.128.usar.xp q1,a2,a10 # [0*II+7] id:700
ee.vld.128.ip q0,a15,16 # [0*II+8] id:702
ee.vmulas.s8.accx.ld.xp.qup q3,a2,a12,q5,q3,q1,q2 # [0*II+9] id:701
.LBB127_dspi_dotprod_s8_aes3: # 0x1b5
j .Lt_0_24322 # [0]
// count_x == 16: one vector MAC per row, four rows per loop trip (a3 = count_y / 4).
// The image is walked through a8 (starting at data + 16) with post-modifiers
// -16 (a10) and row pitch in bytes + 16 (a11); a8 is copied back to a2 afterwards.
.LBB44_dspi_dotprod_s8_aes3: # 0x1b8
srli a3,a3,2 # [0]
movi.n a10,-16 # [1]
l32i.n a11,a1,0 # [2] gra_spill_temp_0
addi a8,a2,16 # [3]
addi a11,a11,16 # [4]
ee.ld.128.usar.xp q2,a8,a10 # [5] id:703
ee.ld.128.usar.xp q1,a8,a11 # [6] id:704
ee.src.q.ld.xp q3,a8,a10,q1,q2 # [8] id:705
ee.ld.128.usar.xp q2,a8,a11 # [9] id:706
loopnez a3,.LBB150_dspi_dotprod_s8_aes3 # [10]
.LBB148_dspi_dotprod_s8_aes3: # 0x1d4
ee.vld.128.ip q4,a15,16 # [0*II+0] id:708
ee.vmulas.s8.accx.ld.xp.qup q3,a8,a10,q0,q1,q2,q3 # [0*II+1] id:707
ee.ld.128.usar.xp q1,a8,a11 # [0*II+2] id:709
ee.vld.128.ip q0,a15,16 # [0*II+3] id:711
ee.vmulas.s8.accx.ld.xp.qup q4,a8,a10,q4,q2,q1,q3 # [0*II+4] id:710
ee.ld.128.usar.xp q3,a8,a11 # [0*II+5] id:712
ee.vld.128.ip q5,a15,16 # [0*II+6] id:714
ee.vmulas.s8.accx.ld.xp.qup q4,a8,a10,q0,q1,q3,q4 # [0*II+7] id:713
ee.ld.128.usar.xp q1,a8,a11 # [0*II+8] id:715
ee.vld.128.ip q0,a15,16 # [0*II+9] id:717
ee.vmulas.s8.accx.ld.xp.qup q3,a8,a10,q5,q3,q1,q4 # [0*II+10] id:716
ee.ld.128.usar.xp q2,a8,a11 # [0*II+11] id:718
.LBB150_dspi_dotprod_s8_aes3: # 0x1fc
mov.n a2,a8 # [0]
j .Lt_0_25858 # [1]
// count_x == 64: four vector MACs per row, loop runs a3 (count_y) times.
// a12 = row pitch in bytes - count_x + 16.
.LBB50_dspi_dotprod_s8_aes3: # 0x201
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
l32i.n a12,a1,0 # [2] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [3] id:719
ee.ld.128.usar.ip q2,a2,16 # [4] id:720
sub a12,a12,a5 # [5]
ee.src.q.ld.ip q3,a2,16,q1,q2 # [7] id:721
addi a12,a12,16 # [8]
loopnez a3,.LBB173_dspi_dotprod_s8_aes3 # [9]
.LBB171_dspi_dotprod_s8_aes3: # 0x219
ee.vld.128.ip q5,a15,16 # [0*II+0] id:723
ee.vmulas.s8.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:722
ee.vld.128.ip q1,a15,16 # [0*II+2] id:725
ee.vmulas.s8.accx.ld.xp.qup q0,a2,a12,q5,q2,q3,q4 # [0*II+3] id:724
ee.vld.128.ip q5,a15,16 # [0*II+4] id:728
ee.vmulas.s8.accx.ld.xp.qup q2,a2,a11,q1,q3,q4,q0 # [0*II+5] id:726
ee.ld.128.usar.xp q1,a2,a10 # [0*II+6] id:727
ee.vld.128.ip q0,a15,16 # [0*II+7] id:730
ee.vmulas.s8.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+8] id:729
.LBB173_dspi_dotprod_s8_aes3: # 0x238
j .Lt_0_27394 # [0]
// count_x == 128: eight vector MACs per row, loop runs a3 (count_y) times.
// a12 = row pitch in bytes - count_x + 16.
.LBB56_dspi_dotprod_s8_aes3: # 0x23b
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
l32i.n a12,a1,0 # [2] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [3] id:731
ee.ld.128.usar.ip q2,a2,16 # [4] id:732
sub a12,a12,a5 # [6]
addi a12,a12,16 # [7]
ee.src.q.ld.ip q3,a2,16,q1,q2 # [8] id:733
loopnez a3,.LBB195_dspi_dotprod_s8_aes3 # [9]
.LBB193_dspi_dotprod_s8_aes3: # 0x253
ee.vld.128.ip q4,a15,16 # [0*II+0] id:735
ee.vmulas.s8.accx.ld.ip.qup q1,a2,16,q0,q1,q2,q3 # [0*II+1] id:734
ee.vld.128.ip q0,a15,16 # [0*II+2] id:737
ee.vmulas.s8.accx.ld.ip.qup q4,a2,16,q4,q2,q3,q1 # [0*II+3] id:736
ee.vld.128.ip q5,a15,16 # [0*II+4] id:739
ee.vmulas.s8.accx.ld.ip.qup q0,a2,16,q0,q3,q1,q4 # [0*II+5] id:738
ee.vld.128.ip q6,a15,16 # [0*II+6] id:741
ee.vmulas.s8.accx.ld.ip.qup q1,a2,16,q5,q1,q4,q0 # [0*II+7] id:740
ee.vld.128.ip q5,a15,16 # [0*II+8] id:743
ee.vmulas.s8.accx.ld.ip.qup q4,a2,16,q6,q4,q0,q1 # [0*II+9] id:742
ee.vld.128.ip q6,a15,16 # [0*II+10] id:745
ee.vmulas.s8.accx.ld.xp.qup q0,a2,a12,q5,q0,q1,q4 # [0*II+11] id:744
ee.vld.128.ip q5,a15,16 # [0*II+12] id:748
ee.vmulas.s8.accx.ld.xp.qup q2,a2,a11,q6,q1,q4,q0 # [0*II+13] id:746
ee.ld.128.usar.xp q1,a2,a10 # [0*II+14] id:747
ee.vld.128.ip q0,a15,16 # [0*II+15] id:750
ee.vmulas.s8.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+16] id:749
// Same final scaling sequence as at .Lt_0_30210, duplicated here instead of a jump:
// round with 1 << (shift - 1), shift right by shift, store 8 bits, return 0.
.LBB195_dspi_dotprod_s8_aes3: # 0x28e
movi.n a2,0 # [0]
movi.n a11,1 # [1]
addi.n a12,a7,-1 # [2]
rur.accx_0 a10 # [3]
ssl a12 # [4]
sll a11,a11 # [5]
ssr a7 # [6]
add.n a10,a10,a11 # [7]
sra a10,a10 # [8]
s8i a10,a4,0 # [9] id:772
retw.n # [10]
// Fallback to the C reference for count_x > 64 with bit 0 set (see the note at the
// branch that targets this label).
.LBB28_dspi_dotprod_s8_aes3: # 0x2aa
mov.n a15,a7 # [0]
mov.n a14,a6 # [1]
mov.n a13,a5 # [2]
mov.n a12,a4 # [3]
mov.n a11,a3 # [4]
mov.n a10,a2 # [5]
call8 dspi_dotprod_s8_ansi # [6] dspi_dotprod_s8_ansi
#.LBB29_dspi_dotprod_s8_aes3: # 0x2b9
mov.n a2,a10 # [0]
retw.n # [1]
#endif // dspi_dotprod_aes3_enabled

View File

@@ -0,0 +1,49 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod.h"
/**
 * Portable reference implementation: dot product of a 2-D signed 8-bit image
 * window with a signed 8-bit filter.
 *
 * Accumulates in[y][x] * filter[y][x] over y < count_y and x < count_x in a
 * 32-bit accumulator, rounds to nearest by adding 2^(shift-1), shifts right by
 * `shift` and stores the result, truncated to 8 bits, in *out_value.
 *
 * @param in_image  input image descriptor (data, step_x/step_y, stride_x/stride_y)
 * @param filter    filter descriptor, same layout as in_image
 * @param out_value destination of the scaled result
 * @param count_x   number of elements processed per row
 * @param count_y   number of rows processed
 * @param shift     right shift applied to the accumulator; for shift <= 0 the
 *                  accumulator is stored without rounding or shifting
 *
 * @return ESP_OK on success, ESP_ERR_DSP_PARAM_OUTOFRANGE when step * count
 *         exceeds the matching stride of the image or of the filter.
 */
esp_err_t dspi_dotprod_s8_ansi(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift)
{
    // The requested window must fit inside both operands.
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    int8_t *i_data = (int8_t *)in_image->data;
    int8_t *f_data = (int8_t *)filter->data;

    // Distance, in elements, between two consecutive processed rows.
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;

    int32_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            acc += (int16_t)i_data[in_image->step_x * x] * (int16_t)f_data[filter->step_x * x];
        }
        i_data += i_step;
        f_data += f_step;
    }

    // Round to nearest and scale. Shifting by (shift - 1) or by shift is
    // undefined for shift <= 0, so in that case the accumulator is kept as is.
    if (shift > 0) {
        acc += (int32_t)1 << (shift - 1); // round operation
        acc >>= shift;
    }
    *out_value = acc;
    return ESP_OK;
}

View File

@@ -0,0 +1,93 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_arp4_enabled == 1)
#include "dsp_err_codes.h"
.text
.align 4
.global dspi_dotprod_s8_arp4
.global dspi_dotprod_s8_ansi
.type dspi_dotprod_s8_arp4,@function
// esp_err_t dspi_dotprod_s8_arp4(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift);
//
// Vectorised signed 8-bit dot product. The vector path is taken only when
// (in_image->step_x | filter->step_x) == 1 and count_x is a multiple of 16;
// every other case tail-jumps to dspi_dotprod_s8_ansi with the argument
// registers untouched. Returns 0 in a0 on the vector path.
dspi_dotprod_s8_arp4:
// in_image  - a0
// filter    - a1
// out_value - a2 (int8_t *, same argument the ansi fallback receives)
// count_x   - a3
// count_y   - a4
// shift     - a5
// i_data    - t0 (start of the current image row)
// f_data    - t1 (start of the current filter row)
// i_step    - t2 (stride_x * step_y of the image)
// f_step    - t3 (stride_x * step_y of the filter)
// t4        - current i_data
// t5        - current f_data
// t6        - inner loop count (count_x / 16)
// a6        - scratch (shift - 1)
    lw      t1, 4(a0)               // load in_image->step_x
    lw      t2, 4(a1)               // load filter->step_x
    or      t1, t1, t2
    addi    t1, t1, -1              // should be 0 now
    andi    t2, a3, 15              // count_x must be a multiple of 16
    or      t1, t1, t2
    beqz    t1, .dspi_dotprod_s8_arp4_body
    j       dspi_dotprod_s8_ansi
.dspi_dotprod_s8_arp4_body:
    add     sp, sp, -16
    lw      t0, 0(a0)               // i_data
    lw      t1, 0(a1)               // f_data
    lw      t2, 8(a0)               // step_y
    lw      t4, 12(a0)              // stride_x
    mul     t2, t4, t2              // i_step
    lw      t3, 8(a1)               // step_y
    lw      t5, 12(a1)              // stride_x
    mul     t3, t5, t3              // f_step
    srli    t6, a3, 4               // t6 = count_x/16
    addi    a6, a5, -1
    li      t4, 1
    sll     t4, t4, a6              // t4 = 1 << (shift - 1), the rounding term
    esp.zero.xacc
    esp.movx.w.xacc.l t4            // accumulator starts at the rounding term
.loop_count_y:
    mv      t4, t0
    mv      t5, t1
    esp.vld.128.ip q0, t4, 16       // q0 - i_data
    esp.lp.setup 0, t6, .loop_count_x
    esp.vld.128.ip q1, t5, 16       // q1 - f_data
.loop_count_x: esp.vmulas.s8.xacc.ld.ip q0, t4, 16, q0, q1 // xacc += q0 * q1, q0 - next i_data
    add     t0, t0, t2              // next image row
    add     t1, t1, t3              // next filter row
    add     a4,a4, -1
    bgtz    a4, .loop_count_y
    esp.srs.s.xacc t5, a5           // shift accx register by final_shift amount (a5), save the lower 32bits to t5
    sb      t5, 0(a2)               // out_value is int8_t *: store one byte only
    li      a0,0
    add     sp,sp,16
    ret
#endif // dspi_dotprod_arp4_enabled

View File

@@ -0,0 +1,371 @@
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_aes3_enabled == 1)
.text
.align 4
.literal .LC0_1_55, 458755
# Program Unit: dspi_dotprod_u16_aes3
.type dspi_dotprod_u16_aes3, @function
.align 4
.global dspi_dotprod_u16_aes3
// esp_err_t dspi_dotprod_u16_aes3(image2d_t *in_image, image2d_t *filter, uint16_t *out_value,
//                                 int count_x, int count_y, int shift)
// Xtensa windowed ABI (entry / retw, call8).
// On entry: a2 = in_image, a3 = filter, a4 = out_value, a5 = count_x, a6 = count_y, a7 = shift.
// image2d_t field offsets used: 0 = data, 4 = step_x, 8 = step_y, 12 = stride_x, 16 = stride_y.
// Returns 0 on success, or the literal .LC0_1_55 when one of the four range checks fails
// (the same checks for which dspi_dotprod_u16_ansi returns ESP_ERR_DSP_PARAM_OUTOFRANGE).
//
// The vector path is taken only when: filter->data has bit 0 clear,
// filter->stride_x == filter->step_x * count_x, all four step_x/step_y values are 1,
// count_x is a multiple of 8, count_y >= 4, and count_x is either <= 32 or a multiple of 32.
// Everything else is forwarded to dspi_dotprod_u16_ansi.
//
// Changes relative to the previous revision:
//  * the three fallback paths called dspi_dotprod_s16_ansi (signed arithmetic); they now call
//    dspi_dotprod_u16_ansi so samples >= 0x8000 are treated as unsigned;
//  * the "count_x > 32" guard tested bit 0 of count_x, which is always 0 after the "& 7" test.
//    count_x = 40/48/56 therefore reached the final store with an empty accumulator, and
//    count_x > 64 that is not a multiple of 32 was only partially processed by the generic
//    loop (count_x / 32 blocks of 32 samples). The guard now tests count_x & 31.
//
// NOTE(review): the count_x == 16 loop runs count_y >> 1 times (two rows per iteration) and the
// count_x == 8 loop runs count_y >> 2 times (four rows per iteration); remaining rows of an
// odd / non-multiple-of-4 count_y are not processed. Confirm callers guarantee suitable count_y.
dspi_dotprod_u16_aes3: # 0x4
.LBB1_dspi_dotprod_u16_aes3: # 0x4
entry a1,64 #
// Range checks, mirroring the ANSI implementation: step * count must not exceed stride.
l32i.n a10,a2,4 # [0] id:681
l32i.n a11,a2,12 # [1] id:680
mull a8,a10,a5 # [2]
blt a11,a8,.LBB81_dspi_dotprod_u16_aes3 # [4]
l32i.n a12,a2,8 # [0] id:682
l32i.n a9,a2,16 # [1] id:683
mull a13,a12,a6 # [2]
blt a9,a13,.LBB81_dspi_dotprod_u16_aes3 # [4]
l32i.n a15,a3,4 # [0] id:685
l32i.n a14,a3,12 # [1] id:684
mull a13,a15,a5 # [2]
blt a14,a13,.LBB81_dspi_dotprod_u16_aes3 # [4]
l32i.n a8,a3,16 # [0] id:687
l32i.n a9,a3,8 # [1] id:686
s32i.n a9,a1,24 # [2] gra_spill_temp_2
mull a9,a9,a6 # [3]
blt a8,a9,.LBB81_dspi_dotprod_u16_aes3 # [5]
// Filter-side eligibility: data pointer bit 0 clear, rows contiguous, unit steps.
l32i.n a8,a3,0 # [0] id:688
s32i.n a8,a1,20 # [1] gra_spill_temp_1
bbsi a8,0,.Lt_0_34050 # [2]
bne a14,a13,.Lt_0_34050 # [0]
bnei a15,1,.Lt_0_34050 # [0]
l32i.n a9,a1,24 # [0] gra_spill_temp_2
beqi a9,1,.Lt_0_18178 # [2]
// Fallback 1: forward the six arguments unchanged to the unsigned ANSI implementation.
.Lt_0_34050: # 0x43
.Lt_0_18434: # 0x43
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
.type dspi_dotprod_u16_ansi, @function
call8 dspi_dotprod_u16_ansi # [6] dspi_dotprod_u16_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// Range-check failure: return the error literal.
.LBB81_dspi_dotprod_u16_aes3: # 0x56
l32r a2,.LC0_1_55 # [0]
retw.n # [1]
// Image-side eligibility (a10 = in_image->step_x, a12 = in_image->step_y) and size checks.
.Lt_0_18178: # 0x5b
addi.n a13,a10,-1 # [0]
bnez a13,.Lt_0_34818 # [1]
addi.n a14,a12,-1 # [0]
bnez a14,.Lt_0_34818 # [1]
extui a15,a5,0,3 # [0]
bnez.n a15,.Lt_0_34818 # [1]
blti a6,4,.Lt_0_34818 # [0]
movi.n a8,32 # [0]
bge a8,a5,.Lt_0_35330 # [1]
// count_x > 32: only multiples of 32 are handled by the vector code below.
extui a9,a5,0,5 # [0] count_x & 31
bnez a9,.LBB28_dspi_dotprod_u16_aes3 # [1]
// Vector path set-up. From here a2 = in_image->data, a3 = count_y, a15 = filter->data,
// and the stack slot at a1+16 holds the image row advance in bytes
// (in_image->stride_x * in_image->step_y * 2).
.Lt_0_35330: # 0x78
.Lt_0_20226: # 0x78
mov.n a3,a6 # [0]
addi a10,a5,-24 # [1]
mull a13,a11,a12 # [2]
l32i.n a15,a1,20 # [3] gra_spill_temp_1
l32i.n a2,a2,0 # [4] id:689
movi.n a14,0 # [5]
wur.sar_byte a14 # [6]
wur.accx_0 a14 # [8]
wur.accx_1 a14 # [9]
ee.vld.128.ip q0,a15,16 # [10] id:693
slli a13,a13,1 # [11]
s32i.n a13,a1,16 # [12] gra_spill_temp_0
// Dispatch on count_x: 24, 16, 8, 32, 64 each have a dedicated loop; each loop jumps back
// into this chain after it finishes, and the chain ends at the final rounding/store.
beqz a10,.LBB32_dspi_dotprod_u16_aes3 # [13]
.Lt_0_23298: # 0x99
.Lt_0_22786: # 0x99
addi a8,a5,-16 # [0]
beqz a8,.LBB38_dspi_dotprod_u16_aes3 # [1]
.Lt_0_24834: # 0x9f
.Lt_0_24322: # 0x9f
addi a9,a5,-8 # [0]
beqz a9,.LBB44_dspi_dotprod_u16_aes3 # [1]
.Lt_0_26370: # 0xa5
.Lt_0_25858: # 0xa5
addi a10,a5,-32 # [0]
beqz a10,.LBB50_dspi_dotprod_u16_aes3 # [1]
.Lt_0_27906: # 0xab
.Lt_0_27394: # 0xab
addi a11,a5,-64 # [0]
beqz a11,.LBB56_dspi_dotprod_u16_aes3 # [1]
movi.n a12,64 # [0]
bge a12,a5,.Lt_0_30722 # [1]
// Generic path, count_x > 64 (a multiple of 32): a12 counts rows up to a3 (count_y),
// a13 = count_x / 32 - 1 inner iterations of 32 samples, followed by a 32-sample row tail.
movi.n a12,0 # [0]
ee.ld.128.usar.ip q1,a2,16 # [1] id:765
ee.ld.128.usar.ip q2,a2,16 # [2] id:766
ee.src.q.ld.ip q3,a2,16,q1,q2 # [4] id:767
beqz.n a3,.Lt_0_30722 # [5]
slli a8,a5,1 # [0]
l32i.n a14,a1,16 # [1] gra_spill_temp_0
addi a13,a5,31 # [2]
movgez a13,a5,a5 # [3]
srai a13,a13,5 # [4]
sub a14,a14,a8 # [5]
addi a14,a14,16 # [6]
addi.n a13,a13,-1 # [7]
.Lt_0_31490: # 0xd9
addi.n a12,a12,1 # [0]
movi.n a9,32 # [1]
beqz.n a13,.Lt_0_31746 # [2]
loopnez a13,.LBB221_dspi_dotprod_u16_aes3 # [0]
.LBB219_dspi_dotprod_u16_aes3: # 0xe2
ee.vld.128.ip q5,a15,16 # [0*II+0] id:769
ee.vmulas.u16.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:768
ee.vld.128.ip q0,a15,16 # [0*II+2] id:771
ee.vmulas.u16.accx.ld.ip.qup q1,a2,16,q5,q2,q3,q4 # [0*II+3] id:770
ee.vld.128.ip q5,a15,16 # [0*II+4] id:773
ee.vmulas.u16.accx.ld.ip.qup q2,a2,16,q0,q3,q4,q1 # [0*II+5] id:772
ee.vld.128.ip q0,a15,16 # [0*II+6] id:775
ee.vmulas.u16.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+7] id:774
.LBB221_dspi_dotprod_u16_aes3: # 0xfe
.Lt_0_31746: # 0xfe
ee.vmulas.u16.accx.ld.ip.qup q5,a2,16,q0,q1,q2,q3 # [0] id:776
movi.n a10,-16 # [1]
ee.vld.128.ip q0,a15,16 # [2] id:777
ee.vld.128.ip q6,a15,16 # [3] id:779
ee.vmulas.u16.accx.ld.xp.qup q7,a2,a14,q0,q2,q3,q5 # [4] id:778
ee.vld.128.ip q4,a15,16 # [5] id:782
ee.vmulas.u16.accx.ld.xp.qup q2,a2,a10,q6,q3,q5,q7 # [6] id:780
ee.ld.128.usar.xp q1,a2,a9 # [7] id:781
ee.vld.128.ip q0,a15,16 # [8] id:784
ee.vmulas.u16.accx.ld.ip.qup q3,a2,16,q4,q5,q1,q2 # [9] id:783
bne a12,a3,.Lt_0_31490 # [10]
// Finalization: read the accumulator (a9 = accx_0, a10 = accx_1).
// shift >= 1: result = ((acc >> (shift - 1)) + 1) >> 1, low 16 bits stored.
// shift <  1: the low 16 bits of accx_0 are stored unchanged (.Lt_0_33282).
.Lt_0_30722: # 0x122
.Lt_0_30466: # 0x122
rur.accx_0 a9 # [0]
rur.accx_1 a10 # [1]
blti a7,1,.Lt_0_33282 # [2]
movi.n a2,0 # [0]
addi a13,a7,-33 # [1]
addi.n a14,a7,-1 # [2]
ssr a14 # [3]
sra a12,a10 # [4]
src a11,a10,a9 # [5]
movgez a11,a12,a13 # [6]
addi.n a11,a11,1 # [7]
srli a11,a11,1 # [8]
s16i a11,a4,0 # [9] id:790
retw.n # [10]
// Fallback 2: image-side steps / count_x / count_y not eligible for the vector path.
.Lt_0_34818: # 0x148
.Lt_0_19458: # 0x148
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
call8 dspi_dotprod_u16_ansi # [6] dspi_dotprod_u16_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// count_x == 24: three vectors per row, one row per iteration, count_y (a6) iterations.
.LBB32_dspi_dotprod_u16_aes3: # 0x15b
ee.ld.128.usar.ip q1,a2,16 # [0] id:694
ee.ld.128.usar.ip q2,a2,16 # [1] id:695
ee.src.q.ld.ip q3,a2,16,q1,q2 # [3] id:696
beqz.n a6,.Lt_0_23298 # [4]
addi a12,a13,-32 # [0]
movi.n a10,32 # [1]
movi.n a11,-16 # [2]
loopgtz a6,.LBB107_dspi_dotprod_u16_aes3 # [3]
.LBB105_dspi_dotprod_u16_aes3: # 0x170
ee.vld.128.ip q4,a15,16 # [0*II+0] id:698
ee.vmulas.u16.accx.ld.xp.qup q1,a2,a12,q0,q1,q2,q3 # [0*II+1] id:697
ee.vld.128.ip q5,a15,16 # [0*II+2] id:700
ee.vmulas.u16.accx.ld.xp.qup q2,a2,a11,q4,q2,q3,q1 # [0*II+3] id:699
ee.ld.128.usar.xp q1,a2,a10 # [0*II+4] id:701
ee.vld.128.ip q0,a15,16 # [0*II+5] id:703
ee.vmulas.u16.accx.ld.ip.qup q3,a2,16,q5,q3,q1,q2 # [0*II+6] id:702
.LBB107_dspi_dotprod_u16_aes3: # 0x188
j .Lt_0_23298 # [0]
// count_x == 16: two vectors per row, two rows per iteration, count_y >> 1 iterations.
.LBB38_dspi_dotprod_u16_aes3: # 0x18b
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
srli a3,a6,1 # [2]
l32i.n a12,a1,16 # [3] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [4] id:704
ee.ld.128.usar.ip q2,a2,16 # [5] id:705
addi a12,a12,-16 # [7]
ee.src.q.ld.xp q3,a2,a12,q1,q2 # [8] id:706
loopnez a3,.LBB130_dspi_dotprod_u16_aes3 # [9]
.LBB128_dspi_dotprod_u16_aes3: # 0x1a3
ee.vld.128.ip q4,a15,16 # [0*II+0] id:708
ee.vmulas.u16.accx.ld.xp.qup q3,a2,a11,q0,q1,q2,q3 # [0*II+1] id:707
ee.ld.128.usar.xp q1,a2,a10 # [0*II+2] id:709
ee.vld.128.ip q0,a15,16 # [0*II+3] id:711
ee.vmulas.u16.accx.ld.xp.qup q4,a2,a12,q4,q2,q1,q3 # [0*II+4] id:710
ee.vld.128.ip q5,a15,16 # [0*II+5] id:713
ee.vmulas.u16.accx.ld.xp.qup q2,a2,a11,q0,q1,q3,q4 # [0*II+6] id:712
ee.ld.128.usar.xp q1,a2,a10 # [0*II+7] id:714
ee.vld.128.ip q0,a15,16 # [0*II+8] id:716
ee.vmulas.u16.accx.ld.xp.qup q3,a2,a12,q5,q3,q1,q2 # [0*II+9] id:715
.LBB130_dspi_dotprod_u16_aes3: # 0x1c5
j .Lt_0_24834 # [0]
// count_x == 8: one vector per row, four rows per iteration, count_y >> 2 iterations.
.LBB44_dspi_dotprod_u16_aes3: # 0x1c8
srli a3,a3,2 # [0]
movi.n a10,-16 # [1]
l32i.n a11,a1,16 # [2] gra_spill_temp_0
addi a8,a2,16 # [3]
addi a11,a11,16 # [4]
ee.ld.128.usar.xp q2,a8,a10 # [5] id:717
ee.ld.128.usar.xp q1,a8,a11 # [6] id:718
ee.src.q.ld.xp q3,a8,a10,q1,q2 # [8] id:719
ee.ld.128.usar.xp q2,a8,a11 # [9] id:720
loopnez a3,.LBB153_dspi_dotprod_u16_aes3 # [10]
.LBB151_dspi_dotprod_u16_aes3: # 0x1e4
ee.vld.128.ip q4,a15,16 # [0*II+0] id:722
ee.vmulas.u16.accx.ld.xp.qup q3,a8,a10,q0,q1,q2,q3 # [0*II+1] id:721
ee.ld.128.usar.xp q1,a8,a11 # [0*II+2] id:723
ee.vld.128.ip q0,a15,16 # [0*II+3] id:725
ee.vmulas.u16.accx.ld.xp.qup q4,a8,a10,q4,q2,q1,q3 # [0*II+4] id:724
ee.ld.128.usar.xp q3,a8,a11 # [0*II+5] id:726
ee.vld.128.ip q5,a15,16 # [0*II+6] id:728
ee.vmulas.u16.accx.ld.xp.qup q4,a8,a10,q0,q1,q3,q4 # [0*II+7] id:727
ee.ld.128.usar.xp q1,a8,a11 # [0*II+8] id:729
ee.vld.128.ip q0,a15,16 # [0*II+9] id:731
ee.vmulas.u16.accx.ld.xp.qup q3,a8,a10,q5,q3,q1,q4 # [0*II+10] id:730
ee.ld.128.usar.xp q2,a8,a11 # [0*II+11] id:732
.LBB153_dspi_dotprod_u16_aes3: # 0x20c
mov.n a2,a8 # [0]
j .Lt_0_26370 # [1]
// count_x == 32: four vectors per row, one row per iteration, a3 (count_y) iterations.
.LBB50_dspi_dotprod_u16_aes3: # 0x211
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
slli a13,a5,1 # [2]
l32i.n a12,a1,16 # [3] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [4] id:733
ee.ld.128.usar.ip q2,a2,16 # [5] id:734
sub a12,a12,a13 # [6]
ee.src.q.ld.ip q3,a2,16,q1,q2 # [8] id:735
addi a12,a12,16 # [9]
loopnez a3,.LBB176_dspi_dotprod_u16_aes3 # [10]
.LBB174_dspi_dotprod_u16_aes3: # 0x22c
ee.vld.128.ip q5,a15,16 # [0*II+0] id:737
ee.vmulas.u16.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:736
ee.vld.128.ip q1,a15,16 # [0*II+2] id:739
ee.vmulas.u16.accx.ld.xp.qup q0,a2,a12,q5,q2,q3,q4 # [0*II+3] id:738
ee.vld.128.ip q5,a15,16 # [0*II+4] id:742
ee.vmulas.u16.accx.ld.xp.qup q2,a2,a11,q1,q3,q4,q0 # [0*II+5] id:740
ee.ld.128.usar.xp q1,a2,a10 # [0*II+6] id:741
ee.vld.128.ip q0,a15,16 # [0*II+7] id:744
ee.vmulas.u16.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+8] id:743
.LBB176_dspi_dotprod_u16_aes3: # 0x24b
j .Lt_0_27906 # [0]
// count_x == 64: eight vectors per row, one row per iteration, a3 (count_y) iterations.
.LBB56_dspi_dotprod_u16_aes3: # 0x24e
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
slli a13,a5,1 # [2]
l32i.n a12,a1,16 # [3] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [4] id:745
ee.ld.128.usar.ip q2,a2,16 # [5] id:746
sub a12,a12,a13 # [7]
addi a12,a12,16 # [8]
ee.src.q.ld.ip q3,a2,16,q1,q2 # [9] id:747
loopnez a3,.LBB198_dspi_dotprod_u16_aes3 # [10]
.LBB196_dspi_dotprod_u16_aes3: # 0x269
ee.vld.128.ip q4,a15,16 # [0*II+0] id:749
ee.vmulas.u16.accx.ld.ip.qup q1,a2,16,q0,q1,q2,q3 # [0*II+1] id:748
ee.vld.128.ip q0,a15,16 # [0*II+2] id:751
ee.vmulas.u16.accx.ld.ip.qup q4,a2,16,q4,q2,q3,q1 # [0*II+3] id:750
ee.vld.128.ip q5,a15,16 # [0*II+4] id:753
ee.vmulas.u16.accx.ld.ip.qup q0,a2,16,q0,q3,q1,q4 # [0*II+5] id:752
ee.vld.128.ip q6,a15,16 # [0*II+6] id:755
ee.vmulas.u16.accx.ld.ip.qup q1,a2,16,q5,q1,q4,q0 # [0*II+7] id:754
ee.vld.128.ip q5,a15,16 # [0*II+8] id:757
ee.vmulas.u16.accx.ld.ip.qup q4,a2,16,q6,q4,q0,q1 # [0*II+9] id:756
ee.vld.128.ip q6,a15,16 # [0*II+10] id:759
ee.vmulas.u16.accx.ld.xp.qup q0,a2,a12,q5,q0,q1,q4 # [0*II+11] id:758
ee.vld.128.ip q5,a15,16 # [0*II+12] id:762
ee.vmulas.u16.accx.ld.xp.qup q2,a2,a11,q6,q1,q4,q0 # [0*II+13] id:760
ee.ld.128.usar.xp q1,a2,a10 # [0*II+14] id:761
ee.vld.128.ip q0,a15,16 # [0*II+15] id:764
ee.vmulas.u16.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+16] id:763
.LBB198_dspi_dotprod_u16_aes3: # 0x2a4
j .Lt_0_30722 # [0]
// shift < 1: no rounding, store the low 16 bits of accx_0 and return 0.
.Lt_0_33282: # 0x2a7
movi.n a2,0 # [0]
sext a14,a9,15 # [1]
s16i a14,a4,0 # [2] id:791
retw.n # [3]
// Fallback 3: count_x > 32 but not a multiple of 32 (a2..a7 still hold the original arguments).
.LBB28_dspi_dotprod_u16_aes3: # 0x2b1
mov.n a15,a7 # [0]
mov.n a14,a6 # [1]
mov.n a13,a5 # [2]
mov.n a12,a4 # [3]
mov.n a11,a3 # [4]
mov.n a10,a2 # [5]
call8 dspi_dotprod_u16_ansi # [6] dspi_dotprod_u16_ansi
mov.n a2,a10 # [0]
retw.n # [1]
#endif // dspi_dotprod_aes3_enabled

View File

@@ -0,0 +1,49 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod.h"
/**
 * Reference (portable C) 2-D dot product of two unsigned 16-bit images.
 *
 * Multiplies count_x * count_y samples of in_image with the corresponding
 * samples of filter, accumulates the products in 64 bits, rounds, shifts the
 * sum right by `shift` and stores the low 16 bits in *out_value.
 *
 * @param in_image  input image descriptor (data, step_x/step_y, stride_x/stride_y)
 * @param filter    filter image descriptor, same layout
 * @param out_value destination for the 16-bit result
 * @param count_x   number of samples per row
 * @param count_y   number of rows
 * @param shift     right shift applied to the accumulated sum; values <= 0
 *                  leave the sum unshifted and unrounded
 * @return ESP_OK, or ESP_ERR_DSP_PARAM_OUTOFRANGE when step * count exceeds
 *         the corresponding stride of either image.
 */
esp_err_t dspi_dotprod_u16_ansi(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift)
{
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    uint16_t *i_data = (uint16_t *)in_image->data;
    uint16_t *f_data = (uint16_t *)filter->data;
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;
    int64_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            // Multiply in 64 bits: 0xFFFF * 0xFFFF does not fit in int32_t,
            // so a 32-bit signed product would overflow.
            acc += (int64_t)i_data[in_image->step_x * x] * (int64_t)f_data[filter->step_x * x];
        }
        i_data += i_step;
        f_data += f_step;
    }
    if (shift > 0) {
        // Round to nearest. The rounding term is built in 64 bits so it stays
        // positive for shift == 32 and is never formed with a negative shift
        // count (shift == 0 previously evaluated 1 << -1).
        acc += (int64_t)1 << (shift - 1);
        acc >>= shift;
    }
    *out_value = acc;
    return ESP_OK;
}

View File

@@ -0,0 +1,95 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_arp4_enabled == 1)
#include "dsp_err_codes.h"
.text
.align 4
.global dspi_dotprod_u16_arp4
.global dspi_dotprod_u16_ansi
.type dspi_dotprod_u16_arp4,@function
// esp_err_t dspi_dotprod_u16_arp4(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift);
dspi_dotprod_u16_arp4:
// 2-D dot product of unsigned 16-bit image and filter data.
// The fast path below is used only when the OR of both step_x fields equals 1
// and count_x is a multiple of 8; every other case tail-jumps to
// dspi_dotprod_u16_ansi with the argument registers untouched.
// image2d_t field offsets used here: 0 = data, 4 = step_x, 8 = step_y, 12 = stride_x.
// in_image - a0
// filter - a1
// out_value - a2
// count_x - a3
// count_y - a4
// shift - a5
// i_data - t0
// f_data - t1
// i_step - t2
// f_step - t3
// t4 - current i_data
// t5 - current f_data
// t6 - number of 16-byte vectors per row (count_x / 8)
lw t1, 4(a0) // load in_image->step_x
lw t2, 4(a1) // load filter->step_x
or t1, t1, t2
addi t1, t1, -1 // 0 only when (in_image->step_x | filter->step_x) == 1
andi t2, a3, 7 // count_x % 8
or t1, t1, t2
beqz t1, .dspi_dotprod_u16_arp4_body
j dspi_dotprod_u16_ansi // tail call: the ANSI version returns directly to our caller
.dspi_dotprod_u16_arp4_body:
// NOTE(review): the stack pointer is moved by 16 bytes here and restored before ret,
// but nothing in this function is stored on the stack.
add sp, sp, -16
lw t0, 0(a0) // i_data
lw t1, 0(a1) // f_data
lw t2, 8(a0) // step_y
lw t4, 12(a0) // stride_x
mul t2, t4, t2 // i_step = stride_x * step_y, in samples
slli t2, t2, 1 // i_step in bytes (2 bytes per sample)
lw t3, 8(a1) // step_y
lw t5, 12(a1) // stride_x
mul t3, t5, t3 // f_step = stride_x * step_y, in samples
slli t3, t3, 1 // f_step in bytes (2 bytes per sample)
srli t6, a3, 3 // t6 = count_x / 8
// Rounding term 1 << (shift - 1), placed in the low word of the cleared accumulator.
addi a6, a5, -1
li t4, 1
sll t4, t4, a6
esp.zero.xacc
esp.movx.w.xacc.l t4
// Row loop. It is bottom-tested, so one row is processed even when count_y <= 0.
.loop_count_y:
mv t4, t0
mv t5, t1
esp.vld.128.ip q0, t4, 16 // q0 - i_data
// Hardware loop executing the two instructions up to and including .loop_count_x, t6 times.
// NOTE(review): behaviour for t6 == 0 (count_x == 0) is not visible here - confirm.
// NOTE(review): 128-bit loads presumably require 16-byte aligned data; no alignment check is made.
esp.lp.setup 0, t6, .loop_count_x
esp.vld.128.ip q1, t5, 16 // q1 - f_data
.loop_count_x: esp.vmulas.u16.xacc.ld.ip q0, t4, 16, q0, q1 // q0 - i_data
add t0, t0, t2 // next image row
add t1, t1, t3 // next filter row
add a4,a4, -1
bgtz a4, .loop_count_y
esp.srs.u.xacc t5, a5 // shift accx register by final_shift amount (a5), save the lower 32bits to t5
sh t5, 0(a2) // store the low 16 bits of the result to *out_value
li a0,0 // return 0
add sp,sp,16
ret
#endif // dspi_dotprod_arp4_enabled

View File

@@ -0,0 +1,367 @@
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_aes3_enabled == 1)
.text
.align 4
.literal .LC0_1_52, 458755
.type dspi_dotprod_u8_aes3, @function
.align 4
.global dspi_dotprod_u8_aes3
// esp_err_t dspi_dotprod_u8_aes3(image2d_t *in_image, image2d_t *filter, uint8_t *out_value,
//                                int count_x, int count_y, int shift)
// Xtensa windowed ABI (entry / retw, call8).
// On entry: a2 = in_image, a3 = filter, a4 = out_value, a5 = count_x, a6 = count_y, a7 = shift.
// image2d_t field offsets used: 0 = data, 4 = step_x, 8 = step_y, 12 = stride_x, 16 = stride_y.
// Returns 0 on success, or the literal .LC0_1_52 when one of the four range checks fails
// (the same checks for which dspi_dotprod_u8_ansi returns ESP_ERR_DSP_PARAM_OUTOFRANGE).
//
// The vector path is taken only when: filter->data has bit 0 clear,
// filter->stride_x == filter->step_x * count_x, all four step_x/step_y values are 1,
// count_x is a multiple of 16, count_y >= 4, and count_x is either <= 64 or a multiple of 64.
// Everything else is forwarded to dspi_dotprod_u8_ansi.
//
// Changes relative to the previous revision:
//  * the "count_x > 64" guard tested bit 0 of count_x, which is always 0 after the "& 15" test,
//    so count_x = 80/96/112 reached the final store with an empty accumulator. The guard now
//    tests count_x & 63 and sends those sizes to the ANSI implementation;
//  * the generic (count_x > 128) path derived its per-row trip count from count_x / 32 although
//    every iteration consumes four 16-byte vectors (64 samples), so both data pointers advanced
//    by twice the row length. The trip count is now count_x / 64.
//
// NOTE(review): the count_x == 32 loop runs count_y >> 1 times (two rows per iteration) and the
// count_x == 16 loop runs count_y >> 2 times (four rows per iteration); remaining rows of an
// odd / non-multiple-of-4 count_y are not processed. Confirm callers guarantee suitable count_y.
dspi_dotprod_u8_aes3: # 0x4
.LBB1_dspi_dotprod_u8_aes3: # 0x4
entry a1,48 #
// Range checks, mirroring the ANSI implementation: step * count must not exceed stride.
l32i.n a10,a2,4 # [0] id:669
l32i.n a11,a2,12 # [1] id:668
mull a8,a10,a5 # [2]
blt a11,a8,.LBB78_dspi_dotprod_u8_aes3 # [4]
l32i.n a12,a2,8 # [0] id:670
l32i.n a9,a2,16 # [1] id:671
mull a13,a12,a6 # [2]
blt a9,a13,.LBB78_dspi_dotprod_u8_aes3 # [4]
l32i.n a15,a3,4 # [0] id:673
l32i.n a14,a3,12 # [1] id:672
mull a13,a15,a5 # [2]
blt a14,a13,.LBB78_dspi_dotprod_u8_aes3 # [4]
l32i.n a8,a3,16 # [0] id:675
l32i.n a9,a3,8 # [1] id:674
s32i.n a9,a1,8 # [2] gra_spill_temp_2
mull a9,a9,a6 # [3]
blt a8,a9,.LBB78_dspi_dotprod_u8_aes3 # [5]
// Filter-side eligibility: data pointer bit 0 clear, rows contiguous, unit steps.
l32i.n a8,a3,0 # [0] id:676
s32i.n a8,a1,4 # [1] gra_spill_temp_1
bbsi a8,0,.Lt_0_33026 # [2]
bne a14,a13,.Lt_0_33026 # [0]
bnei a15,1,.Lt_0_33026 # [0]
l32i.n a13,a1,8 # [0] gra_spill_temp_2
beqi a13,1,.Lt_0_17666 # [2]
// Fallback 1: forward the six arguments unchanged to the ANSI implementation.
.Lt_0_33026: # 0x43
.Lt_0_17922: # 0x43
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
.type dspi_dotprod_u8_ansi, @function
call8 dspi_dotprod_u8_ansi # [6] dspi_dotprod_u8_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// Range-check failure: return the error literal.
.LBB78_dspi_dotprod_u8_aes3: # 0x56
l32r a2,.LC0_1_52 # [0]
retw.n # [1]
// Image-side eligibility (a10 = in_image->step_x, a12 = in_image->step_y) and size checks.
.Lt_0_17666: # 0x5b
addi.n a14,a10,-1 # [0]
bnez a14,.Lt_0_33794 # [1]
addi.n a15,a12,-1 # [0]
bnez a15,.Lt_0_33794 # [1]
extui a8,a5,0,4 # [0]
bnez.n a8,.Lt_0_33794 # [1]
blti a6,4,.Lt_0_33794 # [0]
movi.n a9,64 # [0]
bge a9,a5,.Lt_0_34306 # [1]
// count_x > 64: only multiples of 64 are handled by the vector code below.
extui a10,a5,0,6 # [0] count_x & 63
bnez a10,.LBB28_dspi_dotprod_u8_aes3 # [1]
// Vector path set-up. From here a2 = in_image->data, a3 = count_y, a15 = filter->data,
// and the stack slot at a1+0 holds the image row advance in bytes
// (in_image->stride_x * in_image->step_y).
.Lt_0_34306: # 0x78
.Lt_0_19714: # 0x78
mov.n a3,a6 # [0]
addi a13,a5,-48 # [1]
movi.n a14,0 # [2]
mull a15,a11,a12 # [3]
l32i.n a2,a2,0 # [4] id:677
s32i.n a15,a1,0 # [6] gra_spill_temp_0
wur.accx_0 a14 # [7]
l32i.n a15,a1,4 # [8] gra_spill_temp_1
wur.accx_1 a14 # [9]
ee.vld.128.ip q0,a15,16 # [10] id:680
// Dispatch on count_x: 48, 32, 16, 64, 128 each have a dedicated loop; the loops for
// 48/32/16/64 jump back into this chain, which ends at the final rounding/store.
beqz a13,.LBB32_dspi_dotprod_u8_aes3 # [11]
.Lt_0_22786: # 0x93
.Lt_0_22274: # 0x93
addi a8,a5,-32 # [0]
beqz a8,.LBB38_dspi_dotprod_u8_aes3 # [1]
.Lt_0_24322: # 0x99
.Lt_0_23810: # 0x99
addi a9,a5,-16 # [0]
beqz a9,.LBB44_dspi_dotprod_u8_aes3 # [1]
.Lt_0_25858: # 0x9f
.Lt_0_25346: # 0x9f
addi a10,a5,-64 # [0]
beqz a10,.LBB50_dspi_dotprod_u8_aes3 # [1]
.Lt_0_27394: # 0xa5
.Lt_0_26882: # 0xa5
addi a11,a5,-128 # [0]
beqz a11,.LBB56_dspi_dotprod_u8_aes3 # [1]
movi a12,128 # [0]
bge a12,a5,.Lt_0_30210 # [1]
// Generic path, count_x > 128 (a multiple of 64): a12 counts rows up to a3 (count_y),
// a13 = count_x / 64 - 1 inner iterations of 64 samples, followed by a 64-sample row tail.
movi.n a12,0 # [0]
ee.ld.128.usar.ip q1,a2,16 # [1] id:752
ee.ld.128.usar.ip q2,a2,16 # [2] id:753
ee.src.q.ld.ip q3,a2,16,q1,q2 # [4] id:754
beqz.n a3,.Lt_0_30210 # [5]
l32i.n a14,a1,0 # [0] gra_spill_temp_0
addi a13,a5,63 # [1]
movgez a13,a5,a5 # [2]
srai a13,a13,6 # [3] count_x / 64
sub a14,a14,a5 # [4]
addi a14,a14,16 # [5]
addi.n a13,a13,-1 # [6]
.Lt_0_30978: # 0xd1
addi.n a12,a12,1 # [0]
movi.n a8,32 # [1]
movi.n a9,-16 # [2]
beqz.n a13,.Lt_0_31234 # [3]
loopnez a13,.LBB218_dspi_dotprod_u8_aes3 # [0]
.LBB216_dspi_dotprod_u8_aes3: # 0xdc
ee.vld.128.ip q5,a15,16 # [0*II+0] id:756
ee.vmulas.u8.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:755
ee.vld.128.ip q0,a15,16 # [0*II+2] id:758
ee.vmulas.u8.accx.ld.ip.qup q1,a2,16,q5,q2,q3,q4 # [0*II+3] id:757
ee.vld.128.ip q5,a15,16 # [0*II+4] id:760
ee.vmulas.u8.accx.ld.ip.qup q2,a2,16,q0,q3,q4,q1 # [0*II+5] id:759
ee.vld.128.ip q0,a15,16 # [0*II+6] id:762
ee.vmulas.u8.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+7] id:761
.LBB218_dspi_dotprod_u8_aes3: # 0xf8
.Lt_0_31234: # 0xf8
ee.vmulas.u8.accx.ld.ip.qup q5,a2,16,q0,q1,q2,q3 # [0] id:763
ee.vld.128.ip q0,a15,16 # [1] id:764
ee.vld.128.ip q6,a15,16 # [2] id:766
ee.vmulas.u8.accx.ld.xp.qup q7,a2,a14,q0,q2,q3,q5 # [3] id:765
ee.vld.128.ip q4,a15,16 # [4] id:769
ee.vmulas.u8.accx.ld.xp.qup q2,a2,a9,q6,q3,q5,q7 # [5] id:767
ee.ld.128.usar.xp q1,a2,a8 # [6] id:768
ee.vld.128.ip q0,a15,16 # [7] id:771
ee.vmulas.u8.accx.ld.ip.qup q3,a2,16,q4,q5,q1,q2 # [8] id:770
bne a12,a3,.Lt_0_30978 # [9]
// Finalization: result = (accx_0 + (1 << (shift - 1))) >> shift (logical), low 8 bits stored.
.Lt_0_30210: # 0x11a
.Lt_0_29954: # 0x11a
movi.n a2,0 # [0]
rur.accx_0 a10 # [1]
addi.n a12,a7,-1 # [2]
movi.n a11,1 # [3]
ssl a12 # [4]
sll a11,a11 # [5]
ssr a7 # [6]
add.n a10,a10,a11 # [7]
srl a10,a10 # [8]
s8i a10,a4,0 # [9] id:773
retw.n # [10]
// Fallback 2: image-side steps / count_x / count_y not eligible for the vector path.
.Lt_0_33794: # 0x136
.Lt_0_18946: # 0x136
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
call8 dspi_dotprod_u8_ansi # [6] dspi_dotprod_u8_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// count_x == 48: three vectors per row, one row per iteration, count_y (a6) iterations.
.LBB32_dspi_dotprod_u8_aes3: # 0x149
ee.ld.128.usar.ip q1,a2,16 # [0] id:681
ee.ld.128.usar.ip q2,a2,16 # [1] id:682
ee.src.q.ld.ip q3,a2,16,q1,q2 # [3] id:683
beqz.n a6,.Lt_0_22786 # [4]
movi.n a10,32 # [0]
l32i.n a12,a1,0 # [1] gra_spill_temp_0
movi.n a11,-16 # [2]
addi a12,a12,-32 # [3]
loopgtz a6,.LBB104_dspi_dotprod_u8_aes3 # [4]
.LBB102_dspi_dotprod_u8_aes3: # 0x160
ee.vld.128.ip q4,a15,16 # [0*II+0] id:685
ee.vmulas.u8.accx.ld.xp.qup q1,a2,a12,q0,q1,q2,q3 # [0*II+1] id:684
ee.vld.128.ip q5,a15,16 # [0*II+2] id:687
ee.vmulas.u8.accx.ld.xp.qup q2,a2,a11,q4,q2,q3,q1 # [0*II+3] id:686
ee.ld.128.usar.xp q1,a2,a10 # [0*II+4] id:688
ee.vld.128.ip q0,a15,16 # [0*II+5] id:690
ee.vmulas.u8.accx.ld.ip.qup q3,a2,16,q5,q3,q1,q2 # [0*II+6] id:689
.LBB104_dspi_dotprod_u8_aes3: # 0x178
j .Lt_0_22786 # [0]
// count_x == 32: two vectors per row, two rows per iteration, count_y >> 1 iterations.
.LBB38_dspi_dotprod_u8_aes3: # 0x17b
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
srli a3,a6,1 # [2]
l32i.n a12,a1,0 # [3] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [4] id:691
ee.ld.128.usar.ip q2,a2,16 # [5] id:692
addi a12,a12,-16 # [7]
ee.src.q.ld.xp q3,a2,a12,q1,q2 # [8] id:693
loopnez a3,.LBB127_dspi_dotprod_u8_aes3 # [9]
.LBB125_dspi_dotprod_u8_aes3: # 0x193
ee.vld.128.ip q4,a15,16 # [0*II+0] id:695
ee.vmulas.u8.accx.ld.xp.qup q3,a2,a11,q0,q1,q2,q3 # [0*II+1] id:694
ee.ld.128.usar.xp q1,a2,a10 # [0*II+2] id:696
ee.vld.128.ip q0,a15,16 # [0*II+3] id:698
ee.vmulas.u8.accx.ld.xp.qup q4,a2,a12,q4,q2,q1,q3 # [0*II+4] id:697
ee.vld.128.ip q5,a15,16 # [0*II+5] id:700
ee.vmulas.u8.accx.ld.xp.qup q2,a2,a11,q0,q1,q3,q4 # [0*II+6] id:699
ee.ld.128.usar.xp q1,a2,a10 # [0*II+7] id:701
ee.vld.128.ip q0,a15,16 # [0*II+8] id:703
ee.vmulas.u8.accx.ld.xp.qup q3,a2,a12,q5,q3,q1,q2 # [0*II+9] id:702
.LBB127_dspi_dotprod_u8_aes3: # 0x1b5
j .Lt_0_24322 # [0]
// count_x == 16: one vector per row, four rows per iteration, count_y >> 2 iterations.
.LBB44_dspi_dotprod_u8_aes3: # 0x1b8
srli a3,a3,2 # [0]
movi.n a10,-16 # [1]
l32i.n a11,a1,0 # [2] gra_spill_temp_0
addi a8,a2,16 # [3]
addi a11,a11,16 # [4]
ee.ld.128.usar.xp q2,a8,a10 # [5] id:704
ee.ld.128.usar.xp q1,a8,a11 # [6] id:705
ee.src.q.ld.xp q3,a8,a10,q1,q2 # [8] id:706
ee.ld.128.usar.xp q2,a8,a11 # [9] id:707
loopnez a3,.LBB150_dspi_dotprod_u8_aes3 # [10]
.LBB148_dspi_dotprod_u8_aes3: # 0x1d4
ee.vld.128.ip q4,a15,16 # [0*II+0] id:709
ee.vmulas.u8.accx.ld.xp.qup q3,a8,a10,q0,q1,q2,q3 # [0*II+1] id:708
ee.ld.128.usar.xp q1,a8,a11 # [0*II+2] id:710
ee.vld.128.ip q0,a15,16 # [0*II+3] id:712
ee.vmulas.u8.accx.ld.xp.qup q4,a8,a10,q4,q2,q1,q3 # [0*II+4] id:711
ee.ld.128.usar.xp q3,a8,a11 # [0*II+5] id:713
ee.vld.128.ip q5,a15,16 # [0*II+6] id:715
ee.vmulas.u8.accx.ld.xp.qup q4,a8,a10,q0,q1,q3,q4 # [0*II+7] id:714
ee.ld.128.usar.xp q1,a8,a11 # [0*II+8] id:716
ee.vld.128.ip q0,a15,16 # [0*II+9] id:718
ee.vmulas.u8.accx.ld.xp.qup q3,a8,a10,q5,q3,q1,q4 # [0*II+10] id:717
ee.ld.128.usar.xp q2,a8,a11 # [0*II+11] id:719
.LBB150_dspi_dotprod_u8_aes3: # 0x1fc
mov.n a2,a8 # [0]
j .Lt_0_25858 # [1]
// count_x == 64: four vectors per row, one row per iteration, a3 (count_y) iterations.
.LBB50_dspi_dotprod_u8_aes3: # 0x201
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
l32i.n a12,a1,0 # [2] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [3] id:720
ee.ld.128.usar.ip q2,a2,16 # [4] id:721
sub a12,a12,a5 # [5]
ee.src.q.ld.ip q3,a2,16,q1,q2 # [7] id:722
addi a12,a12,16 # [8]
loopnez a3,.LBB173_dspi_dotprod_u8_aes3 # [9]
.LBB171_dspi_dotprod_u8_aes3: # 0x219
ee.vld.128.ip q5,a15,16 # [0*II+0] id:724
ee.vmulas.u8.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:723
ee.vld.128.ip q1,a15,16 # [0*II+2] id:726
ee.vmulas.u8.accx.ld.xp.qup q0,a2,a12,q5,q2,q3,q4 # [0*II+3] id:725
ee.vld.128.ip q5,a15,16 # [0*II+4] id:729
ee.vmulas.u8.accx.ld.xp.qup q2,a2,a11,q1,q3,q4,q0 # [0*II+5] id:727
ee.ld.128.usar.xp q1,a2,a10 # [0*II+6] id:728
ee.vld.128.ip q0,a15,16 # [0*II+7] id:731
ee.vmulas.u8.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+8] id:730
.LBB173_dspi_dotprod_u8_aes3: # 0x238
j .Lt_0_27394 # [0]
// count_x == 128: eight vectors per row, one row per iteration, a3 (count_y) iterations.
// This path carries its own copy of the finalization sequence.
.LBB56_dspi_dotprod_u8_aes3: # 0x23b
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
l32i.n a12,a1,0 # [2] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [3] id:732
ee.ld.128.usar.ip q2,a2,16 # [4] id:733
sub a12,a12,a5 # [6]
addi a12,a12,16 # [7]
ee.src.q.ld.ip q3,a2,16,q1,q2 # [8] id:734
loopnez a3,.LBB195_dspi_dotprod_u8_aes3 # [9]
.LBB193_dspi_dotprod_u8_aes3: # 0x253
ee.vld.128.ip q4,a15,16 # [0*II+0] id:736
ee.vmulas.u8.accx.ld.ip.qup q1,a2,16,q0,q1,q2,q3 # [0*II+1] id:735
ee.vld.128.ip q0,a15,16 # [0*II+2] id:738
ee.vmulas.u8.accx.ld.ip.qup q4,a2,16,q4,q2,q3,q1 # [0*II+3] id:737
ee.vld.128.ip q5,a15,16 # [0*II+4] id:740
ee.vmulas.u8.accx.ld.ip.qup q0,a2,16,q0,q3,q1,q4 # [0*II+5] id:739
ee.vld.128.ip q6,a15,16 # [0*II+6] id:742
ee.vmulas.u8.accx.ld.ip.qup q1,a2,16,q5,q1,q4,q0 # [0*II+7] id:741
ee.vld.128.ip q5,a15,16 # [0*II+8] id:744
ee.vmulas.u8.accx.ld.ip.qup q4,a2,16,q6,q4,q0,q1 # [0*II+9] id:743
ee.vld.128.ip q6,a15,16 # [0*II+10] id:746
ee.vmulas.u8.accx.ld.xp.qup q0,a2,a12,q5,q0,q1,q4 # [0*II+11] id:745
ee.vld.128.ip q5,a15,16 # [0*II+12] id:749
ee.vmulas.u8.accx.ld.xp.qup q2,a2,a11,q6,q1,q4,q0 # [0*II+13] id:747
ee.ld.128.usar.xp q1,a2,a10 # [0*II+14] id:748
ee.vld.128.ip q0,a15,16 # [0*II+15] id:751
ee.vmulas.u8.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+16] id:750
.LBB195_dspi_dotprod_u8_aes3: # 0x28e
movi.n a2,0 # [0]
movi.n a11,1 # [1]
addi.n a12,a7,-1 # [2]
rur.accx_0 a10 # [3]
ssl a12 # [4]
sll a11,a11 # [5]
ssr a7 # [6]
add.n a10,a10,a11 # [7]
srl a10,a10 # [8]
s8i a10,a4,0 # [9] id:773
retw.n # [10]
// Fallback 3: count_x > 64 but not a multiple of 64 (a2..a7 still hold the original arguments).
.LBB28_dspi_dotprod_u8_aes3: # 0x2aa
mov.n a15,a7 # [0]
mov.n a14,a6 # [1]
mov.n a13,a5 # [2]
mov.n a12,a4 # [3]
mov.n a11,a3 # [4]
mov.n a10,a2 # [5]
call8 dspi_dotprod_u8_ansi # [6] dspi_dotprod_u8_ansi
mov.n a2,a10 # [0]
retw.n # [1]
#endif // dspi_dotprod_aes3_enabled

View File

@@ -0,0 +1,49 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod.h"
/**
 * Reference (portable C) 2-D dot product of two unsigned 8-bit images.
 *
 * Multiplies count_x * count_y samples of in_image with the corresponding
 * samples of filter, accumulates the products in 32 bits, rounds, shifts the
 * sum right by `shift` and stores the low 8 bits in *out_value.
 *
 * @param in_image  input image descriptor (data, step_x/step_y, stride_x/stride_y)
 * @param filter    filter image descriptor, same layout
 * @param out_value destination for the 8-bit result
 * @param count_x   number of samples per row
 * @param count_y   number of rows
 * @param shift     right shift applied to the accumulated sum; values <= 0
 *                  leave the sum unshifted and unrounded. The accumulator is
 *                  32 bits wide, so shifts of 32 or more are not meaningful.
 * @return ESP_OK, or ESP_ERR_DSP_PARAM_OUTOFRANGE when step * count exceeds
 *         the corresponding stride of either image.
 */
esp_err_t dspi_dotprod_u8_ansi(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift)
{
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    uint8_t *i_data = (uint8_t *)in_image->data;
    uint8_t *f_data = (uint8_t *)filter->data;
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;
    int32_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            acc += (int16_t)i_data[in_image->step_x * x] * (int16_t)f_data[filter->step_x * x];
        }
        i_data += i_step;
        f_data += f_step;
    }
    if (shift > 0) {
        // Round to nearest. Guarded so that shift == 0 no longer evaluates
        // 1 << -1 (undefined behaviour); with no shift there is nothing to round.
        acc += 1 << (shift - 1);
        acc >>= shift;
    }
    *out_value = acc;
    return ESP_OK;
}

View File

@@ -0,0 +1,93 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_arp4_enabled == 1)
#include "dsp_err_codes.h"
.text
.align 4
.global dspi_dotprod_u8_arp4
.global dspi_dotprod_u8_ansi
.type dspi_dotprod_u8_arp4,@function
// esp_err_t dspi_dotprod_u8_arp4(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift);
dspi_dotprod_u8_arp4:
// 2-D dot product of unsigned 8-bit image and filter data.
// The fast path below is used only when the OR of both step_x fields equals 1
// and count_x is a multiple of 16; every other case tail-jumps to
// dspi_dotprod_u8_ansi with the argument registers untouched.
// image2d_t field offsets used here: 0 = data, 4 = step_x, 8 = step_y, 12 = stride_x.
// in_image - a0
// filter - a1
// out_value - a2
// count_x - a3
// count_y - a4
// shift - a5
// i_data - t0
// f_data - t1
// i_step - t2
// f_step - t3
// t4 - current i_data
// t5 - current f_data
// t6 - number of 16-byte vectors per row (count_x / 16)
lw t1, 4(a0) // load in_image->step_x
lw t2, 4(a1) // load filter->step_x
or t1, t1, t2
addi t1, t1, -1 // 0 only when (in_image->step_x | filter->step_x) == 1
andi t2, a3, 15 // count_x % 16
or t1, t1, t2
beqz t1, .dspi_dotprod_u8_arp4_body
j dspi_dotprod_u8_ansi // tail call: the ANSI version returns directly to our caller
.dspi_dotprod_u8_arp4_body:
// NOTE(review): the stack pointer is moved by 16 bytes here and restored before ret,
// but nothing in this function is stored on the stack.
add sp, sp, -16
lw t0, 0(a0) // i_data
lw t1, 0(a1) // f_data
lw t2, 8(a0) // step_y
lw t4, 12(a0) // stride_x
mul t2, t4, t2 // i_step = stride_x * step_y (bytes per row advance)
lw t3, 8(a1) // step_y
lw t5, 12(a1) // stride_x
mul t3, t5, t3 // f_step = stride_x * step_y
srli t6, a3, 4 // t6 = count_x / 16
// Rounding term 1 << (shift - 1), placed in the low word of the cleared accumulator.
addi a6, a5, -1
li t4, 1
sll t4, t4, a6
esp.zero.xacc
esp.movx.w.xacc.l t4
// Row loop. It is bottom-tested, so one row is processed even when count_y <= 0.
.loop_count_y:
mv t4, t0
mv t5, t1
esp.vld.128.ip q0, t4, 16 // q0 - i_data
// Hardware loop executing the two instructions up to and including .loop_count_x, t6 times.
// NOTE(review): behaviour for t6 == 0 (count_x == 0) is not visible here - confirm.
// NOTE(review): 128-bit loads presumably require 16-byte aligned data; no alignment check is made.
esp.lp.setup 0, t6, .loop_count_x
esp.vld.128.ip q1, t5, 16 // q1 - f_data
.loop_count_x: esp.vmulas.u8.xacc.ld.ip q0, t4, 16, q0, q1 // q0 - i_data
add t0, t0, t2 // next image row
add t1, t1, t3 // next filter row
add a4,a4, -1
bgtz a4, .loop_count_y
esp.srs.u.xacc t5, a5 // shift accx register by final_shift amount (a5), save the lower 32bits to t5
// out_value is a uint8_t *, so exactly one byte is written (as dspi_dotprod_u8_ansi does).
// The previous halfword store (sh) also overwrote the byte following *out_value.
sb t5, 0(a2) // store the low 8 bits of the result to *out_value
li a0,0 // return 0
add sp,sp,16
ret
#endif // dspi_dotprod_arp4_enabled

View File

@@ -0,0 +1,80 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dsps_dotprod_platform.h"
#if (dsps_dotprod_s16_ae32_enabled == 1)
#include "dsps_dotprod_s16_m_ae32.S"
#include "dsp_err_codes.h"
.text
.align 4
.global dsps_dotprod_s16_ae32
.type dsps_dotprod_s16_ae32,@function
//esp_err_t dsps_dotprod_s16_ae32(const int16_t* src1, const int16_t* src2, int16_t* dest, int len, int8_t shift);
dsps_dotprod_s16_ae32:
// Dot product of two int16 arrays using the MAC16 accumulator (acchi:acclo).
// Xtensa windowed ABI (entry / retw). Returns 0 in a2 on success, or
// ESP_ERR_DSP_INVALID_LENGTH when len < 4. The 16-bit result is stored to *dest.
// src1 - a2
// src2 - a3
// dest - a4
// len - a5
// shift - a6
entry a1, 16
// Check minimum length
movi a8, 4
blt a5, a8, dsps_dotprod_s16_ae32_error
// Clear accumulator
movi a8, 0
wsr a8, acchi
// Prepare and load round value
// (0x7fff >> shift, the same rounding seed the ANSI version starts its accumulator with)
movi a8, 0x7fff
ssr a6
srl a8, a8
wsr a8, acclo // initialize acc with shifted round value
// Compensate for pre-increment
// Right shift to 16 bits
// RS = -shift + 15
// NOTE(review): RS is only ever used as a right-shift amount below; the ANSI version
// shifts left when shift > 15. Confirm the intended range of shift.
neg a6, a6
addi a6, a6, 15
/* number of loop iterations (see below):
* a7 = count / 4 - 1
*/
srli a7, a5, 2
addi a7, a7, -1
movi.n a10, 0 // load 0 to the a10 to increment second array
// Multiply-accumulate macro from dsps_dotprod_s16_m_ae32.S; its body is not visible here.
// NOTE(review): the iteration count is derived from len / 4 - confirm how the macro
// handles a len that is not a multiple of 4.
dotprod_s16_ae32_full a2, a3, a7, a5
/* Get accumulator */
// Funnel-shift acchi:acclo right by RS and keep the low 32 bits; the low 16 are stored.
ssr a6
rsr a2, acchi
rsr a3, acclo
src a2, a2, a3
s16i a2, a4, 0
movi.n a2, 0 // return 0
retw.n
// len < 4: nothing is written to *dest.
dsps_dotprod_s16_ae32_error:
movi.n a2, ESP_ERR_DSP_INVALID_LENGTH
retw.n
#endif // dsps_dotprod_s16_ae32_enabled

View File

@@ -0,0 +1,33 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dsps_dotprod.h"
/*
 * Reference (portable C) fixed-point dot product of two int16 arrays.
 *
 * src1, src2 - input arrays, len elements each
 * dest       - receives the scaled result, truncated to int16_t
 * len        - number of elements to multiply-accumulate
 * shift      - scaling control: the 64-bit sum is shifted right by
 *              (15 - shift), or left by (shift - 15) when shift > 15
 *
 * Always returns ESP_OK.
 */
esp_err_t dsps_dotprod_s16_ansi(const int16_t *src1, const int16_t *src2, int16_t *dest, int len, int8_t shift)
{
    /* Seed the sum with the rounding term, pre-scaled by shift so that it
     * lines up with the final shift applied below. */
    long long sum = 0x7fff >> shift;

    int idx = 0;
    while (idx < len) {
        sum += (int32_t)src1[idx] * (int32_t)src2[idx];
        ++idx;
    }

    /* Positive value means scale up, otherwise scale down by the negated amount. */
    const int scale = shift - 15;
    *dest = (scale > 0) ? (sum << scale) : (sum >> (-scale));

    return ESP_OK;
}

View File

@@ -0,0 +1,74 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dsps_dotprod_platform.h"
#if (dsps_dotprod_s16_arp4_enabled == 1)
#include "dsp_err_codes.h"
.text
.align 4
.global dsps_dotprod_s16_arp4
.global dsps_dotprod_s16_ansi
.type dsps_dotprod_s16_arp4,@function
//esp_err_t dsps_dotprod_s16_arp4(const int16_t* src1, const int16_t* src2, int16_t* dest, int len, int8_t shift);
//
// Fixed-point dot product of two int16 arrays, 8 samples (128 bits) per step.
// The accumulator (xacc) is seeded with the rounding term 0x7fff >> shift,
// the products are accumulated, and the sum is shifted right by (15 - shift)
// before its low 16 bits are stored to *dest.
//
// Lengths that are not a positive multiple of 8 are delegated to
// dsps_dotprod_s16_ansi via a tail jump with the arguments untouched.
//
// Returns (in a0): 0 on the vector path; on the fallback path whatever
// dsps_dotprod_s16_ansi returns.
// Overwrites a5, t3, t4, t5, t6, q0, q1, xacc, and sets bit 1 of the cfg
// register (it is not restored on exit).
dsps_dotprod_s16_arp4:
// src1 - a0
// src2 - a1
// dest - a2
// len - a3
// shift - a4
andi a5, a3, 7 // a5 = len % 8
bnez a5, .dsps_dotprod_s16_arp4_fallback // tail samples present: use ANSI version
// The main loop below always executes at least once and derives its trip
// count with a logical shift, so len <= 0 must not reach it.
bgtz a3, .dsps_dotprod_s16_arp4_body
.dsps_dotprod_s16_arp4_fallback:
j dsps_dotprod_s16_ansi
.dsps_dotprod_s16_arp4_body:
add sp,sp,-16
// Enable unaligned data access (set bit 1 of the cfg register)
esp.movx.r.cfg t6
or t6, t6, 2
esp.movx.w.cfg t6
// t6 = 15 - shift, the final right-shift applied to the accumulator
add t6, a4, -15
neg t6, t6 // t6 - real_shift
// Seed the accumulator with the rounding value 0x7fff >> shift
li t3, 0x7fff
srl t3, t3, a4
esp.zero.xacc
esp.movx.w.xacc.l t3
mv t3, a0 // t3 - running src1 pointer
mv t4, a1 // t4 - running src2 pointer
esp.vld.128.ip q0, t3, 16 // q0 - first 8 samples of src1
srli t5, a3, 3 // t5 = len>>3, number of 8-sample steps (>= 1 here)
// Hardware-loop variant, kept for reference (currently disabled):
// esp.lp.setup 0, t5, .main_loop
// esp.vld.128.ip q1, t4, 16 // q1 - src2
// .main_loop: esp.vmulas.s16.xacc.ld.ip q0, t3, 16, q0, q1 // q0 - src1
.main_loop:
esp.vld.128.ip q1, t4, 16 // q1 - next 8 samples of src2
// Accumulate q0*q1 and fetch the following src1 chunk into q0.
// NOTE(review): on the last step this fused load appears to read 16 bytes
// past the end of src1 (the value is never used) - confirm this is acceptable.
esp.vmulas.s16.xacc.ld.ip q0, t3, 16, q0, q1 // q0 - next 8 samples of src1
add t5, t5, -1
bgtz t5, .main_loop
esp.srs.s.xacc t5, t6 // shift xacc right by real_shift (t6), save the lower 32 bits to t5
sh t5, 0(a2) // store the low 16 bits of the result to *dest
li a0,0 // return 0 (success)
add sp,sp,16
ret
#endif // dsps_dotprod_s16_arp4_enabled

View File

@@ -0,0 +1,104 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
.macro dotprod_s16_ae32 x1, x2, count
// This macro calculates fixed point dot product for ((count + 1)*4) int16 samples
// x1 - input array1 register (for example a2)
// x2 - input array2 register (for example a3)
// count - counter register (for example a7)
// count - samples_count / 4 - 1
// acc += x1[i + 0]*x2[i + 0] + x1[i + 1]*x2[i + 1] + x1[i + 2]*x2[i + 2] + x1[i + 3]*x2[i + 3]; i: 0..count
// acchi and acclo have to be initialized before the macro is used
// Result - acchi || acclo
// Modifies:
// m0, m1, m2, m3
// x1, x2 - the pointer registers are biased by -4 and then advanced by every ldinc
// acchi || acclo - must be loaded before (for example 0x3fff to acclo).
//
// NOTE: the labels .loop and .loop_end are fixed names, so the macro can be
// expanded only once per assembly unit.
/*
* Data schedule. Each line represents instruction, columns represent
* register contents. Last column (MUL) shows the multiplication which
* takes place. Values loaded in the given cycle are shown in square brackets.
*
* m0 m1 m2 m3 MUL
* --------- pre-load ------------
*[x0 x1] (no MULs in the first 3 instructions)
* x0 x1 [y0 y1]
* x0 x1 [x2 x3] y0 y1
* x0 x1 x2 x3 y0 y1 [y2 y3] x0*y0
* ---------- loop -------------- (the following 4 instructions are
*[x4 x5] x2 x3 y0 y1 y2 y3 x1*y1 repeated as much as needed)
* x4 x5 x2 x3 [y4 y5] y2 y3 x2*y2
* x4 x5 [x6 x7] y4 y5 y2 y3 x3*y3
* x4 x5 x6 x7 y4 y5 [y6 y7] x4*y4
* --------- finalize ------------
* x4 x5 x6 x7 y4 y5 y6 y7 x5*y5 (nothing is loaded)
* x4 x5 x6 x7 y4 y5 y6 y7 x6*y6
* x4 x5 x6 x7 y4 y5 y6 y7 x7*y7
*/
// ldinc advances the pointer before loading, so start both pointers one
// 32-bit word early to make the first load hit element 0.
addi \x1, \x1, -4 // To arrange first pointer
addi \x2, \x2, -4 // To arrange first pointer
//lddec m0, \x1
//lddec m2, \x2 // To arrange first pointer
// Pre-load: fill m0, m2, m1 and issue the first multiply while loading m3
ldinc m0, \x1
ldinc m2, \x2
ldinc m1, \x1
mula.dd.ll.ldinc m3, \x2, m0, m2
// Main loop: executed count times, skipped entirely when count is zero.
// Each pass performs 4 multiply-accumulates and 4 loads.
loopnez \count, .loop_end
.loop:
mula.dd.hh.ldinc m0, \x1, m0, m2
mula.dd.ll.ldinc m2, \x2, m1, m3
mula.dd.hh.ldinc m1, \x1, m1, m3
mula.dd.ll.ldinc m3, \x2, m0, m2
.loop_end:
// Finalize: drain the three products still pending in m0..m3, no more loads
mula.dd.hh m0, m2
mula.dd.ll m1, m3
mula.dd.hh m1, m3
.endm // dotprod_s16_ae32
.macro dotprod_s16_ae32_full x1, x2, count, full_count
// This macro calculates fixed point dot product for full_count int16 samples:
// the first ((count + 1)*4) samples are processed by dotprod_s16_ae32, then
// bit 1 and bit 0 of full_count select an extra 2-sample and 1-sample step.
// x1 - input array1 register (for example a2)
// x2 - input array2 register (for example a3)
// count - counter register (for example a7)
// count - samples_count / 4 - 1
// full_count - samples_count
// acc += x1[i + 0]*x2[i + 0] + x1[i + 1]*x2[i + 1] + x1[i + 2]*x2[i + 2] + x1[i + 3]*x2[i + 3]; i: 0..count
// acchi and acclo have to be initialized before the macro is used
// Result - acchi || acclo
// Modifies:
// m0, m1, m2, m3
// x1, x2 - advanced by the loads
// acchi || acclo - must be loaded before (for example 0x3fff to acclo).
//
// NOTE: the labels .mod2chk and .mod1chk are fixed names, so the macro can be
// expanded only once per assembly unit.
dotprod_s16_ae32 \x1, \x2, \count
// Bit 1 of full_count set: two more samples remain, load one 32-bit word
// from each array and accumulate both halves.
bbci \full_count, 1, .mod2chk
ldinc m0, \x1
ldinc m2, \x2
mula.dd.hh m0, m2
mula.dd.ll m0, m2
.mod2chk:
// Bit 0 of full_count set: one last sample remains. A full 32-bit word is
// still loaded from each array, but only the low halves are multiplied.
bbci \full_count, 0, .mod1chk
ldinc m0, \x1
ldinc m2, \x2
mula.dd.ll m0, m2
.mod1chk:
.endm // dotprod_s16_ae32_full