add some code
This commit is contained in:
@@ -0,0 +1,398 @@
|
||||
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_aes3_enabled == 1)
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.literal .LC0_1_61, 458755
|
||||
|
||||
# Program Unit: dspi_dotprod_off_s16_aes3
|
||||
.type dspi_dotprod_off_s16_aes3, @function
|
||||
.align 4
|
||||
.global dspi_dotprod_off_s16_aes3
|
||||
// ---------------------------------------------------------------------------
// dspi_dotprod_off_s16_aes3
//
// 2D dot product of a signed 16-bit image with a signed 16-bit filter, where
// a constant offset is added to every filter element:
//     acc = sum over y,x of  in[y][x] * (filter[y][x] + offset)
// The argument list is the one of dspi_dotprod_off_s16_ansi; the fallback
// paths below forward every argument to that function unchanged.
//
// Registers after `entry` (windowed ABI):
//     a2 = in_image   a3 = filter   a4 = out_value
//     a5 = count_x    a6 = count_y  a7 = shift
//     [a1+128] = offset (16-bit, first stack argument of the caller)
//
// image2d_t fields as accessed here:
//     +0 data   +4 step_x   +8 step_y   +12 stride_x   +16 stride_y
//
// Frame slots used:
//     [a1+16 .. a1+47]  offset replicated as 16-bit values (source of q6)
//     [a1+80]           input row step in bytes (stride_x * step_y * 2)
//     [a1+84]           filter->data
//     [a1+88]           filter->step_y
//
// Returns in a2: 0 on success, the literal .LC0_1_61 when a range check
// fails (presumably ESP_ERR_DSP_PARAM_OUTOFRANGE, which the C reference
// returns for the same checks), or whatever the ANSI fallback returns.
//
// The vector path is taken only when all of the following hold; any other
// argument combination is handed to dspi_dotprod_off_s16_ansi:
//     - filter->data has bit 0 clear
//     - filter->stride_x == filter->step_x * count_x
//     - step_x == 1 and step_y == 1 for both images
//     - count_x is a multiple of 8, and a multiple of 32 when above 32
//     - count_y >= 4, a multiple of 4 when count_x == 8 and even when
//       count_x == 16 (those kernels consume 4 resp. 2 rows per pass)
// ---------------------------------------------------------------------------
dspi_dotprod_off_s16_aes3:
.LBB1_dspi_dotprod_off_s16_aes3:
    entry   a1,128

    // ---- range checks: step * count must not exceed stride ---------------
    l32i.n  a10,a2,4                    // a10 = in_image->step_x
    l32i.n  a12,a2,12                   // a12 = in_image->stride_x
    mull    a8,a10,a5                   // step_x * count_x
    blt     a12,a8,.LBB83_dspi_dotprod_off_s16_aes3

    l32i.n  a13,a2,8                    // a13 = in_image->step_y
    l32i.n  a9,a2,16                    // a9  = in_image->stride_y
    mull    a11,a13,a6                  // step_y * count_y
    blt     a9,a11,.LBB83_dspi_dotprod_off_s16_aes3

    l32i.n  a15,a3,4                    // a15 = filter->step_x
    l32i.n  a14,a3,12                   // a14 = filter->stride_x
    mull    a11,a15,a5                  // a11 = filter->step_x * count_x
    blt     a14,a11,.LBB83_dspi_dotprod_off_s16_aes3

    l32i.n  a8,a3,16                    // a8 = filter->stride_y
    l32i.n  a9,a3,8                     // a9 = filter->step_y
    s32i    a9,a1,88                    // keep filter->step_y for later
    mull    a9,a9,a6                    // step_y * count_y
    blt     a8,a9,.LBB83_dspi_dotprod_off_s16_aes3

    // ---- filter layout checks for the vector path ------------------------
    l32i.n  a8,a3,0                     // a8 = filter->data
    s32i    a8,a1,84                    // keep filter->data for later
    bbsi    a8,0,.Lt_0_36354            // odd filter address -> ANSI

    bne     a14,a11,.Lt_0_36354         // filter rows must be contiguous

    bnei    a15,1,.Lt_0_36354           // filter->step_x must be 1

    l32i    a9,a1,88                    // filter->step_y
    beqi    a9,1,.Lt_0_19458            // == 1 -> continue with input checks

    // ---- fallback: forward all arguments to the ANSI implementation ------
.Lt_0_36354:
.Lt_0_19714:
    mov.n   a10,a2
    mov.n   a11,a3
    mov.n   a12,a4
    mov.n   a13,a5
    mov.n   a14,a6
    mov.n   a15,a7
    l16si   a8,a1,128                   // offset (sign-extended 16-bit)
    s32i.n  a8,a1,0                     // seventh argument goes on the stack
    .type   dspi_dotprod_off_s16_ansi, @function
    call8   dspi_dotprod_off_s16_ansi

    mov.n   a2,a10                      // propagate its return value
    retw.n

    // ---- range check failed ----------------------------------------------
.LBB83_dspi_dotprod_off_s16_aes3:
    l32r    a2,.LC0_1_61                // error code literal
    retw.n

    // ---- input layout and size checks for the vector path ----------------
.Lt_0_19458:
    addi.n  a9,a10,-1
    bnez    a9,.Lt_0_37122              // in_image->step_x must be 1

    addi.n  a10,a13,-1
    bnez    a10,.Lt_0_37122             // in_image->step_y must be 1

    extui   a11,a5,0,3
    bnez.n  a11,.Lt_0_37122             // count_x must be a multiple of 8

    blti    a6,4,.Lt_0_37122            // at least four rows

    // The 8-wide kernel runs count_y >> 2 passes and the 16-wide kernel
    // count_y >> 1 passes, so rows beyond a multiple of 4 resp. 2 would be
    // dropped silently. Send those row counts to the ANSI code instead.
    // This has to happen here, while a2..a7 still hold the arguments.
    extui   a14,a6,0,2                  // a14 = count_y & 3 (a14 is dead here)
    beqz    a14,.Lrows_ok_dspi_dotprod_off_s16_aes3
    beqi    a5,8,.Lt_0_37122            // 8 wide needs count_y % 4 == 0
    bbci    a14,0,.Lrows_ok_dspi_dotprod_off_s16_aes3
    beqi    a5,16,.Lt_0_37122           // 16 wide needs an even count_y
.Lrows_ok_dspi_dotprod_off_s16_aes3:

    movi.n  a14,32
    blt     a14,a5,.LBB27_dspi_dotprod_off_s16_aes3   // count_x > 32: extra check

    // ---- vector path set-up ----------------------------------------------
.Lt_0_37634:
.Lt_0_21506:
    l32i    a15,a1,84                   // a15 = filter->data
    l32i.n  a2,a2,0                     // a2  = in_image->data (in_image pointer is gone)
    l16si   a9,a1,128                   // a9  = offset
    mull    a10,a12,a13                 // in_image->stride_x * in_image->step_y
    addi    a8,a1,16                    // a8 -> temp_offset buffer
    slli    a10,a10,1                   // elements -> bytes
    s32i    a10,a1,80                   // input row step in bytes
    movi.n  a10,2
    // loop-count fixed at 2: fill 2 * 8 halfwords with the offset value
    loop    a10,.LBB137_dspi_dotprod_off_s16_aes3

.LBB132_dspi_dotprod_off_s16_aes3:
    s16i    a9,a8,0
    s16i    a9,a8,2
    s16i    a9,a8,4
    s16i    a9,a8,6
    s16i    a9,a8,8
    s16i    a9,a8,10
    s16i    a9,a8,12
    s16i    a9,a8,14
    addi    a8,a8,16

.LBB137_dspi_dotprod_off_s16_aes3:
    mov.n   a3,a6                       // a3 = count_y (filter pointer is gone)
    addi    a11,a5,-24
    addi    a12,a1,24                   // address inside the filled buffer
    movi.n  a13,0
    wur.sar_byte a13                    // SAR_BYTE = 0
    wur.accx_0 a13                      // clear the accumulator
    wur.accx_1 a13
    ee.vld.128.ip q6,a12,0              // q6 = vector of offset values
    s32i.n  a12,a1,48                   // offset_data_ptr
    beqz    a11,.LBB34_dspi_dotprod_off_s16_aes3      // count_x == 24

    // ---- dispatch on count_x; each kernel jumps back to the label that
    //      follows its own test, so the remaining tests simply fall through
.Lt_0_25602:
.Lt_0_25090:
    ee.vld.128.ip q0,a15,16             // q0 = first filter vector
    addi    a14,a5,-16
    beqz    a14,.LBB40_dspi_dotprod_off_s16_aes3      // count_x == 16

.Lt_0_27138:
.Lt_0_26626:
    addi    a8,a5,-8
    beqz    a8,.LBB46_dspi_dotprod_off_s16_aes3       // count_x == 8

.Lt_0_28674:
.Lt_0_28162:
    addi    a9,a5,-32
    beqz    a9,.LBB52_dspi_dotprod_off_s16_aes3       // count_x == 32

.Lt_0_30210:
.Lt_0_29698:
    addi    a10,a5,-64
    beqz    a10,.LBB58_dspi_dotprod_off_s16_aes3      // count_x == 64

    movi.n  a11,64
    bge     a11,a5,.Lt_0_33026          // count_x <= 64: a kernel already ran

    // ---- generic kernel: count_x > 64, processed in blocks of 32 ---------
    movi.n  a12,0                       // a12 = row counter
    ee.ld.128.usar.ip q1,a2,16
    ee.ld.128.usar.ip q2,a2,16
    ee.src.q.ld.ip q3,a2,16,q1,q2
    beqz.n  a3,.Lt_0_33026

    slli    a8,a5,1                     // row length in bytes
    l32i    a14,a1,80                   // input row step in bytes
    addi    a13,a5,31
    movgez  a13,a5,a5
    srai    a13,a13,5                   // a13 = count_x / 32
    sub     a14,a14,a8
    addi    a14,a14,16                  // pointer step used at the end of a row
    addi.n  a13,a13,-1                  // last block of a row is handled below

.Lt_0_33794:
    beqz.n  a13,.Lt_0_34050

    loopnez a13,.LBB273_dspi_dotprod_off_s16_aes3

.LBB271_dspi_dotprod_off_s16_aes3:
    ee.vmulas.s16.accx.ld.ip.qup q0,a2,16,q0,q1,q2,q3
    ee.vmulas.s16.accx.ld.ip q1,a15,16,q1,q6
    ee.vmulas.s16.accx.ld.ip.qup q1,a2,16,q1,q2,q3,q0
    ee.vmulas.s16.accx.ld.ip q4,a15,16,q2,q6
    ee.vmulas.s16.accx.ld.ip.qup q2,a2,16,q4,q3,q0,q1
    ee.vmulas.s16.accx.ld.ip q4,a15,16,q3,q6
    ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q4,q0,q1,q2
    ee.vmulas.s16.accx.ld.ip q0,a15,16,q0,q6

.LBB273_dspi_dotprod_off_s16_aes3:

.Lt_0_34050:
    // last block of the row, including the step to the next input row
    ee.vmulas.s16.accx.ld.ip.qup q0,a2,16,q0,q1,q2,q3
    ee.vmulas.s16.accx.ld.ip q1,a15,16,q1,q6
    movi.n  a9,32
    ee.vmulas.s16.accx.ld.xp.qup q7,a2,a14,q1,q2,q3,q0
    ee.vmulas.s16.accx.ld.ip q5,a15,16,q2,q6
    movi.n  a10,-16
    ee.vmulas.s16.accx.ld.xp.qup q2,a2,a10,q5,q3,q0,q7
    ee.vmulas.s16.accx.ld.ip q4,a15,16,q3,q6
    ee.ld.128.usar.xp q1,a2,a9
    addi.n  a12,a12,1
    ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q4,q0,q1,q2
    ee.vmulas.s16.accx.ld.ip q0,a15,16,q0,q6
    bne     a12,a3,.Lt_0_33794          // next row

    // ---- result: round, shift and store ----------------------------------
.Lt_0_33026:
.Lt_0_32770:
    rur.accx_0 a9                       // a9  = accumulator, low word
    rur.accx_1 a10                      // a10 = accumulator, upper part
    blti    a7,1,.Lt_0_35586            // shift < 1: store without shifting

    movi.n  a2,0                        // return value 0
    addi    a13,a7,-33                  // >= 0 when shift - 1 >= 32
    addi.n  a14,a7,-1
    ssr     a14                         // shift amount = shift - 1
    sra     a12,a10                     // upper part shifted (used for large shifts)
    src     a11,a10,a9                  // a10:a9 shifted right, low 32 bits
    movgez  a11,a12,a13                 // pick the variant that fits the shift
    addi.n  a11,a11,1                   // +1 and one more right shift = rounding
    srai    a11,a11,1
    s16i    a11,a4,0                    // *out_value
    retw.n

    // ---- fallback: input layout or size not supported by the kernels -----
.Lt_0_37122:
.Lt_0_20738:
    mov.n   a10,a2
    mov.n   a11,a3
    mov.n   a12,a4
    mov.n   a13,a5
    mov.n   a14,a6
    mov.n   a15,a7
    l16si   a8,a1,128                   // offset
    s32i.n  a8,a1,0
    call8   dspi_dotprod_off_s16_ansi

    mov.n   a2,a10
    retw.n

    // ---- count_x > 32 -----------------------------------------------------
    // Above 32 only the 64-wide kernel and the generic kernel (blocks of 32)
    // exist, so count_x has to be a multiple of 32. Testing bit 0 alone is
    // not enough: it is always clear once count_x & 7 == 0 holds, which let
    // 40, 48 and 56 reach the result code without any kernel having run and
    // let non-multiples of 32 above 64 run the generic kernel with a wrong
    // block count.
.LBB27_dspi_dotprod_off_s16_aes3:
    extui   a9,a5,0,5                   // count_x & 31
    beqz    a9,.Lt_0_37634

    mov.n   a10,a2
    mov.n   a11,a3
    mov.n   a12,a4
    mov.n   a13,a5
    mov.n   a14,a6
    mov.n   a15,a7
    l16si   a8,a1,128                   // offset
    s32i.n  a8,a1,0
    call8   dspi_dotprod_off_s16_ansi

    mov.n   a2,a10
    retw.n

    // ---- kernel: count_x == 24, one row per pass, count_y passes ---------
.LBB34_dspi_dotprod_off_s16_aes3:
    movi.n  a10,32
    movi.n  a11,-16
    l32i    a12,a1,80                   // input row step in bytes
    ee.ld.128.usar.ip q0,a2,16
    ee.ld.128.usar.ip q2,a2,16
    addi    a12,a12,-32
    ee.src.q.ld.ip q3,a2,16,q0,q2
    loopgtz a6,.LBB159_dspi_dotprod_off_s16_aes3

.LBB157_dspi_dotprod_off_s16_aes3:
    ee.vmulas.s16.accx.ld.ip q1,a15,16,q0,q6
    ee.vmulas.s16.accx.ld.xp.qup q1,a2,a12,q1,q0,q2,q3
    ee.vmulas.s16.accx.ld.ip q0,a15,16,q2,q6
    ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q0,q2,q3,q1
    ee.vmulas.s16.accx.ld.ip q1,a15,16,q3,q6
    ee.ld.128.usar.xp q0,a2,a10
    ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q1,q3,q0,q2

.LBB159_dspi_dotprod_off_s16_aes3:
    j       .Lt_0_25602

    // ---- kernel: count_x == 16, two rows per pass, count_y / 2 passes ----
.LBB40_dspi_dotprod_off_s16_aes3:
    movi.n  a10,32
    movi.n  a11,-16
    srli    a3,a6,1                     // count_y is even here (checked above)
    l32i    a12,a1,80                   // input row step in bytes
    ee.ld.128.usar.ip q1,a2,16
    ee.ld.128.usar.ip q2,a2,16
    addi    a12,a12,-16
    ee.src.q.ld.xp q3,a2,a12,q1,q2
    loopnez a3,.LBB182_dspi_dotprod_off_s16_aes3

.LBB180_dspi_dotprod_off_s16_aes3:
    ee.vmulas.s16.accx.ld.xp.qup q0,a2,a11,q0,q1,q2,q3
    ee.vmulas.s16.accx.ld.ip q3,a15,16,q1,q6
    ee.ld.128.usar.xp q1,a2,a10
    ee.vmulas.s16.accx.ld.xp.qup q3,a2,a12,q3,q2,q1,q0
    ee.vmulas.s16.accx.ld.ip q4,a15,16,q2,q6
    ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q4,q1,q0,q3
    ee.vmulas.s16.accx.ld.ip q3,a15,16,q1,q6
    ee.ld.128.usar.xp q1,a2,a10
    ee.vmulas.s16.accx.ld.xp.qup q3,a2,a12,q3,q0,q1,q2
    ee.vmulas.s16.accx.ld.ip q0,a15,16,q0,q6

.LBB182_dspi_dotprod_off_s16_aes3:
    j       .Lt_0_27138

    // ---- kernel: count_x == 8, four rows per pass, count_y / 4 passes ----
.LBB46_dspi_dotprod_off_s16_aes3:
    movi.n  a10,-16
    l32i    a11,a1,80                   // input row step in bytes
    addi    a8,a2,16
    addi    a11,a11,16
    ee.ld.128.usar.xp q2,a8,a10
    ee.ld.128.usar.xp q1,a8,a11
    ee.src.q.ld.xp q3,a8,a10,q1,q2
    ee.ld.128.usar.xp q2,a8,a11
    srli    a3,a3,2                     // count_y is a multiple of 4 here (checked above)
    mov.n   a2,a8
    loopnez a3,.LBB205_dspi_dotprod_off_s16_aes3

.LBB203_dspi_dotprod_off_s16_aes3:
    ee.vmulas.s16.accx.ld.xp.qup q3,a2,a10,q0,q1,q2,q3
    ee.vmulas.s16.accx.ld.ip q0,a15,16,q1,q6
    ee.ld.128.usar.xp q1,a2,a11
    ee.vmulas.s16.accx.ld.xp.qup q3,a2,a10,q0,q2,q1,q3
    ee.vmulas.s16.accx.ld.ip q0,a15,16,q2,q6
    ee.ld.128.usar.xp q4,a2,a11
    ee.vmulas.s16.accx.ld.xp.qup q3,a2,a10,q0,q1,q4,q3
    ee.vmulas.s16.accx.ld.ip q0,a15,16,q1,q6
    ee.ld.128.usar.xp q1,a2,a11
    ee.vmulas.s16.accx.ld.xp.qup q3,a2,a10,q0,q4,q1,q3
    ee.vmulas.s16.accx.ld.ip q0,a15,16,q4,q6
    ee.ld.128.usar.xp q2,a2,a11

.LBB205_dspi_dotprod_off_s16_aes3:
    j       .Lt_0_28674

    // ---- kernel: count_x == 32, one row per pass, count_y passes ---------
.LBB52_dspi_dotprod_off_s16_aes3:
    movi.n  a10,32
    movi.n  a11,-16
    slli    a13,a5,1                    // row length in bytes
    l32i    a12,a1,80                   // input row step in bytes
    ee.ld.128.usar.ip q1,a2,16
    ee.ld.128.usar.ip q2,a2,16
    sub     a12,a12,a13
    ee.src.q.ld.ip q3,a2,16,q1,q2
    addi    a12,a12,16
    loopnez a3,.LBB228_dspi_dotprod_off_s16_aes3

.LBB226_dspi_dotprod_off_s16_aes3:
    ee.vmulas.s16.accx.ld.ip.qup q0,a2,16,q0,q1,q2,q3
    ee.vmulas.s16.accx.ld.ip q4,a15,16,q1,q6
    ee.vmulas.s16.accx.ld.xp.qup q4,a2,a12,q4,q2,q3,q0
    ee.vmulas.s16.accx.ld.ip q1,a15,16,q2,q6
    ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q1,q3,q0,q4
    ee.vmulas.s16.accx.ld.ip q4,a15,16,q3,q6
    ee.ld.128.usar.xp q1,a2,a10
    ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q4,q0,q1,q2
    ee.vmulas.s16.accx.ld.ip q0,a15,16,q0,q6

.LBB228_dspi_dotprod_off_s16_aes3:
    j       .Lt_0_30210

    // ---- kernel: count_x == 64, one row per pass, count_y passes ---------
.LBB58_dspi_dotprod_off_s16_aes3:
    movi.n  a10,32
    movi.n  a11,-16
    slli    a13,a5,1                    // row length in bytes
    l32i    a12,a1,80                   // input row step in bytes
    ee.ld.128.usar.ip q1,a2,16
    ee.ld.128.usar.ip q2,a2,16
    sub     a12,a12,a13
    addi    a12,a12,16
    ee.src.q.ld.ip q3,a2,16,q1,q2
    mov.n   a8,a2
    loopnez a3,.LBB250_dspi_dotprod_off_s16_aes3

.LBB248_dspi_dotprod_off_s16_aes3:
    ee.vmulas.s16.accx.ld.ip.qup q0,a8,16,q0,q1,q2,q3
    ee.vmulas.s16.accx.ld.ip q4,a15,16,q1,q6
    ee.vmulas.s16.accx.ld.ip.qup q4,a8,16,q4,q2,q3,q0
    ee.vmulas.s16.accx.ld.ip q1,a15,16,q2,q6
    ee.vmulas.s16.accx.ld.ip.qup q1,a8,16,q1,q3,q0,q4
    ee.vmulas.s16.accx.ld.ip q5,a15,16,q3,q6
    ee.vmulas.s16.accx.ld.ip.qup q5,a8,16,q5,q0,q4,q1
    ee.vmulas.s16.accx.ld.ip q0,a15,16,q0,q6
    ee.vmulas.s16.accx.ld.ip.qup q0,a8,16,q0,q4,q1,q5
    ee.vmulas.s16.accx.ld.ip q4,a15,16,q4,q6
    ee.vmulas.s16.accx.ld.xp.qup q4,a8,a12,q4,q1,q5,q0
    ee.vmulas.s16.accx.ld.ip q1,a15,16,q1,q6
    ee.vmulas.s16.accx.ld.xp.qup q2,a8,a11,q1,q5,q0,q4
    ee.vmulas.s16.accx.ld.ip q4,a15,16,q5,q6
    ee.ld.128.usar.xp q1,a8,a10
    ee.vmulas.s16.accx.ld.ip.qup q3,a8,16,q4,q0,q1,q2
    ee.vmulas.s16.accx.ld.ip q0,a15,16,q0,q6

.LBB250_dspi_dotprod_off_s16_aes3:
    j       .Lt_0_33026

    // ---- result for shift < 1: low 16 bits of the accumulator ------------
.Lt_0_35586:
    movi.n  a2,0                        // return value 0
    sext    a14,a9,15
    s16i    a14,a4,0                    // *out_value
    retw.n
|
||||
|
||||
#endif // dspi_dotprod_aes3_enabled
|
||||
@@ -0,0 +1,49 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
|
||||
/**
 * Reference (plain C) 2D dot product with offset for signed 16-bit data.
 *
 * Computes  acc = sum over y,x of  in[y][x] * (filter[y][x] + offset),
 * rounds it, shifts it right by `shift` and stores the low 16 bits in
 * *out_value.
 *
 * @param in_image   input image descriptor (data, step_x/y, stride_x/y)
 * @param filter     filter image descriptor, same layout
 * @param out_value  receives the 16-bit result
 * @param count_x    number of elements per row to process
 * @param count_y    number of rows to process
 * @param shift      right shift applied to the accumulator; for shift <= 0
 *                   the accumulator is stored without rounding or shifting
 * @param offset     value added to every filter element before multiplying
 *
 * @return ESP_OK on success, ESP_ERR_DSP_PARAM_OUTOFRANGE when
 *         step * count exceeds the stride of either image in x or y.
 */
esp_err_t dspi_dotprod_off_s16_ansi(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift, int16_t offset)
{
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    int16_t *i_data = (int16_t *)in_image->data;
    int16_t *f_data = (int16_t *)filter->data;
    /* Distance between two processed rows, in elements. */
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;

    int64_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            acc += (int32_t)i_data[in_image->step_x * x] * ((int32_t)f_data[filter->step_x * x] + (int32_t)offset);
        }
        i_data += i_step;
        f_data += f_step;
    }

    /*
     * Round to nearest, then shift. The rounding constant is built in 64 bits:
     * a plain `1 << (shift - 1)` is an int expression, undefined for
     * shift == 0 (negative shift count) and overflowing for shift >= 32.
     * shift <= 0 stores the accumulator as is, like the optimized versions.
     */
    if (shift > 0) {
        acc += (int64_t)1 << (shift - 1);
        acc >>= shift;
    }
    *out_value = (int16_t)acc;
    return ESP_OK;
}
|
||||
@@ -0,0 +1,104 @@
|
||||
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_arp4_enabled == 1)
|
||||
#include "dsp_err_codes.h"
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.global dspi_dotprod_off_s16_arp4
|
||||
.global dspi_dotprod_off_s16_ansi
|
||||
.type dspi_dotprod_off_s16_arp4,@function
|
||||
|
||||
// esp_err_t dspi_dotprod_off_s16_arp4(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift, int16_t offset);
|
||||
dspi_dotprod_off_s16_arp4:
// Arguments:
// in_image  - a0
// filter    - a1
// out_value - a2
// count_x   - a3
// count_y   - a4
// shift     - a5
// offset    - a6
//
// Working registers:
// i_data, start of the current row - t0
// f_data, start of the current row - t1
// i_step in bytes                  - t2
// f_step in bytes                  - t3
// current i_data                   - t4
// current f_data                   - t5
// count_x / 8 (vectors per row)    - t6
// offset in every 16-bit lane      - q2
//
// Returns 0 in a0 from the vector path; otherwise the return value of
// dspi_dotprod_off_s16_ansi, which is tail-called with a0..a6 untouched.

    // The row loop below is do-while shaped and the hardware loop is
    // programmed with count_x / 8, so both need at least one iteration.
    // Empty or negative sizes are left to the ANSI implementation.
    blez    a3, .dspi_dotprod_off_s16_arp4_ansi
    blez    a4, .dspi_dotprod_off_s16_arp4_ansi

    lw      t1, 4(a0)                   // in_image->step_x
    lw      t2, 4(a1)                   // filter->step_x
    or      t1, t1, t2
    addi    t1, t1, -1                  // 0 only if (in step_x | filter step_x) == 1
    andi    t2, a3, 7                   // count_x must be a multiple of 8
    or      t1, t1, t2

    beqz    t1, .dspi_dotprod_off_s16_arp4_body
.dspi_dotprod_off_s16_arp4_ansi:
    j       dspi_dotprod_off_s16_ansi

.dspi_dotprod_off_s16_arp4_body:
    addi    sp, sp, -16

    sw      a6, 0(sp)                   // put offset in memory for the broadcast load
    mv      t6, sp
    esp.vldbc.16.ip q2, t6, 0           // q2 = offset broadcast to all 16-bit lanes

    lw      t0, 0(a0)                   // i_data
    lw      t1, 0(a1)                   // f_data

    lw      t2, 8(a0)                   // in_image->step_y
    lw      t4, 12(a0)                  // in_image->stride_x
    mul     t2, t4, t2
    slli    t2, t2, 1                   // i_step in bytes (16-bit elements)

    lw      t3, 8(a1)                   // filter->step_y
    lw      t5, 12(a1)                  // filter->stride_x
    mul     t3, t5, t3
    slli    t3, t3, 1                   // f_step in bytes (16-bit elements)

    srli    t6, a3, 3                   // t6 = count_x / 8

    // Rounding seed 1 << (shift - 1). `sll` only uses the low five bits of
    // the shift amount, so shift == 0 would turn shift - 1 == -1 into a
    // shift by 31 and seed the accumulator with 1 << 31. No rounding is
    // applied for shift <= 0.
    li      t4, 0
    blez    a5, .dspi_dotprod_off_s16_arp4_round
    addi    a7, a5, -1
    li      t4, 1
    sll     t4, t4, a7
.dspi_dotprod_off_s16_arp4_round:
    esp.zero.xacc
    esp.movx.w.xacc.l t4                // low word of xacc = rounding seed

.loop_count_y:
    mv      t4, t0
    mv      t5, t1
    esp.vld.128.ip q1, t5, 16           // q1 - f_data

    esp.lp.setup 0, t6, .loop_count_x   // hardware loop, t6 iterations, ends at .loop_count_x
    esp.vld.128.ip q0, t4, 16           // q0 - i_data
    esp.vadd.s16 q3, q2, q1             // q3 = f_data + offset
.loop_count_x:
    esp.vmulas.s16.xacc.ld.ip q1, t5, 16, q0, q3 // xacc += q0 * q3, q1 - next f_data

    add     t0, t0, t2                  // next input row
    add     t1, t1, t3                  // next filter row
    addi    a4, a4, -1
    bgtz    a4, .loop_count_y

    esp.srs.s.xacc t5, a5 // shift accx register by final_shift amount (a5), save the lower 32bits to t5
    sh      t5, 0(a2) // store result to output buffer

    li      a0, 0                       // return 0
    addi    sp, sp, 16
    ret
|
||||
|
||||
#endif // dspi_dotprod_arp4_enabled
|
||||
@@ -0,0 +1,408 @@
|
||||
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_aes3_enabled == 1)
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.literal .LC0_1_57, 458755
|
||||
|
||||
# Program Unit: dspi_dotprod_off_s8_aes3
|
||||
.type dspi_dotprod_off_s8_aes3, @function
|
||||
.align 4
|
||||
.global dspi_dotprod_off_s8_aes3
|
||||
dspi_dotprod_off_s8_aes3: # 0x4
|
||||
.LBB1_dspi_dotprod_off_s8_aes3: # 0x4
|
||||
entry a1,112 #
|
||||
l32i.n a10,a2,4 # [0] id:745
|
||||
l32i.n a12,a2,12 # [1] id:744
|
||||
mull a8,a10,a5 # [2]
|
||||
blt a12,a8,.LBB86_dspi_dotprod_off_s8_aes3 # [4]
|
||||
|
||||
l32i.n a13,a2,8 # [0] id:746
|
||||
l32i.n a9,a2,16 # [1] id:747
|
||||
mull a11,a13,a6 # [2]
|
||||
blt a9,a11,.LBB86_dspi_dotprod_off_s8_aes3 # [4]
|
||||
|
||||
l32i.n a15,a3,4 # [0] id:749
|
||||
l32i.n a14,a3,12 # [1] id:748
|
||||
mull a11,a15,a5 # [2]
|
||||
blt a14,a11,.LBB86_dspi_dotprod_off_s8_aes3 # [4]
|
||||
|
||||
l32i.n a8,a3,16 # [0] id:751
|
||||
l32i.n a9,a3,8 # [1] id:750
|
||||
s32i a9,a1,72 # [2] gra_spill_temp_2
|
||||
mull a9,a9,a6 # [3]
|
||||
blt a8,a9,.LBB86_dspi_dotprod_off_s8_aes3 # [5]
|
||||
|
||||
l32i.n a8,a3,0 # [0] id:752
|
||||
s32i a8,a1,68 # [1] gra_spill_temp_1
|
||||
bbsi a8,0,.Lt_0_35330 # [2]
|
||||
|
||||
bne a14,a11,.Lt_0_35330 # [0]
|
||||
|
||||
bnei a15,1,.Lt_0_35330 # [0]
|
||||
|
||||
l32i a11,a1,72 # [0] gra_spill_temp_2
|
||||
beqi a11,1,.Lt_0_18946 # [2]
|
||||
|
||||
.Lt_0_35330: # 0x46
|
||||
.Lt_0_19202: # 0x46
|
||||
mov.n a10,a2 # [0]
|
||||
mov.n a11,a3 # [1]
|
||||
mov.n a12,a4 # [2]
|
||||
mov.n a13,a5 # [3]
|
||||
mov.n a14,a6 # [4]
|
||||
mov.n a15,a7 # [5]
|
||||
.type dspi_dotprod_s8_ansi, @function
|
||||
call8 dspi_dotprod_s8_ansi # [6] dspi_dotprod_s8_ansi
|
||||
|
||||
mov.n a2,a10 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
.LBB86_dspi_dotprod_off_s8_aes3: # 0x59
|
||||
l32r a2,.LC0_1_57 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
.Lt_0_18946: # 0x5e
|
||||
addi.n a14,a10,-1 # [0]
|
||||
bnez a14,.Lt_0_36098 # [1]
|
||||
|
||||
addi.n a15,a13,-1 # [0]
|
||||
bnez a15,.Lt_0_36098 # [1]
|
||||
|
||||
extui a8,a5,0,4 # [0]
|
||||
bnez.n a8,.Lt_0_36098 # [1]
|
||||
|
||||
blti a6,4,.Lt_0_36098 # [0]
|
||||
|
||||
movi.n a9,64 # [0]
|
||||
blt a9,a5,.LBB27_dspi_dotprod_off_s8_aes3 # [1]
|
||||
|
||||
.Lt_0_36610: # 0x75
|
||||
.Lt_0_20994: # 0x75
|
||||
mov.n a8,a1 # [0]
|
||||
l8ui a9,a1,112 # [1] id:754 offset+0x0
|
||||
l32i.n a15,a2,0 # [2] id:753
|
||||
mull a10,a12,a13 # [3]
|
||||
l32i a2,a1,68 # [4] gra_spill_temp_1
|
||||
s32i a10,a1,64 # [5] gra_spill_temp_0
|
||||
sext a9,a9,7 # [6]
|
||||
movi.n a10,4 # [7]
|
||||
# loop-count fixed at 4
|
||||
loop a10,.LBB140_dspi_dotprod_off_s8_aes3 # [8]
|
||||
|
||||
.LBB135_dspi_dotprod_off_s8_aes3: # 0x8d
|
||||
s8i a9,a8,0 # [0*II+0] id:755 temp_offset+0x0
|
||||
s8i a9,a8,1 # [0*II+1] id:755 temp_offset+0x0
|
||||
s8i a9,a8,2 # [0*II+2] id:755 temp_offset+0x0
|
||||
s8i a9,a8,3 # [0*II+3] id:755 temp_offset+0x0
|
||||
s8i a9,a8,4 # [0*II+4] id:755 temp_offset+0x0
|
||||
s8i a9,a8,5 # [0*II+5] id:755 temp_offset+0x0
|
||||
s8i a9,a8,6 # [0*II+6] id:755 temp_offset+0x0
|
||||
s8i a9,a8,7 # [0*II+7] id:755 temp_offset+0x0
|
||||
addi.n a8,a8,8 # [0*II+8]
|
||||
|
||||
.LBB140_dspi_dotprod_off_s8_aes3: # 0xa7
|
||||
mov.n a3,a6 # [0]
|
||||
addi a11,a5,-48 # [1]
|
||||
|
||||
addi.n a12,a1,8 # [3] temp_offset+8
|
||||
movi.n a13,0 # [4]
|
||||
wur.accx_0 a13 # [5]
|
||||
wur.accx_1 a13 # [6]
|
||||
ee.vld.128.ip q6,a12,0 # [7] id:756
|
||||
s32i.n a12,a1,32 # [8] offset_data_ptr
|
||||
beqz a11,.LBB34_dspi_dotprod_off_s8_aes3 # [9]
|
||||
|
||||
l32i a2,a1,68 # [0] gra_spill_temp_1
|
||||
ee.vld.128.ip q0,a2,16 # [2] id:771
|
||||
st.qr q0,a1,48 # [3] q0
|
||||
|
||||
.Lt_0_24578: # 0xc6
|
||||
addi a14,a5,-32 # [0]
|
||||
beqz a14,.LBB43_dspi_dotprod_off_s8_aes3 # [1]
|
||||
|
||||
.Lt_0_26626: # 0xcc
|
||||
.Lt_0_26114: # 0xcc
|
||||
addi a8,a5,-16 # [0]
|
||||
beqz a8,.LBB50_dspi_dotprod_off_s8_aes3 # [1]
|
||||
|
||||
.Lt_0_28162: # 0xd2
|
||||
.Lt_0_27650: # 0xd2
|
||||
addi a9,a5,-64 # [0]
|
||||
beqz a9,.LBB57_dspi_dotprod_off_s8_aes3 # [1]
|
||||
|
||||
.Lt_0_29698: # 0xd8
|
||||
.Lt_0_29186: # 0xd8
|
||||
addi a10,a5,-128 # [0]
|
||||
beqz a10,.LBB64_dspi_dotprod_off_s8_aes3 # [1]
|
||||
|
||||
movi a11,128 # [0]
|
||||
bge a11,a5,.Lt_0_32514 # [1]
|
||||
|
||||
movi.n a12,0 # [0]
|
||||
ee.ld.128.usar.ip q1,a15,16 # [1] id:833
|
||||
ee.ld.128.usar.ip q2,a15,16 # [2] id:834
|
||||
ee.src.q.ld.ip q3,a15,16,q1,q2 # [4] id:835
|
||||
beqz.n a3,.Lt_0_32514 # [5]
|
||||
|
||||
ld.qr q0,a1,48 # [0] q0
|
||||
l32i a14,a1,64 # [1] gra_spill_temp_0
|
||||
addi a13,a5,31 # [2]
|
||||
movgez a13,a5,a5 # [3]
|
||||
srai a13,a13,5 # [4]
|
||||
sub a14,a14,a5 # [5]
|
||||
addi a14,a14,16 # [6]
|
||||
addi.n a13,a13,-1 # [7]
|
||||
|
||||
.Lt_0_33282: # 0x108
|
||||
beqz.n a13,.Lt_0_33538 # [0]
|
||||
|
||||
loopnez a13,.LBB277_dspi_dotprod_off_s8_aes3 # [0]
|
||||
|
||||
.LBB275_dspi_dotprod_off_s8_aes3: # 0x10d
|
||||
ee.vmulas.s8.accx.ld.ip.qup q0,a15,16,q0,q1,q2,q3 # [0*II+0] id:836
|
||||
ee.vmulas.s8.accx.ld.ip q1,a2,16,q1,q6 # [0*II+1] id:837
|
||||
ee.vmulas.s8.accx.ld.ip.qup q1,a15,16,q1,q2,q3,q0 # [0*II+3] id:838
|
||||
ee.vmulas.s8.accx.ld.ip q4,a2,16,q2,q6 # [0*II+4] id:839
|
||||
ee.vmulas.s8.accx.ld.ip.qup q2,a15,16,q4,q3,q0,q1 # [0*II+6] id:840
|
||||
ee.vmulas.s8.accx.ld.ip q4,a2,16,q3,q6 # [0*II+7] id:841
|
||||
ee.vmulas.s8.accx.ld.ip.qup q3,a15,16,q4,q0,q1,q2 # [0*II+9] id:842
|
||||
ee.vmulas.s8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+10] id:843
|
||||
|
||||
.LBB277_dspi_dotprod_off_s8_aes3: # 0x12d
|
||||
|
||||
.Lt_0_33538: # 0x12d
|
||||
ee.vmulas.s8.accx.ld.ip.qup q4,a15,16,q0,q1,q2,q3 # [0] id:844
|
||||
ee.vmulas.s8.accx.ld.ip q1,a2,16,q1,q6 # [1] id:845
|
||||
movi.n a8,32 # [2]
|
||||
ee.vmulas.s8.accx.ld.xp.qup q0,a15,a14,q1,q2,q3,q4 # [3] id:846
|
||||
ee.vmulas.s8.accx.ld.ip q7,a2,16,q2,q6 # [4] id:847
|
||||
movi.n a9,-16 # [5]
|
||||
ee.vmulas.s8.accx.ld.xp.qup q2,a15,a9,q7,q3,q4,q0 # [6] id:848
|
||||
ee.vmulas.s8.accx.ld.ip q5,a2,16,q3,q6 # [7] id:850
|
||||
ee.ld.128.usar.xp q1,a15,a8 # [8] id:849
|
||||
addi.n a12,a12,1 # [9]
|
||||
ee.vmulas.s8.accx.ld.ip.qup q3,a15,16,q5,q4,q1,q2 # [10] id:851
|
||||
ee.vmulas.s8.accx.ld.ip q0,a2,16,q4,q6 # [11] id:852
|
||||
bne a12,a3,.Lt_0_33282 # [12]
|
||||
|
||||
.Lt_0_32514: # 0x159
|
||||
.Lt_0_32258: # 0x159
|
||||
movi.n a2,0 # [0]
|
||||
rur.accx_0 a10 # [1]
|
||||
addi.n a12,a7,-1 # [2]
|
||||
movi.n a11,1 # [3]
|
||||
ssl a12 # [4]
|
||||
sll a11,a11 # [5]
|
||||
ssr a7 # [6]
|
||||
add.n a10,a10,a11 # [7]
|
||||
sra a10,a10 # [8]
|
||||
s8i a10,a4,0 # [9] id:854
|
||||
retw.n # [10]
|
||||
|
||||
.Lt_0_36098: # 0x175
|
||||
.Lt_0_20226: # 0x175
|
||||
mov.n a10,a2 # [0]
|
||||
mov.n a11,a3 # [1]
|
||||
mov.n a12,a4 # [2]
|
||||
mov.n a13,a5 # [3]
|
||||
mov.n a14,a6 # [4]
|
||||
mov.n a15,a7 # [5]
|
||||
call8 dspi_dotprod_s8_ansi # [6] dspi_dotprod_s8_ansi
|
||||
|
||||
mov.n a2,a10 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
.LBB27_dspi_dotprod_off_s8_aes3: # 0x188
|
||||
extui a14,a5,0,1 # [0]
|
||||
beqz a14,.Lt_0_36610 # [1]
|
||||
|
||||
mov.n a10,a2 # [0]
|
||||
mov.n a11,a3 # [1]
|
||||
mov.n a12,a4 # [2]
|
||||
mov.n a13,a5 # [3]
|
||||
mov.n a14,a6 # [4]
|
||||
mov.n a15,a7 # [5]
|
||||
call8 dspi_dotprod_s8_ansi # [6] dspi_dotprod_s8_ansi
|
||||
|
||||
mov.n a2,a10 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
.LBB34_dspi_dotprod_off_s8_aes3: # 0x1a1
|
||||
ee.ld.128.usar.ip q0,a15,16 # [0] id:760
|
||||
ee.ld.128.usar.ip q2,a15,16 # [1] id:761
|
||||
ee.src.q.ld.ip q3,a15,16,q0,q2 # [3] id:762
|
||||
beqz.n a6,.Lt_0_24578 # [4]
|
||||
|
||||
movi.n a10,32 # [0]
|
||||
l32i a12,a1,64 # [1] gra_spill_temp_0
|
||||
movi.n a11,-16 # [2]
|
||||
addi a12,a12,-32 # [3]
|
||||
loopgtz a6,.LBB163_dspi_dotprod_off_s8_aes3 # [4]
|
||||
|
||||
.LBB161_dspi_dotprod_off_s8_aes3: # 0x1b9
|
||||
ee.vmulas.s8.accx.ld.ip q1,a2,16,q0,q6 # [0*II+0] id:763
|
||||
ee.vmulas.s8.accx.ld.xp.qup q1,a15,a12,q1,q0,q2,q3 # [0*II+2] id:764
|
||||
ee.vmulas.s8.accx.ld.ip q0,a2,16,q2,q6 # [0*II+3] id:765
|
||||
ee.vmulas.s8.accx.ld.xp.qup q2,a15,a11,q0,q2,q3,q1 # [0*II+5] id:766
|
||||
ee.vmulas.s8.accx.ld.ip q1,a2,16,q3,q6 # [0*II+6] id:768
|
||||
ee.ld.128.usar.xp q0,a15,a10 # [0*II+7] id:767
|
||||
ee.vmulas.s8.accx.ld.ip.qup q3,a15,16,q1,q3,q0,q2 # [0*II+9] id:769
|
||||
|
||||
.LBB163_dspi_dotprod_off_s8_aes3: # 0x1d4
|
||||
st.qr q1,a1,48 # [0] q0
|
||||
j .Lt_0_24578 # [1]
|
||||
|
||||
.LBB43_dspi_dotprod_off_s8_aes3: # 0x1da
|
||||
srli a3,a6,1 # [0]
|
||||
l32i a12,a1,64 # [1] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a15,16 # [2] id:772
|
||||
ee.ld.128.usar.ip q2,a15,16 # [3] id:773
|
||||
addi a12,a12,-16 # [5]
|
||||
ee.src.q.ld.xp q3,a15,a12,q1,q2 # [6] id:774
|
||||
beqz.n a3,.Lt_0_26626 # [7]
|
||||
|
||||
ld.qr q0,a1,48 # [0] q0
|
||||
movi.n a10,32 # [1]
|
||||
movi.n a11,-16 # [2]
|
||||
loopnez a3,.LBB186_dspi_dotprod_off_s8_aes3 # [3]
|
||||
|
||||
.LBB184_dspi_dotprod_off_s8_aes3: # 0x1f8
|
||||
ee.vmulas.s8.accx.ld.xp.qup q0,a15,a11,q0,q1,q2,q3 # [0*II+0] id:775
|
||||
ee.vmulas.s8.accx.ld.ip q3,a2,16,q1,q6 # [0*II+1] id:776
|
||||
ee.ld.128.usar.xp q1,a15,a10 # [0*II+2] id:777
|
||||
ee.vmulas.s8.accx.ld.xp.qup q3,a15,a12,q3,q2,q1,q0 # [0*II+4] id:778
|
||||
ee.vmulas.s8.accx.ld.ip q4,a2,16,q2,q6 # [0*II+5] id:779
|
||||
ee.vmulas.s8.accx.ld.xp.qup q2,a15,a11,q4,q1,q0,q3 # [0*II+7] id:780
|
||||
ee.vmulas.s8.accx.ld.ip q3,a2,16,q1,q6 # [0*II+8] id:781
|
||||
ee.ld.128.usar.xp q1,a15,a10 # [0*II+9] id:782
|
||||
ee.vmulas.s8.accx.ld.xp.qup q3,a15,a12,q3,q0,q1,q2 # [0*II+11] id:783
|
||||
ee.vmulas.s8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+12] id:784
|
||||
|
||||
.LBB186_dspi_dotprod_off_s8_aes3: # 0x21e
|
||||
st.qr q0,a1,48 # [0] q0
|
||||
j .Lt_0_26626 # [1]
|
||||
|
||||
.LBB50_dspi_dotprod_off_s8_aes3: # 0x224
|
||||
srli a3,a3,2 # [0]
|
||||
movi.n a13,-16 # [1]
|
||||
l32i a11,a1,64 # [2] gra_spill_temp_0
|
||||
addi a15,a15,16 # [3]
|
||||
addi a11,a11,16 # [4]
|
||||
ee.ld.128.usar.xp q2,a15,a13 # [5] id:785
|
||||
ee.ld.128.usar.xp q1,a15,a11 # [6] id:786
|
||||
ee.src.q.ld.xp q3,a15,a13,q1,q2 # [8] id:787
|
||||
ee.ld.128.usar.xp q2,a15,a11 # [9] id:788
|
||||
beqz.n a3,.Lt_0_28162 # [10]
|
||||
|
||||
ld.qr q0,a1,48 # [0] q0
|
||||
movi.n a10,-16 # [1]
|
||||
loopnez a3,.LBB209_dspi_dotprod_off_s8_aes3 # [2]
|
||||
|
||||
.LBB207_dspi_dotprod_off_s8_aes3: # 0x248
|
||||
ee.vmulas.s8.accx.ld.xp.qup q3,a15,a10,q0,q1,q2,q3 # [0*II+0] id:789
|
||||
ee.vmulas.s8.accx.ld.ip q0,a2,16,q1,q6 # [0*II+1] id:790
|
||||
ee.ld.128.usar.xp q1,a15,a11 # [0*II+2] id:791
|
||||
ee.vmulas.s8.accx.ld.xp.qup q3,a15,a10,q0,q2,q1,q3 # [0*II+4] id:792
|
||||
ee.vmulas.s8.accx.ld.ip q0,a2,16,q2,q6 # [0*II+5] id:793
|
||||
ee.ld.128.usar.xp q4,a15,a11 # [0*II+6] id:794
|
||||
ee.vmulas.s8.accx.ld.xp.qup q3,a15,a10,q0,q1,q4,q3 # [0*II+8] id:795
|
||||
ee.vmulas.s8.accx.ld.ip q0,a2,16,q1,q6 # [0*II+9] id:796
|
||||
ee.ld.128.usar.xp q1,a15,a11 # [0*II+10] id:797
|
||||
ee.vmulas.s8.accx.ld.xp.qup q3,a15,a10,q0,q4,q1,q3 # [0*II+12] id:798
|
||||
ee.vmulas.s8.accx.ld.ip q0,a2,16,q4,q6 # [0*II+13] id:799
|
||||
ee.ld.128.usar.xp q2,a15,a11 # [0*II+14] id:800
|
||||
|
||||
.LBB209_dspi_dotprod_off_s8_aes3: # 0x274
|
||||
st.qr q0,a1,48 # [0] q0
|
||||
j .Lt_0_28162 # [1]
|
||||
|
||||
.LBB57_dspi_dotprod_off_s8_aes3: # 0x27a
|
||||
ee.ld.128.usar.ip q1,a15,16 # [0] id:801
|
||||
ee.ld.128.usar.ip q2,a15,16 # [1] id:802
|
||||
ee.src.q.ld.ip q3,a15,16,q1,q2 # [3] id:803
|
||||
beqz.n a3,.Lt_0_29698 # [4]
|
||||
|
||||
ld.qr q0,a1,48 # [0] q0
|
||||
movi.n a10,32 # [1]
|
||||
l32i a12,a1,64 # [2] gra_spill_temp_0
|
||||
movi.n a11,-16 # [3]
|
||||
sub a12,a12,a5 # [4]
|
||||
addi a12,a12,16 # [5]
|
||||
loopnez a3,.LBB232_dspi_dotprod_off_s8_aes3 # [6]
|
||||
|
||||
.LBB230_dspi_dotprod_off_s8_aes3: # 0x298
|
||||
ee.vmulas.s8.accx.ld.ip.qup q0,a15,16,q0,q1,q2,q3 # [0*II+0] id:804
|
||||
ee.vmulas.s8.accx.ld.ip q4,a2,16,q1,q6 # [0*II+1] id:805
|
||||
ee.vmulas.s8.accx.ld.xp.qup q4,a15,a12,q4,q2,q3,q0 # [0*II+3] id:806
|
||||
ee.vmulas.s8.accx.ld.ip q1,a2,16,q2,q6 # [0*II+4] id:807
|
||||
ee.vmulas.s8.accx.ld.xp.qup q2,a15,a11,q1,q3,q0,q4 # [0*II+6] id:808
|
||||
ee.vmulas.s8.accx.ld.ip q4,a2,16,q3,q6 # [0*II+7] id:809
|
||||
ee.ld.128.usar.xp q1,a15,a10 # [0*II+8] id:810
|
||||
ee.vmulas.s8.accx.ld.ip.qup q3,a15,16,q4,q0,q1,q2 # [0*II+10] id:811
|
||||
ee.vmulas.s8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+11] id:812
|
||||
|
||||
.LBB232_dspi_dotprod_off_s8_aes3: # 0x2bb
|
||||
st.qr q0,a1,48 # [0] q0
|
||||
j .Lt_0_29698 # [1]
|
||||
|
||||
.LBB64_dspi_dotprod_off_s8_aes3: # 0x2c1
|
||||
movi.n a10,32 # [0]
|
||||
movi.n a11,-16 # [1]
|
||||
l32i a12,a1,64 # [2] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a15,16 # [3] id:813
|
||||
ee.ld.128.usar.ip q2,a15,16 # [4] id:814
|
||||
sub a12,a12,a5 # [6]
|
||||
addi a12,a12,16 # [7]
|
||||
ld.qr q0,a1,48 # [8] q0
|
||||
ee.src.q.ld.ip q3,a15,16,q1,q2 # [9] id:815
|
||||
mov.n a8,a15 # [10]
|
||||
loopnez a3,.LBB254_dspi_dotprod_off_s8_aes3 # [11]
|
||||
|
||||
.LBB252_dspi_dotprod_off_s8_aes3: # 0x2df
|
||||
ee.vmulas.s8.accx.ld.ip.qup q0,a8,16,q0,q1,q2,q3 # [0*II+0] id:816
|
||||
ee.vmulas.s8.accx.ld.ip q4,a2,16,q1,q6 # [0*II+1] id:817
|
||||
ee.vmulas.s8.accx.ld.ip.qup q4,a8,16,q4,q2,q3,q0 # [0*II+3] id:818
|
||||
ee.vmulas.s8.accx.ld.ip q1,a2,16,q2,q6 # [0*II+4] id:819
|
||||
ee.vmulas.s8.accx.ld.ip.qup q1,a8,16,q1,q3,q0,q4 # [0*II+6] id:820
|
||||
ee.vmulas.s8.accx.ld.ip q5,a2,16,q3,q6 # [0*II+7] id:821
|
||||
ee.vmulas.s8.accx.ld.ip.qup q5,a8,16,q5,q0,q4,q1 # [0*II+9] id:822
|
||||
ee.vmulas.s8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+10] id:823
|
||||
ee.vmulas.s8.accx.ld.ip.qup q0,a8,16,q0,q4,q1,q5 # [0*II+12] id:824
|
||||
ee.vmulas.s8.accx.ld.ip q4,a2,16,q4,q6 # [0*II+13] id:825
|
||||
ee.vmulas.s8.accx.ld.xp.qup q4,a8,a12,q4,q1,q5,q0 # [0*II+15] id:826
|
||||
ee.vmulas.s8.accx.ld.ip q1,a2,16,q1,q6 # [0*II+16] id:827
|
||||
ee.vmulas.s8.accx.ld.xp.qup q2,a8,a11,q1,q5,q0,q4 # [0*II+18] id:828
|
||||
ee.vmulas.s8.accx.ld.ip q4,a2,16,q5,q6 # [0*II+19] id:829
|
||||
ee.ld.128.usar.xp q1,a8,a10 # [0*II+20] id:830
|
||||
ee.vmulas.s8.accx.ld.ip.qup q3,a8,16,q4,q0,q1,q2 # [0*II+22] id:831
|
||||
ee.vmulas.s8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+23] id:832
|
||||
|
||||
.LBB254_dspi_dotprod_off_s8_aes3: # 0x322
|
||||
movi.n a2,0 # [0]
|
||||
movi.n a11,1 # [1]
|
||||
addi.n a12,a7,-1 # [2]
|
||||
rur.accx_0 a10 # [3]
|
||||
ssl a12 # [4]
|
||||
sll a11,a11 # [5]
|
||||
ssr a7 # [6]
|
||||
add.n a10,a10,a11 # [7]
|
||||
sra a10,a10 # [8]
|
||||
s8i a10,a4,0 # [9] id:854
|
||||
retw.n # [10]
|
||||
|
||||
#endif // dspi_dotprod_aes3_enabled
|
||||
@@ -0,0 +1,49 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
|
||||
/**
 * Reference (plain C) 2D dot product with offset for int8 data.
 *
 * Accumulates in_image[y][x] * (filter[y][x] + offset) over count_y rows and
 * count_x columns, applies a rounded right shift by `shift` and stores the
 * result, converted to int8_t, in *out_value.
 *
 * @param in_image   input image descriptor (data, step_x/step_y, stride_x/stride_y)
 * @param filter     filter image descriptor (same layout as in_image)
 * @param out_value  receives the result
 * @param count_x    number of columns to process
 * @param count_y    number of rows to process
 * @param shift      right shift applied to the accumulator; when shift <= 0 no
 *                   rounding and no shift are applied (shift must be below 64)
 * @param offset     value added to every filter element before multiplication
 *
 * @return ESP_OK on success, ESP_ERR_DSP_PARAM_OUTOFRANGE when step * count
 *         exceeds the corresponding stride of either image.
 */
esp_err_t dspi_dotprod_off_s8_ansi(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift, int8_t offset)
{
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    int8_t *i_data = (int8_t *)in_image->data;
    int8_t *f_data = (int8_t *)filter->data;
    /* Distance, in elements, between two consecutive processed rows. */
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;

    int32_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            acc += (int16_t)i_data[in_image->step_x * x] * ((int16_t)f_data[filter->step_x * x] + (int16_t)offset);
        }
        i_data += i_step;
        f_data += f_step;
    }

    if (shift > 0) {
        /* Round to nearest: add half of the divisor before shifting.
         * Done in 64 bits so that neither the rounding constant (shift == 32
         * would need 1 << 31) nor the addition can overflow; the previous
         * unconditional `1 << (shift - 1)` was undefined for shift == 0. */
        acc = (int32_t)(((int64_t)acc + ((int64_t)1 << (shift - 1))) >> shift);
    }
    *out_value = acc;
    return ESP_OK;
}
|
||||
@@ -0,0 +1,102 @@
|
||||
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_arp4_enabled == 1)
|
||||
#include "dsp_err_codes.h"
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.global dspi_dotprod_off_s8_arp4
|
||||
.global dspi_dotprod_off_s8_ansi
|
||||
.type dspi_dotprod_off_s8_arp4,@function
|
||||
|
||||
// esp_err_t dspi_dotprod_off_s8_arp4(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift, int8_t offset);
//
// Vector implementation of the int8 dot product with offset.
// The vector path is taken only when in_image->step_x == 1, filter->step_x == 1
// and count_x is a multiple of 16; otherwise the call is forwarded unchanged
// (tail jump, a0..a6 untouched) to dspi_dotprod_off_s8_ansi.
// Returns 0 in a0 from the vector path.
//
// NOTE(review): unlike the ANSI version, the vector path performs no
// step * count > stride range checks and assumes count_y >= 1 - confirm callers.
// NOTE(review): every row pass issues one more 16-byte filter load than it
// consumes (the load fused into the last multiply) - confirm the filter buffer
// tolerates the over-read.
dspi_dotprod_off_s8_arp4:
// in_image  - a0
// filter    - a1
// out_value - a2
// count_x   - a3
// count_y   - a4
// shift     - a5
// offset    - a6

// i_data - t0   (start of current input row)
// f_data - t1   (start of current filter row)
// i_step - t2   (stride_x * step_y of in_image, in bytes)
// f_step - t3   (stride_x * step_y of filter, in bytes)
// t4 - current i_data
// t5 - current f_data
// t6 - count_x / 16 (hardware loop count)

    lw      t1, 4(a0)               // load in_image->step_x
    lw      t2, 4(a1)               // load filter->step_x
    or      t1, t1, t2
    addi    t1, t1, -1              // 0 only when (step_x | step_x) == 1
    andi    t2, a3, 15              // count_x % 16
    or      t1, t1, t2              // 0 only when every fast-path condition holds

    beqz    t1, .dspi_dotprod_off_s8_arp4_body
    j       dspi_dotprod_off_s8_ansi

.dspi_dotprod_off_s8_arp4_body:
    add     sp, sp, -16             // scratch slot used to broadcast the offset

    sw      a6, 0(sp)
    mv      t6, sp
    esp.vldbc.8.ip q2, t6, 0        // q2 <- offset byte loaded from the stack slot (broadcast load)

    lw      t0, 0(a0)               // i_data
    lw      t1, 0(a1)               // f_data

    lw      t2, 8(a0)               // in_image->step_y
    lw      t4, 12(a0)              // in_image->stride_x
    mul     t2, t4, t2              // i_step

    lw      t3, 8(a1)               // filter->step_y
    lw      t5, 12(a1)              // filter->stride_x
    mul     t3, t5, t3              // f_step

    srli    t6, a3, 4               // t6 = count_x / 16

    // Preload the accumulator with the rounding constant 1 << (shift - 1).
    addi    a7, a5, -1
    li      t4, 1
    sll     t4, t4, a7
    esp.zero.xacc
    esp.movx.w.xacc.l t4

.loop_count_y:
    mv      t4, t0                  // t4 walks the input row
    mv      t5, t1                  // t5 walks the filter row
    esp.vld.128.ip q1, t5, 16       // q1 - first 16 bytes of f_data

    esp.lp.setup 0, t6, .loop_count_x
    esp.vld.128.ip q0, t4, 16       // q0 - i_data
    esp.vadd.s8 q3, q2, q1          // q3 = offset + f_data
.loop_count_x: esp.vmulas.s8.xacc.ld.ip q1, t5, 16, q0, q3 // xacc += q0 * q3; q1 - next f_data

    add     t0, t0, t2              // next input row
    add     t1, t1, t3              // next filter row
    add     a4, a4, -1
    bgtz    a4, .loop_count_y

    esp.srs.s.xacc t5, a5           // shift accx register by final_shift amount (a5), save the lower 32bits to t5
    // out_value is int8_t *: store exactly one byte. The previous halfword
    // store (sh) also overwrote the byte following *out_value.
    sb      t5, 0(a2)

    li      a0, 0
    add     sp, sp, 16
    ret
|
||||
|
||||
#endif // dspi_dotprod_arp4_enabled
|
||||
@@ -0,0 +1,417 @@
|
||||
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_aes3_enabled == 1)
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.literal .LC0_1_61, 458755
|
||||
|
||||
# Program Unit: dspi_dotprod_off_u16_aes3
|
||||
.type dspi_dotprod_off_u16_aes3, @function
|
||||
.align 4
|
||||
.global dspi_dotprod_off_u16_aes3
|
||||
// dspi_dotprod_off_u16_aes3 - Xtensa (windowed ABI) uint16 dot product with offset.
//
// Arguments, as forwarded by the call8 thunks below to dspi_dotprod_off_u16_ansi:
//   a2 = in_image, a3 = filter, a4 = out_value, a5 = count_x, a6 = count_y,
//   a7 = shift, 16-bit offset on the stack at a1+144 (after entry).
// Image descriptor words used here: +0 data, +4 step_x, +8 step_y, +12 stride_x,
// +16 stride_y (field names inferred from the matching checks in the ANSI code).
//
// Flow:
//   1. Range checks (step * count > stride) -> return the literal .LC0_1_61.
//   2. Eligibility checks for the vector kernels; anything not eligible is
//      forwarded to dspi_dotprod_off_u16_ansi.
//   3. Width-specific kernels for count_x == 24/16/8/32/64 and a generic kernel
//      for count_x > 64, all accumulating into ACCX.
//   4. Rounded right shift of ACCX by `shift`, 16-bit store, return 0.
//
// Fixes relative to the previous revision (all route to the ANSI fallback):
//   - count_x > 32 now requires count_x % 32 == 0. The old test was
//     count_x & 1, always 0 after the count_x % 8 check, so widths 40/48/56
//     matched no kernel and widths > 64 not divisible by 32 were only partly
//     accumulated by the generic kernel (32 elements per pass).
//   - count_x == 16 requires an even count_y (kernel runs count_y >> 1 passes of
//     two rows); count_x == 8 requires count_y % 4 == 0 (count_y >> 2 passes of
//     four rows). Previously the remaining rows were silently dropped.
dspi_dotprod_off_u16_aes3:
.LBB1_dspi_dotprod_off_u16_aes3:
    entry   a1,144
    // in_image: step_x * count_x must not exceed stride_x
    l32i.n  a10,a2,4
    l32i.n  a12,a2,12
    mull    a8,a10,a5
    blt     a12,a8,.LBB89_dspi_dotprod_off_u16_aes3

    // in_image: step_y * count_y must not exceed stride_y
    l32i.n  a13,a2,8
    l32i.n  a9,a2,16
    mull    a11,a13,a6
    blt     a9,a11,.LBB89_dspi_dotprod_off_u16_aes3

    // filter: step_x * count_x must not exceed stride_x
    l32i.n  a15,a3,4
    l32i.n  a14,a3,12
    mull    a11,a15,a5
    blt     a14,a11,.LBB89_dspi_dotprod_off_u16_aes3

    // filter: step_y * count_y must not exceed stride_y (step_y spilled to a1+104)
    l32i.n  a8,a3,16
    l32i.n  a9,a3,8
    s32i    a9,a1,104
    mull    a9,a9,a6
    blt     a8,a9,.LBB89_dspi_dotprod_off_u16_aes3

    // filter data pointer (spilled to a1+100) must have bit 0 clear
    l32i.n  a8,a3,0
    s32i    a8,a1,100
    bbsi    a8,0,.Lt_0_36354

    // filter rows must be contiguous: stride_x == step_x * count_x
    bne     a14,a11,.Lt_0_36354

    // filter step_x must be 1
    bnei    a15,1,.Lt_0_36354

    // filter step_y == 1 continues with the remaining eligibility checks
    l32i    a9,a1,104
    beqi    a9,1,.Lt_0_19458

    // ANSI fallback: copy a2..a7 into the callee window, pass offset on the stack
.Lt_0_36354:
.Lt_0_19714:
    mov.n   a10,a2
    mov.n   a11,a3
    mov.n   a12,a4
    mov.n   a13,a5
    mov.n   a14,a6
    mov.n   a15,a7
    l16ui   a8,a1,144
    s32i.n  a8,a1,0
    .type   dspi_dotprod_off_u16_ansi, @function
    call8   dspi_dotprod_off_u16_ansi

    mov.n   a2,a10
    retw.n

    // Range-check failure: return the constant stored in .LC0_1_61
.LBB89_dspi_dotprod_off_u16_aes3:
    l32r    a2,.LC0_1_61
    retw.n

    // Remaining eligibility checks; a10/a13 still hold in_image step_x/step_y
.Lt_0_19458:
    addi.n  a9,a10,-1
    bnez    a9,.Lt_0_37122

    addi.n  a10,a13,-1
    bnez    a10,.Lt_0_37122

    // count_x must be a multiple of 8
    extui   a11,a5,0,3
    bnez.n  a11,.Lt_0_37122

    // at least 4 rows
    blti    a6,4,.Lt_0_37122

    // width 16 kernel handles two rows per pass: odd count_y goes to ANSI
    bnei    a5,16,.Lrows_w8_dspi_dotprod_off_u16_aes3
    bbsi    a6,0,.Lt_0_37122
.Lrows_w8_dspi_dotprod_off_u16_aes3:
    // width 8 kernel handles four rows per pass: count_y % 4 != 0 goes to ANSI
    bnei    a5,8,.Lrows_ok_dspi_dotprod_off_u16_aes3
    extui   a9,a6,0,2
    bnez    a9,.Lt_0_37122
.Lrows_ok_dspi_dotprod_off_u16_aes3:

    movi.n  a14,32
    blt     a14,a5,.LBB27_dspi_dotprod_off_u16_aes3

    // Vector path setup. From here a2/a3 no longer hold the original arguments.
.Lt_0_37634:
.Lt_0_21506:
    l16ui   a9,a1,144
    addi    a8,a1,16
    l32i.n  a15,a2,0
    mull    a10,a12,a13
    l32i    a2,a1,100
    slli    a10,a10,1
    s32i    a10,a1,96
    movi.n  a10,2
    // two passes of eight halfword stores: 32 bytes of offset copies at a1+16
    loop    a10,.LBB143_dspi_dotprod_off_u16_aes3

.LBB138_dspi_dotprod_off_u16_aes3:
    s16i    a9,a8,0
    s16i    a9,a8,2
    s16i    a9,a8,4
    s16i    a9,a8,6
    s16i    a9,a8,8
    s16i    a9,a8,10
    s16i    a9,a8,12
    s16i    a9,a8,14
    addi    a8,a8,16

.LBB143_dspi_dotprod_off_u16_aes3:
    mov.n   a3,a6
    addi    a11,a5,-24
    addi    a12,a1,24
    movi.n  a13,0
    // clear the byte shift amount and the accumulator, load offset vector into q6
    wur.sar_byte a13
    wur.accx_0 a13
    wur.accx_1 a13
    ee.vld.128.ip q6,a12,0
    s32i.n  a12,a1,48
    beqz    a11,.LBB34_dspi_dotprod_off_u16_aes3

    // first filter vector, kept in the q0 spill slot at a1+64
    l32i    a2,a1,100
    ee.vld.128.ip q0,a2,16
    st.qr   q0,a1,64

    // Width dispatch. Each kernel jumps back into this chain when done and
    // eventually reaches the epilogue through the count_x <= 64 test.
.Lt_0_25090:
    addi    a14,a5,-16
    beqz    a14,.LBB43_dspi_dotprod_off_u16_aes3

.Lt_0_27138:
.Lt_0_26626:
    addi    a8,a5,-8
    beqz    a8,.LBB50_dspi_dotprod_off_u16_aes3

.Lt_0_28674:
.Lt_0_28162:
    addi    a9,a5,-32
    beqz    a9,.LBB57_dspi_dotprod_off_u16_aes3

.Lt_0_30210:
.Lt_0_29698:
    addi    a10,a5,-64
    beqz    a10,.LBB64_dspi_dotprod_off_u16_aes3

    movi.n  a11,64
    bge     a11,a5,.Lt_0_33026

    // Generic kernel, count_x > 64: a12 counts rows, a13 = count_x / 32 - 1
    movi.n  a12,0
    ee.ld.128.usar.ip q1,a15,16
    ee.ld.128.usar.ip q2,a15,16
    ee.src.q.ld.ip q3,a15,16,q1,q2
    beqz.n  a3,.Lt_0_33026

    ld.qr   q0,a1,64
    slli    a8,a5,1
    l32i    a14,a1,96
    addi    a13,a5,31
    movgez  a13,a5,a5
    srai    a13,a13,5
    sub     a14,a14,a8
    addi    a14,a14,16
    addi.n  a13,a13,-1

.Lt_0_33794:
    beqz.n  a13,.Lt_0_34050

    loopnez a13,.LBB280_dspi_dotprod_off_u16_aes3

    // 32 elements per pass: four multiply pairs (data x filter, data x offset)
.LBB278_dspi_dotprod_off_u16_aes3:
    ee.vmulas.u16.accx.ld.ip.qup q0,a15,16,q0,q1,q2,q3
    ee.vmulas.u16.accx.ld.ip q1,a2,16,q1,q6
    ee.vmulas.u16.accx.ld.ip.qup q1,a15,16,q1,q2,q3,q0
    ee.vmulas.u16.accx.ld.ip q4,a2,16,q2,q6
    ee.vmulas.u16.accx.ld.ip.qup q2,a15,16,q4,q3,q0,q1
    ee.vmulas.u16.accx.ld.ip q4,a2,16,q3,q6
    ee.vmulas.u16.accx.ld.ip.qup q3,a15,16,q4,q0,q1,q2
    ee.vmulas.u16.accx.ld.ip q0,a2,16,q0,q6

.LBB280_dspi_dotprod_off_u16_aes3:

    // last 32 elements of the row; a14 moves the input pointer to the next row
.Lt_0_34050:
    ee.vmulas.u16.accx.ld.ip.qup q4,a15,16,q0,q1,q2,q3
    ee.vmulas.u16.accx.ld.ip q1,a2,16,q1,q6
    movi.n  a9,32
    ee.vmulas.u16.accx.ld.xp.qup q0,a15,a14,q1,q2,q3,q4
    ee.vmulas.u16.accx.ld.ip q7,a2,16,q2,q6
    movi.n  a10,-16
    ee.vmulas.u16.accx.ld.xp.qup q2,a15,a10,q7,q3,q4,q0
    ee.vmulas.u16.accx.ld.ip q5,a2,16,q3,q6
    ee.ld.128.usar.xp q1,a15,a9
    addi.n  a12,a12,1
    ee.vmulas.u16.accx.ld.ip.qup q3,a15,16,q5,q4,q1,q2
    ee.vmulas.u16.accx.ld.ip q0,a2,16,q4,q6
    bne     a12,a3,.Lt_0_33794

    // Epilogue: a10:a9 = ACCX. For shift >= 1 the stored value is
    // ((ACCX >> (shift - 1)) + 1) >> 1; for shift < 1 see .Lt_0_35586.
.Lt_0_33026:
.Lt_0_32770:
    rur.accx_0 a9
    rur.accx_1 a10
    blti    a7,1,.Lt_0_35586

    movi.n  a2,0
    addi    a13,a7,-33
    addi.n  a14,a7,-1
    ssr     a14
    sra     a12,a10
    src     a11,a10,a9
    // shift - 1 >= 32: take the result from the upper word only
    movgez  a11,a12,a13
    addi.n  a11,a11,1
    srli    a11,a11,1
    s16i    a11,a4,0
    retw.n

    // ANSI fallback used by the eligibility checks
.Lt_0_37122:
.Lt_0_20738:
    mov.n   a10,a2
    mov.n   a11,a3
    mov.n   a12,a4
    mov.n   a13,a5
    mov.n   a14,a6
    mov.n   a15,a7
    l16ui   a8,a1,144
    s32i.n  a8,a1,0
    call8   dspi_dotprod_off_u16_ansi

    mov.n   a2,a10
    retw.n

    // count_x > 32: only multiples of 32 are handled by the vector kernels
.LBB27_dspi_dotprod_off_u16_aes3:
    extui   a9,a5,0,5
    beqz    a9,.Lt_0_37634

    mov.n   a10,a2
    mov.n   a11,a3
    mov.n   a12,a4
    mov.n   a13,a5
    mov.n   a14,a6
    mov.n   a15,a7
    l16ui   a8,a1,144
    s32i.n  a8,a1,0
    call8   dspi_dotprod_off_u16_ansi

    mov.n   a2,a10
    retw.n

    // Kernel for count_x == 24: one row (24 elements) per pass, count_y passes
.LBB34_dspi_dotprod_off_u16_aes3:
    ee.ld.128.usar.ip q0,a15,16
    ee.ld.128.usar.ip q2,a15,16
    ee.src.q.ld.ip q3,a15,16,q0,q2
    beqz.n  a6,.Lt_0_25090

    movi.n  a10,32
    l32i    a12,a1,96
    movi.n  a11,-16
    addi    a12,a12,-32
    loopgtz a6,.LBB166_dspi_dotprod_off_u16_aes3

.LBB164_dspi_dotprod_off_u16_aes3:
    ee.vmulas.u16.accx.ld.ip q1,a2,16,q0,q6
    ee.vmulas.u16.accx.ld.xp.qup q1,a15,a12,q1,q0,q2,q3
    ee.vmulas.u16.accx.ld.ip q0,a2,16,q2,q6
    ee.vmulas.u16.accx.ld.xp.qup q2,a15,a11,q0,q2,q3,q1
    ee.vmulas.u16.accx.ld.ip q1,a2,16,q3,q6
    ee.ld.128.usar.xp q0,a15,a10
    ee.vmulas.u16.accx.ld.ip.qup q3,a15,16,q1,q3,q0,q2

.LBB166_dspi_dotprod_off_u16_aes3:
    st.qr   q1,a1,64
    j       .Lt_0_25090

    // Kernel for count_x == 16: two rows per pass, count_y >> 1 passes
.LBB43_dspi_dotprod_off_u16_aes3:
    srli    a3,a6,1
    l32i    a12,a1,96
    ee.ld.128.usar.ip q1,a15,16
    ee.ld.128.usar.ip q2,a15,16
    addi    a12,a12,-16
    ee.src.q.ld.xp q3,a15,a12,q1,q2
    beqz.n  a3,.Lt_0_27138

    ld.qr   q0,a1,64
    movi.n  a10,32
    movi.n  a11,-16
    loopnez a3,.LBB189_dspi_dotprod_off_u16_aes3

.LBB187_dspi_dotprod_off_u16_aes3:
    ee.vmulas.u16.accx.ld.xp.qup q0,a15,a11,q0,q1,q2,q3
    ee.vmulas.u16.accx.ld.ip q3,a2,16,q1,q6
    ee.ld.128.usar.xp q1,a15,a10
    ee.vmulas.u16.accx.ld.xp.qup q3,a15,a12,q3,q2,q1,q0
    ee.vmulas.u16.accx.ld.ip q4,a2,16,q2,q6
    ee.vmulas.u16.accx.ld.xp.qup q2,a15,a11,q4,q1,q0,q3
    ee.vmulas.u16.accx.ld.ip q3,a2,16,q1,q6
    ee.ld.128.usar.xp q1,a15,a10
    ee.vmulas.u16.accx.ld.xp.qup q3,a15,a12,q3,q0,q1,q2
    ee.vmulas.u16.accx.ld.ip q0,a2,16,q0,q6

.LBB189_dspi_dotprod_off_u16_aes3:
    st.qr   q0,a1,64
    j       .Lt_0_27138

    // Kernel for count_x == 8: four rows per pass, count_y >> 2 passes
.LBB50_dspi_dotprod_off_u16_aes3:
    srli    a3,a3,2
    movi.n  a13,-16
    l32i    a11,a1,96
    addi    a15,a15,16
    addi    a11,a11,16
    ee.ld.128.usar.xp q2,a15,a13
    ee.ld.128.usar.xp q1,a15,a11
    ee.src.q.ld.xp q3,a15,a13,q1,q2
    ee.ld.128.usar.xp q2,a15,a11
    beqz.n  a3,.Lt_0_28674

    ld.qr   q0,a1,64
    movi.n  a10,-16
    loopnez a3,.LBB212_dspi_dotprod_off_u16_aes3

.LBB210_dspi_dotprod_off_u16_aes3:
    ee.vmulas.u16.accx.ld.xp.qup q3,a15,a10,q0,q1,q2,q3
    ee.vmulas.u16.accx.ld.ip q0,a2,16,q1,q6
    ee.ld.128.usar.xp q1,a15,a11
    ee.vmulas.u16.accx.ld.xp.qup q3,a15,a10,q0,q2,q1,q3
    ee.vmulas.u16.accx.ld.ip q0,a2,16,q2,q6
    ee.ld.128.usar.xp q4,a15,a11
    ee.vmulas.u16.accx.ld.xp.qup q3,a15,a10,q0,q1,q4,q3
    ee.vmulas.u16.accx.ld.ip q0,a2,16,q1,q6
    ee.ld.128.usar.xp q1,a15,a11
    ee.vmulas.u16.accx.ld.xp.qup q3,a15,a10,q0,q4,q1,q3
    ee.vmulas.u16.accx.ld.ip q0,a2,16,q4,q6
    ee.ld.128.usar.xp q2,a15,a11

.LBB212_dspi_dotprod_off_u16_aes3:
    st.qr   q0,a1,64
    j       .Lt_0_28674

    // Kernel for count_x == 32: one row per pass, a3 passes
.LBB57_dspi_dotprod_off_u16_aes3:
    ee.ld.128.usar.ip q1,a15,16
    ee.ld.128.usar.ip q2,a15,16
    ee.src.q.ld.ip q3,a15,16,q1,q2
    beqz.n  a3,.Lt_0_30210

    ld.qr   q0,a1,64
    movi.n  a10,32
    movi.n  a11,-16
    l32i    a12,a1,96
    slli    a13,a5,1
    sub     a12,a12,a13
    addi    a12,a12,16
    loopnez a3,.LBB235_dspi_dotprod_off_u16_aes3

.LBB233_dspi_dotprod_off_u16_aes3:
    ee.vmulas.u16.accx.ld.ip.qup q0,a15,16,q0,q1,q2,q3
    ee.vmulas.u16.accx.ld.ip q4,a2,16,q1,q6
    ee.vmulas.u16.accx.ld.xp.qup q4,a15,a12,q4,q2,q3,q0
    ee.vmulas.u16.accx.ld.ip q1,a2,16,q2,q6
    ee.vmulas.u16.accx.ld.xp.qup q2,a15,a11,q1,q3,q0,q4
    ee.vmulas.u16.accx.ld.ip q4,a2,16,q3,q6
    ee.ld.128.usar.xp q1,a15,a10
    ee.vmulas.u16.accx.ld.ip.qup q3,a15,16,q4,q0,q1,q2
    ee.vmulas.u16.accx.ld.ip q0,a2,16,q0,q6

.LBB235_dspi_dotprod_off_u16_aes3:
    st.qr   q0,a1,64
    j       .Lt_0_30210

    // Kernel for count_x == 64: one row per pass, a3 passes
.LBB64_dspi_dotprod_off_u16_aes3:
    movi.n  a10,32
    movi.n  a11,-16
    slli    a13,a5,1
    l32i    a12,a1,96
    ee.ld.128.usar.ip q1,a15,16
    ee.ld.128.usar.ip q2,a15,16
    sub     a12,a12,a13
    addi    a12,a12,16
    ld.qr   q0,a1,64
    ee.src.q.ld.ip q3,a15,16,q1,q2
    mov.n   a8,a15
    loopnez a3,.LBB257_dspi_dotprod_off_u16_aes3

.LBB255_dspi_dotprod_off_u16_aes3:
    ee.vmulas.u16.accx.ld.ip.qup q0,a8,16,q0,q1,q2,q3
    ee.vmulas.u16.accx.ld.ip q4,a2,16,q1,q6
    ee.vmulas.u16.accx.ld.ip.qup q4,a8,16,q4,q2,q3,q0
    ee.vmulas.u16.accx.ld.ip q1,a2,16,q2,q6
    ee.vmulas.u16.accx.ld.ip.qup q1,a8,16,q1,q3,q0,q4
    ee.vmulas.u16.accx.ld.ip q5,a2,16,q3,q6
    ee.vmulas.u16.accx.ld.ip.qup q5,a8,16,q5,q0,q4,q1
    ee.vmulas.u16.accx.ld.ip q0,a2,16,q0,q6
    ee.vmulas.u16.accx.ld.ip.qup q0,a8,16,q0,q4,q1,q5
    ee.vmulas.u16.accx.ld.ip q4,a2,16,q4,q6
    ee.vmulas.u16.accx.ld.xp.qup q4,a8,a12,q4,q1,q5,q0
    ee.vmulas.u16.accx.ld.ip q1,a2,16,q1,q6
    ee.vmulas.u16.accx.ld.xp.qup q2,a8,a11,q1,q5,q0,q4
    ee.vmulas.u16.accx.ld.ip q4,a2,16,q5,q6
    ee.ld.128.usar.xp q1,a8,a10
    ee.vmulas.u16.accx.ld.ip.qup q3,a8,16,q4,q0,q1,q2
    ee.vmulas.u16.accx.ld.ip q0,a2,16,q0,q6

.LBB257_dspi_dotprod_off_u16_aes3:
    j       .Lt_0_33026

    // shift < 1: store the low 16 bits of ACCX without rounding or shifting
.Lt_0_35586:
    movi.n  a2,0
    sext    a14,a9,15
    s16i    a14,a4,0
    retw.n
|
||||
|
||||
#endif // dspi_dotprod_aes3_enabled
|
||||
@@ -0,0 +1,49 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
|
||||
/**
 * Reference (plain C) 2D dot product with offset for uint16 data.
 *
 * Accumulates in_image[y][x] * (filter[y][x] + offset) over count_y rows and
 * count_x columns in a 64-bit accumulator, applies a rounded right shift by
 * `shift` and stores the result, converted to uint16_t, in *out_value.
 *
 * @param in_image   input image descriptor (data, step_x/step_y, stride_x/stride_y)
 * @param filter     filter image descriptor (same layout as in_image)
 * @param out_value  receives the result
 * @param count_x    number of columns to process
 * @param count_y    number of rows to process
 * @param shift      right shift applied to the accumulator; when shift <= 0 no
 *                   rounding and no shift are applied (shift must be below 64)
 * @param offset     value added to every filter element before multiplication
 *
 * @return ESP_OK on success, ESP_ERR_DSP_PARAM_OUTOFRANGE when step * count
 *         exceeds the corresponding stride of either image.
 */
esp_err_t dspi_dotprod_off_u16_ansi(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift, uint16_t offset)
{
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    uint16_t *i_data = (uint16_t *)in_image->data;
    uint16_t *f_data = (uint16_t *)filter->data;
    /* Distance, in elements, between two consecutive processed rows. */
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;

    int64_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            /* A single term can reach 65535 * (65535 + 65535), which does not
             * fit in 32 bits, so the product is formed in 64 bits. */
            acc += (int64_t)i_data[in_image->step_x * x] * ((int64_t)f_data[filter->step_x * x] + (int64_t)offset);
        }
        i_data += i_step;
        f_data += f_step;
    }

    if (shift > 0) {
        /* Round to nearest. The constant is built as a 64-bit value: the former
         * `1 << (shift - 1)` was an int shift, undefined for shift == 0 and
         * for shift >= 32. */
        acc += (int64_t)1 << (shift - 1);
        acc >>= shift;
    }
    *out_value = (uint16_t)acc;
    return ESP_OK;
}
|
||||
@@ -0,0 +1,104 @@
|
||||
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_arp4_enabled == 1)
|
||||
#include "dsp_err_codes.h"
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.global dspi_dotprod_off_u16_arp4
|
||||
.global dspi_dotprod_off_u16_ansi
|
||||
.type dspi_dotprod_off_u16_arp4,@function
|
||||
|
||||
// esp_err_t dspi_dotprod_off_u16_arp4(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift, uint16_t offset);
//
// Vector implementation of the uint16 dot product with offset. The vector path
// runs only when in_image->step_x == 1, filter->step_x == 1 and count_x is a
// multiple of 8; every other call is forwarded unchanged to
// dspi_dotprod_off_u16_ansi. The vector path returns 0 in a0.
//
// Arguments:  a0 in_image, a1 filter, a2 out_value, a3 count_x, a4 count_y,
//             a5 shift, a6 offset
// Scratch:    t0/t1 start of the current input/filter row
//             t2/t3 input/filter row step in bytes
//             t4/t5 running input/filter pointers inside a row
//             t6    count_x / 8 (hardware loop count), a7 shift - 1
dspi_dotprod_off_u16_arp4:
    // Eligibility: ((step_x | step_x) - 1) | (count_x & 7) must be zero.
    lw      t1, 4(a0)                   // in_image->step_x
    lw      t2, 4(a1)                   // filter->step_x
    or      t1, t1, t2
    addi    t1, t1, -1
    andi    t2, a3, 7
    or      t1, t1, t2
    bnez    t1, .dspi_dotprod_off_u16_arp4_ansi

    // Broadcast the offset through a 16-byte stack slot into q2.
    addi    sp, sp, -16
    sw      a6, 0(sp)
    mv      t6, sp
    esp.vldbc.16.ip q2, t6, 0

    // Row steps: stride_x * step_y elements, doubled to get bytes.
    lw      t2, 8(a0)                   // in_image->step_y
    lw      t4, 12(a0)                  // in_image->stride_x
    mul     t2, t4, t2
    slli    t2, t2, 1
    lw      t3, 8(a1)                   // filter->step_y
    lw      t5, 12(a1)                  // filter->stride_x
    mul     t3, t5, t3
    slli    t3, t3, 1

    lw      t0, 0(a0)                   // in_image->data
    lw      t1, 0(a1)                   // filter->data

    srli    t6, a3, 3                   // t6 = count_x / 8

    // Start the accumulator at the rounding constant 1 << (shift - 1).
    addi    a7, a5, -1
    li      t4, 1
    sll     t4, t4, a7
    esp.zero.xacc
    esp.movx.w.xacc.l t4

.dspi_dotprod_off_u16_arp4_row:
    mv      t4, t0
    mv      t5, t1
    esp.vld.128.ip q1, t5, 16           // q1 = first filter vector of the row

    esp.lp.setup 0, t6, .dspi_dotprod_off_u16_arp4_col
    esp.vld.128.ip q0, t4, 16           // q0 = input vector
    esp.vadd.u16 q3, q2, q1             // q3 = offset + filter
.dspi_dotprod_off_u16_arp4_col:
    esp.vmulas.u16.xacc.ld.ip q1, t5, 16, q0, q3    // xacc += q0 * q3; q1 = next filter vector

    add     t0, t0, t2                  // advance both row bases
    add     t1, t1, t3
    addi    a4, a4, -1
    bgtz    a4, .dspi_dotprod_off_u16_arp4_row

    esp.srs.u.xacc t5, a5               // t5 = low 32 bits of xacc shifted right by a5
    sh      t5, 0(a2)                   // 16-bit result

    addi    sp, sp, 16
    li      a0, 0
    ret

.dspi_dotprod_off_u16_arp4_ansi:
    j       dspi_dotprod_off_u16_ansi   // tail jump, a0..a6 still hold the arguments
|
||||
|
||||
#endif // dspi_dotprod_arp4_enabled
|
||||
@@ -0,0 +1,407 @@
|
||||
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_aes3_enabled == 1)
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.literal .LC0_1_57, 458755
|
||||
|
||||
# Program Unit: dspi_dotprod_off_u8_aes3
|
||||
.type dspi_dotprod_off_u8_aes3, @function
|
||||
.align 4
|
||||
.global dspi_dotprod_off_u8_aes3
|
||||
// ---------------------------------------------------------------------------
// dspi_dotprod_off_u8_aes3 - entry, parameter validation and path selection.
//
// Register use, as shown by the loads below and by the unchanged forwarding
// of a2..a7 to the scalar routine further down:
//   a2 = in_image, a3 = filter, a4 = out_value (written with s8i at the end),
//   a5 = count_x,  a6 = count_y, a7 = shift (used with ssl/ssr at the end).
//   The 8-bit offset is read later from the stack slot at a1+112.
// Structure offsets used here: +0 data, +4 step_x, +8 step_y, +12 stride_x,
// +16 stride_y (names follow the matching checks in the C reference).
//
// Outcome of this section:
//   - any range check fails            -> .LBB86 (returns the error literal)
//   - filter layout not suitable       -> .Lt_0_35330 (scalar routine)
//   - otherwise                        -> .Lt_0_18946 (further vector checks)
// ---------------------------------------------------------------------------
dspi_dotprod_off_u8_aes3: # 0x4
|
||||
|
||||
.LBB1_dspi_dotprod_off_u8_aes3: # 0x4
|
||||
// Windowed-ABI prologue reserving a 112-byte frame.
entry a1,112 #
|
||||
// Check 1: in_image->step_x * count_x must not exceed in_image->stride_x.
l32i.n a10,a2,4 # [0] id:745
|
||||
l32i.n a12,a2,12 # [1] id:744
|
||||
mull a8,a10,a5 # [2]
|
||||
blt a12,a8,.LBB86_dspi_dotprod_off_u8_aes3 # [4]
|
||||
|
||||
// Check 2: in_image->step_y * count_y must not exceed in_image->stride_y.
l32i.n a13,a2,8 # [0] id:746
|
||||
l32i.n a9,a2,16 # [1] id:747
|
||||
mull a11,a13,a6 # [2]
|
||||
blt a9,a11,.LBB86_dspi_dotprod_off_u8_aes3 # [4]
|
||||
|
||||
// Check 3: filter->step_x * count_x must not exceed filter->stride_x.
// a11 (the product) and a14 (filter->stride_x) are reused by the bne below.
l32i.n a15,a3,4 # [0] id:749
|
||||
l32i.n a14,a3,12 # [1] id:748
|
||||
mull a11,a15,a5 # [2]
|
||||
blt a14,a11,.LBB86_dspi_dotprod_off_u8_aes3 # [4]
|
||||
|
||||
// Check 4: filter->step_y * count_y must not exceed filter->stride_y.
// filter->step_y is spilled to a1+72 before a9 is overwritten by the product.
l32i.n a8,a3,16 # [0] id:751
|
||||
l32i.n a9,a3,8 # [1] id:750
|
||||
s32i a9,a1,72 # [2] gra_spill_temp_2
|
||||
mull a9,a9,a6 # [3]
|
||||
blt a8,a9,.LBB86_dspi_dotprod_off_u8_aes3 # [5]
|
||||
|
||||
// filter->data is kept at a1+68; an odd data address (bit 0 set) goes scalar.
l32i.n a8,a3,0 # [0] id:752
|
||||
s32i a8,a1,68 # [1] gra_spill_temp_1
|
||||
bbsi a8,0,.Lt_0_35330 # [2]
|
||||
|
||||
// Vector path needs filter->stride_x == filter->step_x * count_x ...
bne a14,a11,.Lt_0_35330 # [0]
|
||||
|
||||
// ... and filter->step_x == 1 ...
bnei a15,1,.Lt_0_35330 # [0]
|
||||
|
||||
// ... and filter->step_y == 1; otherwise fall through into the scalar call.
l32i a11,a1,72 # [0] gra_spill_temp_2
|
||||
beqi a11,1,.Lt_0_18946 # [2]
|
||||
|
||||
.Lt_0_35330:
.Lt_0_19202:
        // Scalar fallback, taken when the filter layout does not suit the
        // vector path. The offset-aware reference routine must be used here:
        // the vector path accumulates image * (filter + offset), so calling
        // the plain dspi_dotprod_u8_ansi (as before) silently dropped the
        // offset and produced a different result whenever offset != 0.
        //
        // The callee takes seven arguments: six in a10..a15 (they become its
        // a2..a7 after the call8 window rotation) and the seventh in the
        // first outgoing stack slot at a1+0. That slot is free here: the
        // frame bytes at a1+0..31 are only used later as the offset scratch
        // buffer of the vector path, which this branch never reaches.
        l8ui    a8,a1,112                       // offset = 7th incoming argument (caller stack)
        s32i.n  a8,a1,0                         // 7th outgoing argument, zero-extended
        mov.n   a10,a2                          // in_image
        mov.n   a11,a3                          // filter
        mov.n   a12,a4                          // out_value
        mov.n   a13,a5                          // count_x
        mov.n   a14,a6                          // count_y
        mov.n   a15,a7                          // shift
        .type   dspi_dotprod_off_u8_ansi, @function
        call8   dspi_dotprod_off_u8_ansi

        mov.n   a2,a10                          // propagate the callee status code
        retw.n
|
||||
|
||||
// Error exit shared by the four range checks at function entry: returns the
// literal .LC0_1_57 (458755 = 0x70003), presumably ESP_ERR_DSP_PARAM_OUTOFRANGE
// as returned by the C reference for the same checks - confirm in dsp_err_codes.h.
.LBB86_dspi_dotprod_off_u8_aes3: # 0x59
|
||||
l32r a2,.LC0_1_57 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
// Second stage of the vector-path eligibility test. a10 and a13 still hold
// in_image->step_x and in_image->step_y loaded at function entry.
// Any failing test branches to the scalar call at .Lt_0_36098.
.Lt_0_18946: # 0x5e
|
||||
// in_image->step_x must be 1.
addi.n a14,a10,-1 # [0]
|
||||
bnez a14,.Lt_0_36098 # [1]
|
||||
|
||||
// in_image->step_y must be 1.
addi.n a15,a13,-1 # [0]
|
||||
bnez a15,.Lt_0_36098 # [1]
|
||||
|
||||
// count_x must be a multiple of 16 (low four bits clear).
extui a8,a5,0,4 # [0]
|
||||
bnez.n a8,.Lt_0_36098 # [1]
|
||||
|
||||
// count_y must be at least 4.
blti a6,4,.Lt_0_36098 # [0]
|
||||
|
||||
// count_x > 64 takes one extra test at .LBB27; otherwise fall through to the
// vector setup at .Lt_0_36610.
movi.n a9,64 # [0]
|
||||
blt a9,a5,.LBB27_dspi_dotprod_off_u8_aes3 # [1]
|
||||
|
||||
// Vector-path setup followed by the dispatch on count_x.
// Registers established here and used by the kernels:
//   a15 = in_image->data, a2 = filter->data (reloaded from a1+68),
//   a3  = count_y, q6 = vector loaded from the offset scratch buffer,
//   a1+64 = in_image->stride_x * in_image->step_y (a12 * a13 from entry).
.Lt_0_36610: # 0x75
|
||||
.Lt_0_20994: # 0x75
|
||||
// a9 = offset byte taken from the caller stack slot (frame size is 112).
l8ui a9,a1,112 # [0] id:754 offset+0x0
|
||||
// a8 walks the scratch buffer that starts at the bottom of the frame.
mov.n a8,a1 # [1]
|
||||
l32i.n a15,a2,0 # [2] id:753
|
||||
mull a10,a12,a13 # [3]
|
||||
l32i a2,a1,68 # [4] gra_spill_temp_1
|
||||
s32i a10,a1,64 # [5] gra_spill_temp_0
|
||||
movi.n a10,4 # [6]
|
||||
# loop-count fixed at 4
|
||||
// Zero-overhead loop: 4 iterations x 8 byte stores fill a1+0 .. a1+31 with
// copies of the offset byte.
loop a10,.LBB140_dspi_dotprod_off_u8_aes3 # [7]
|
||||
|
||||
.LBB135_dspi_dotprod_off_u8_aes3: # 0x8a
|
||||
s8i a9,a8,0 # [0*II+0] id:755 temp_offset+0x0
|
||||
s8i a9,a8,1 # [0*II+1] id:755 temp_offset+0x0
|
||||
s8i a9,a8,2 # [0*II+2] id:755 temp_offset+0x0
|
||||
s8i a9,a8,3 # [0*II+3] id:755 temp_offset+0x0
|
||||
s8i a9,a8,4 # [0*II+4] id:755 temp_offset+0x0
|
||||
s8i a9,a8,5 # [0*II+5] id:755 temp_offset+0x0
|
||||
s8i a9,a8,6 # [0*II+6] id:755 temp_offset+0x0
|
||||
s8i a9,a8,7 # [0*II+7] id:755 temp_offset+0x0
|
||||
addi.n a8,a8,8 # [0*II+8]
|
||||
|
||||
.LBB140_dspi_dotprod_off_u8_aes3: # 0xa4
|
||||
// a3 = count_y (row counter for the kernels); a11 = count_x - 48.
mov.n a3,a6 # [0]
|
||||
addi a11,a5,-48 # [1]
|
||||
addi.n a12,a1,8 # [3] temp_offset+8
|
||||
// Clear both halves of the ACCX accumulator.
movi.n a13,0 # [4]
|
||||
wur.accx_0 a13 # [5]
|
||||
wur.accx_1 a13 # [6]
|
||||
// q6 = 128-bit load from the scratch buffer, i.e. the offset byte in every
// lane; the kernels multiply image data by q6 to add the offset term.
ee.vld.128.ip q6,a12,0 # [7] id:756
|
||||
s32i.n a12,a1,32 # [8] offset_data_ptr
|
||||
// count_x == 48 -> dedicated kernel at .LBB34 (which rejoins at .Lt_0_24578).
beqz a11,.LBB34_dspi_dotprod_off_u8_aes3 # [9]
|
||||
|
||||
// All other widths: preload the first 16 filter bytes into q0, advance the
// filter pointer by 16 and keep a copy of q0 at a1+48 for the kernels.
l32i a2,a1,68 # [0] gra_spill_temp_1
|
||||
ee.vld.128.ip q0,a2,16 # [2] id:771
|
||||
st.qr q0,a1,48 # [3] q0
|
||||
|
||||
.Lt_0_24578: # 0xc3
|
||||
// count_x == 32 -> .LBB43
addi a14,a5,-32 # [0]
|
||||
beqz a14,.LBB43_dspi_dotprod_off_u8_aes3 # [1]
|
||||
|
||||
.Lt_0_26626: # 0xc9
|
||||
.Lt_0_26114: # 0xc9
|
||||
// count_x == 16 -> .LBB50
addi a8,a5,-16 # [0]
|
||||
beqz a8,.LBB50_dspi_dotprod_off_u8_aes3 # [1]
|
||||
|
||||
.Lt_0_28162: # 0xcf
|
||||
.Lt_0_27650: # 0xcf
|
||||
// count_x == 64 -> .LBB57
addi a9,a5,-64 # [0]
|
||||
beqz a9,.LBB57_dspi_dotprod_off_u8_aes3 # [1]
|
||||
|
||||
.Lt_0_29698: # 0xd5
|
||||
.Lt_0_29186: # 0xd5
|
||||
// count_x == 128 -> .LBB64
addi a10,a5,-128 # [0]
|
||||
beqz a10,.LBB64_dspi_dotprod_off_u8_aes3 # [1]
|
||||
|
||||
// count_x <= 128 -> result store at .Lt_0_32514; larger widths continue into
// the generic loop that follows.
// NOTE(review): for count_x of 80, 96 or 112 none of the equality branches
// above is taken, so this bge reaches the store with ACCX still zero -
// confirm whether those widths are meant to reach the vector path.
movi a11,128 # [0]
|
||||
bge a11,a5,.Lt_0_32514 # [1]
|
||||
|
||||
movi.n a12,0 # [0]
|
||||
ee.ld.128.usar.ip q1,a15,16 # [1] id:833
|
||||
ee.ld.128.usar.ip q2,a15,16 # [2] id:834
|
||||
ee.src.q.ld.ip q3,a15,16,q1,q2 # [4] id:835
|
||||
beqz.n a3,.Lt_0_32514 # [5]
|
||||
|
||||
ld.qr q0,a1,48 # [0] q0
|
||||
l32i a14,a1,64 # [1] gra_spill_temp_0
|
||||
addi a13,a5,31 # [2]
|
||||
movgez a13,a5,a5 # [3]
|
||||
srai a13,a13,5 # [4]
|
||||
sub a14,a14,a5 # [5]
|
||||
addi a14,a14,16 # [6]
|
||||
addi.n a13,a13,-1 # [7]
|
||||
|
||||
.Lt_0_33282: # 0x105
|
||||
beqz.n a13,.Lt_0_33538 # [0]
|
||||
|
||||
loopnez a13,.LBB277_dspi_dotprod_off_u8_aes3 # [0]
|
||||
|
||||
.LBB275_dspi_dotprod_off_u8_aes3: # 0x10a
|
||||
ee.vmulas.u8.accx.ld.ip.qup q0,a15,16,q0,q1,q2,q3 # [0*II+0] id:836
|
||||
ee.vmulas.u8.accx.ld.ip q1,a2,16,q1,q6 # [0*II+1] id:837
|
||||
ee.vmulas.u8.accx.ld.ip.qup q1,a15,16,q1,q2,q3,q0 # [0*II+3] id:838
|
||||
ee.vmulas.u8.accx.ld.ip q4,a2,16,q2,q6 # [0*II+4] id:839
|
||||
ee.vmulas.u8.accx.ld.ip.qup q2,a15,16,q4,q3,q0,q1 # [0*II+6] id:840
|
||||
ee.vmulas.u8.accx.ld.ip q4,a2,16,q3,q6 # [0*II+7] id:841
|
||||
ee.vmulas.u8.accx.ld.ip.qup q3,a15,16,q4,q0,q1,q2 # [0*II+9] id:842
|
||||
ee.vmulas.u8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+10] id:843
|
||||
|
||||
.LBB277_dspi_dotprod_off_u8_aes3: # 0x12a
|
||||
|
||||
.Lt_0_33538: # 0x12a
|
||||
ee.vmulas.u8.accx.ld.ip.qup q4,a15,16,q0,q1,q2,q3 # [0] id:844
|
||||
ee.vmulas.u8.accx.ld.ip q1,a2,16,q1,q6 # [1] id:845
|
||||
movi.n a8,32 # [2]
|
||||
ee.vmulas.u8.accx.ld.xp.qup q0,a15,a14,q1,q2,q3,q4 # [3] id:846
|
||||
ee.vmulas.u8.accx.ld.ip q7,a2,16,q2,q6 # [4] id:847
|
||||
movi.n a9,-16 # [5]
|
||||
ee.vmulas.u8.accx.ld.xp.qup q2,a15,a9,q7,q3,q4,q0 # [6] id:848
|
||||
ee.vmulas.u8.accx.ld.ip q5,a2,16,q3,q6 # [7] id:850
|
||||
ee.ld.128.usar.xp q1,a15,a8 # [8] id:849
|
||||
addi.n a12,a12,1 # [9]
|
||||
ee.vmulas.u8.accx.ld.ip.qup q3,a15,16,q5,q4,q1,q2 # [10] id:851
|
||||
ee.vmulas.u8.accx.ld.ip q0,a2,16,q4,q6 # [11] id:852
|
||||
bne a12,a3,.Lt_0_33282 # [12]
|
||||
|
||||
// Result store: *out_value = (uint8)((ACCX_low32 + (1 << (shift - 1))) >> shift),
// then return 0 in a2 (the value the C reference returns as ESP_OK).
.Lt_0_32514: # 0x156
|
||||
.Lt_0_32258: # 0x156
|
||||
movi.n a2,0 # [0]
|
||||
// Only the low 32 bits of the accumulator are used for the 8-bit result.
rur.accx_0 a10 # [1]
|
||||
// a11 = 1 << (shift - 1): rounding constant (ssl sets the left-shift amount).
addi.n a12,a7,-1 # [2]
|
||||
movi.n a11,1 # [3]
|
||||
ssl a12 # [4]
|
||||
sll a11,a11 # [5]
|
||||
// Add the rounding constant, then arithmetic shift right by shift.
ssr a7 # [6]
|
||||
add.n a10,a10,a11 # [7]
|
||||
sra a10,a10 # [8]
|
||||
// Store the low byte through out_value (a4).
s8i a10,a4,0 # [9] id:854
|
||||
retw.n # [10]
|
||||
|
||||
.Lt_0_36098:
.Lt_0_20226:
        // Scalar fallback, taken when the image layout, count_x alignment or
        // count_y does not suit the vector path. The offset-aware reference
        // routine must be used: calling the plain dspi_dotprod_u8_ansi (as
        // before) dropped the offset term that the vector path includes.
        //
        // Seven arguments: six in a10..a15 (the callee's a2..a7 after the
        // call8 window rotation) and the seventh in the first outgoing stack
        // slot at a1+0. The frame bytes at a1+0..31 are not in use on this
        // path (the offset scratch buffer is filled only after these checks).
        l8ui    a8,a1,112                       // offset = 7th incoming argument (caller stack)
        s32i.n  a8,a1,0                         // 7th outgoing argument, zero-extended
        mov.n   a10,a2                          // in_image
        mov.n   a11,a3                          // filter
        mov.n   a12,a4                          // out_value
        mov.n   a13,a5                          // count_x
        mov.n   a14,a6                          // count_y
        mov.n   a15,a7                          // shift
        .type   dspi_dotprod_off_u8_ansi, @function
        call8   dspi_dotprod_off_u8_ansi

        mov.n   a2,a10                          // propagate the callee status code
        retw.n
|
||||
|
||||
.LBB27_dspi_dotprod_off_u8_aes3:
        // Reached only when count_x > 64. An even count_x returns to the
        // vector setup; an odd one uses the scalar routine.
        // NOTE(review): count_x was already required to be a multiple of 16
        // before branching here, so bit 0 is always clear and the scalar call
        // below looks unreachable - confirm the intended width test.
        extui   a14,a5,0,1                      // a14 = count_x & 1
        beqz    a14,.Lt_0_36610

        // Scalar fallback through the offset-aware reference routine (the
        // plain dspi_dotprod_u8_ansi used before ignored the offset).
        // The 7th argument travels in the first outgoing stack slot at a1+0,
        // which is unused at this point of the function.
        l8ui    a8,a1,112                       // offset = 7th incoming argument (caller stack)
        s32i.n  a8,a1,0                         // 7th outgoing argument, zero-extended
        mov.n   a10,a2                          // in_image
        mov.n   a11,a3                          // filter
        mov.n   a12,a4                          // out_value
        mov.n   a13,a5                          // count_x
        mov.n   a14,a6                          // count_y
        mov.n   a15,a7                          // shift
        .type   dspi_dotprod_off_u8_ansi, @function
        call8   dspi_dotprod_off_u8_ansi

        mov.n   a2,a10                          // propagate the callee status code
        retw.n
|
||||
|
||||
.LBB34_dspi_dotprod_off_u8_aes3: # 0x19e
|
||||
ee.ld.128.usar.ip q0,a15,16 # [0] id:760
|
||||
ee.ld.128.usar.ip q2,a15,16 # [1] id:761
|
||||
ee.src.q.ld.ip q3,a15,16,q0,q2 # [3] id:762
|
||||
beqz.n a6,.Lt_0_24578 # [4]
|
||||
|
||||
movi.n a10,32 # [0]
|
||||
l32i a12,a1,64 # [1] gra_spill_temp_0
|
||||
movi.n a11,-16 # [2]
|
||||
addi a12,a12,-32 # [3]
|
||||
loopgtz a6,.LBB163_dspi_dotprod_off_u8_aes3 # [4]
|
||||
|
||||
.LBB161_dspi_dotprod_off_u8_aes3: # 0x1b6
|
||||
ee.vmulas.u8.accx.ld.ip q1,a2,16,q0,q6 # [0*II+0] id:763
|
||||
ee.vmulas.u8.accx.ld.xp.qup q1,a15,a12,q1,q0,q2,q3 # [0*II+2] id:764
|
||||
ee.vmulas.u8.accx.ld.ip q0,a2,16,q2,q6 # [0*II+3] id:765
|
||||
ee.vmulas.u8.accx.ld.xp.qup q2,a15,a11,q0,q2,q3,q1 # [0*II+5] id:766
|
||||
ee.vmulas.u8.accx.ld.ip q1,a2,16,q3,q6 # [0*II+6] id:768
|
||||
ee.ld.128.usar.xp q0,a15,a10 # [0*II+7] id:767
|
||||
ee.vmulas.u8.accx.ld.ip.qup q3,a15,16,q1,q3,q0,q2 # [0*II+9] id:769
|
||||
|
||||
.LBB163_dspi_dotprod_off_u8_aes3: # 0x1d1
|
||||
st.qr q1,a1,48 # [0] q0
|
||||
j .Lt_0_24578 # [1]
|
||||
|
||||
.LBB43_dspi_dotprod_off_u8_aes3: # 0x1d7
|
||||
srli a3,a6,1 # [0]
|
||||
l32i a12,a1,64 # [1] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a15,16 # [2] id:772
|
||||
ee.ld.128.usar.ip q2,a15,16 # [3] id:773
|
||||
addi a12,a12,-16 # [5]
|
||||
ee.src.q.ld.xp q3,a15,a12,q1,q2 # [6] id:774
|
||||
beqz.n a3,.Lt_0_26626 # [7]
|
||||
|
||||
ld.qr q0,a1,48 # [0] q0
|
||||
movi.n a10,32 # [1]
|
||||
movi.n a11,-16 # [2]
|
||||
loopnez a3,.LBB186_dspi_dotprod_off_u8_aes3 # [3]
|
||||
|
||||
.LBB184_dspi_dotprod_off_u8_aes3: # 0x1f5
|
||||
ee.vmulas.u8.accx.ld.xp.qup q0,a15,a11,q0,q1,q2,q3 # [0*II+0] id:775
|
||||
ee.vmulas.u8.accx.ld.ip q3,a2,16,q1,q6 # [0*II+1] id:776
|
||||
ee.ld.128.usar.xp q1,a15,a10 # [0*II+2] id:777
|
||||
ee.vmulas.u8.accx.ld.xp.qup q3,a15,a12,q3,q2,q1,q0 # [0*II+4] id:778
|
||||
ee.vmulas.u8.accx.ld.ip q4,a2,16,q2,q6 # [0*II+5] id:779
|
||||
ee.vmulas.u8.accx.ld.xp.qup q2,a15,a11,q4,q1,q0,q3 # [0*II+7] id:780
|
||||
ee.vmulas.u8.accx.ld.ip q3,a2,16,q1,q6 # [0*II+8] id:781
|
||||
ee.ld.128.usar.xp q1,a15,a10 # [0*II+9] id:782
|
||||
ee.vmulas.u8.accx.ld.xp.qup q3,a15,a12,q3,q0,q1,q2 # [0*II+11] id:783
|
||||
ee.vmulas.u8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+12] id:784
|
||||
|
||||
.LBB186_dspi_dotprod_off_u8_aes3: # 0x21b
|
||||
st.qr q0,a1,48 # [0] q0
|
||||
j .Lt_0_26626 # [1]
|
||||
|
||||
.LBB50_dspi_dotprod_off_u8_aes3: # 0x221
|
||||
srli a3,a3,2 # [0]
|
||||
movi.n a13,-16 # [1]
|
||||
l32i a11,a1,64 # [2] gra_spill_temp_0
|
||||
addi a15,a15,16 # [3]
|
||||
addi a11,a11,16 # [4]
|
||||
ee.ld.128.usar.xp q2,a15,a13 # [5] id:785
|
||||
ee.ld.128.usar.xp q1,a15,a11 # [6] id:786
|
||||
ee.src.q.ld.xp q3,a15,a13,q1,q2 # [8] id:787
|
||||
ee.ld.128.usar.xp q2,a15,a11 # [9] id:788
|
||||
beqz.n a3,.Lt_0_28162 # [10]
|
||||
|
||||
ld.qr q0,a1,48 # [0] q0
|
||||
movi.n a10,-16 # [1]
|
||||
loopnez a3,.LBB209_dspi_dotprod_off_u8_aes3 # [2]
|
||||
|
||||
.LBB207_dspi_dotprod_off_u8_aes3: # 0x245
|
||||
ee.vmulas.u8.accx.ld.xp.qup q3,a15,a10,q0,q1,q2,q3 # [0*II+0] id:789
|
||||
ee.vmulas.u8.accx.ld.ip q0,a2,16,q1,q6 # [0*II+1] id:790
|
||||
ee.ld.128.usar.xp q1,a15,a11 # [0*II+2] id:791
|
||||
ee.vmulas.u8.accx.ld.xp.qup q3,a15,a10,q0,q2,q1,q3 # [0*II+4] id:792
|
||||
ee.vmulas.u8.accx.ld.ip q0,a2,16,q2,q6 # [0*II+5] id:793
|
||||
ee.ld.128.usar.xp q4,a15,a11 # [0*II+6] id:794
|
||||
ee.vmulas.u8.accx.ld.xp.qup q3,a15,a10,q0,q1,q4,q3 # [0*II+8] id:795
|
||||
ee.vmulas.u8.accx.ld.ip q0,a2,16,q1,q6 # [0*II+9] id:796
|
||||
ee.ld.128.usar.xp q1,a15,a11 # [0*II+10] id:797
|
||||
ee.vmulas.u8.accx.ld.xp.qup q3,a15,a10,q0,q4,q1,q3 # [0*II+12] id:798
|
||||
ee.vmulas.u8.accx.ld.ip q0,a2,16,q4,q6 # [0*II+13] id:799
|
||||
ee.ld.128.usar.xp q2,a15,a11 # [0*II+14] id:800
|
||||
|
||||
.LBB209_dspi_dotprod_off_u8_aes3: # 0x271
|
||||
st.qr q0,a1,48 # [0] q0
|
||||
j .Lt_0_28162 # [1]
|
||||
|
||||
.LBB57_dspi_dotprod_off_u8_aes3: # 0x277
|
||||
ee.ld.128.usar.ip q1,a15,16 # [0] id:801
|
||||
ee.ld.128.usar.ip q2,a15,16 # [1] id:802
|
||||
ee.src.q.ld.ip q3,a15,16,q1,q2 # [3] id:803
|
||||
beqz.n a3,.Lt_0_29698 # [4]
|
||||
|
||||
ld.qr q0,a1,48 # [0] q0
|
||||
movi.n a10,32 # [1]
|
||||
l32i a12,a1,64 # [2] gra_spill_temp_0
|
||||
movi.n a11,-16 # [3]
|
||||
sub a12,a12,a5 # [4]
|
||||
addi a12,a12,16 # [5]
|
||||
loopnez a3,.LBB232_dspi_dotprod_off_u8_aes3 # [6]
|
||||
|
||||
.LBB230_dspi_dotprod_off_u8_aes3: # 0x295
|
||||
ee.vmulas.u8.accx.ld.ip.qup q0,a15,16,q0,q1,q2,q3 # [0*II+0] id:804
|
||||
ee.vmulas.u8.accx.ld.ip q4,a2,16,q1,q6 # [0*II+1] id:805
|
||||
ee.vmulas.u8.accx.ld.xp.qup q4,a15,a12,q4,q2,q3,q0 # [0*II+3] id:806
|
||||
ee.vmulas.u8.accx.ld.ip q1,a2,16,q2,q6 # [0*II+4] id:807
|
||||
ee.vmulas.u8.accx.ld.xp.qup q2,a15,a11,q1,q3,q0,q4 # [0*II+6] id:808
|
||||
ee.vmulas.u8.accx.ld.ip q4,a2,16,q3,q6 # [0*II+7] id:809
|
||||
ee.ld.128.usar.xp q1,a15,a10 # [0*II+8] id:810
|
||||
ee.vmulas.u8.accx.ld.ip.qup q3,a15,16,q4,q0,q1,q2 # [0*II+10] id:811
|
||||
ee.vmulas.u8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+11] id:812
|
||||
|
||||
.LBB232_dspi_dotprod_off_u8_aes3: # 0x2b8
|
||||
st.qr q0,a1,48 # [0] q0
|
||||
j .Lt_0_29698 # [1]
|
||||
|
||||
.LBB64_dspi_dotprod_off_u8_aes3: # 0x2be
|
||||
movi.n a10,32 # [0]
|
||||
movi.n a11,-16 # [1]
|
||||
l32i a12,a1,64 # [2] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a15,16 # [3] id:813
|
||||
ee.ld.128.usar.ip q2,a15,16 # [4] id:814
|
||||
sub a12,a12,a5 # [6]
|
||||
addi a12,a12,16 # [7]
|
||||
ld.qr q0,a1,48 # [8] q0
|
||||
ee.src.q.ld.ip q3,a15,16,q1,q2 # [9] id:815
|
||||
mov.n a8,a15 # [10]
|
||||
loopnez a3,.LBB254_dspi_dotprod_off_u8_aes3 # [11]
|
||||
|
||||
.LBB252_dspi_dotprod_off_u8_aes3: # 0x2dc
|
||||
ee.vmulas.u8.accx.ld.ip.qup q0,a8,16,q0,q1,q2,q3 # [0*II+0] id:816
|
||||
ee.vmulas.u8.accx.ld.ip q4,a2,16,q1,q6 # [0*II+1] id:817
|
||||
ee.vmulas.u8.accx.ld.ip.qup q4,a8,16,q4,q2,q3,q0 # [0*II+3] id:818
|
||||
ee.vmulas.u8.accx.ld.ip q1,a2,16,q2,q6 # [0*II+4] id:819
|
||||
ee.vmulas.u8.accx.ld.ip.qup q1,a8,16,q1,q3,q0,q4 # [0*II+6] id:820
|
||||
ee.vmulas.u8.accx.ld.ip q5,a2,16,q3,q6 # [0*II+7] id:821
|
||||
ee.vmulas.u8.accx.ld.ip.qup q5,a8,16,q5,q0,q4,q1 # [0*II+9] id:822
|
||||
ee.vmulas.u8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+10] id:823
|
||||
ee.vmulas.u8.accx.ld.ip.qup q0,a8,16,q0,q4,q1,q5 # [0*II+12] id:824
|
||||
ee.vmulas.u8.accx.ld.ip q4,a2,16,q4,q6 # [0*II+13] id:825
|
||||
ee.vmulas.u8.accx.ld.xp.qup q4,a8,a12,q4,q1,q5,q0 # [0*II+15] id:826
|
||||
ee.vmulas.u8.accx.ld.ip q1,a2,16,q1,q6 # [0*II+16] id:827
|
||||
ee.vmulas.u8.accx.ld.xp.qup q2,a8,a11,q1,q5,q0,q4 # [0*II+18] id:828
|
||||
ee.vmulas.u8.accx.ld.ip q4,a2,16,q5,q6 # [0*II+19] id:829
|
||||
ee.ld.128.usar.xp q1,a8,a10 # [0*II+20] id:830
|
||||
ee.vmulas.u8.accx.ld.ip.qup q3,a8,16,q4,q0,q1,q2 # [0*II+22] id:831
|
||||
ee.vmulas.u8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+23] id:832
|
||||
|
||||
.LBB254_dspi_dotprod_off_u8_aes3: # 0x31f
|
||||
movi.n a2,0 # [0]
|
||||
movi.n a11,1 # [1]
|
||||
addi.n a12,a7,-1 # [2]
|
||||
rur.accx_0 a10 # [3]
|
||||
ssl a12 # [4]
|
||||
sll a11,a11 # [5]
|
||||
ssr a7 # [6]
|
||||
add.n a10,a10,a11 # [7]
|
||||
sra a10,a10 # [8]
|
||||
s8i a10,a4,0 # [9] id:854
|
||||
retw.n # [10]
|
||||
|
||||
#endif // dspi_dotprod_aes3_enabled
|
||||
@@ -0,0 +1,49 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
|
||||
/**
 * @brief Reference (plain C) 2-D dot product of two unsigned 8-bit images,
 *        with a constant offset added to every filter sample.
 *
 * Accumulates in_image[y][x] * (filter[y][x] + offset) over count_y rows and
 * count_x columns, rounds to nearest, scales the sum down by 2^shift and
 * stores the low 8 bits of the result.
 *
 * @param in_image   image descriptor; data is read as uint8_t with a column
 *                   step of step_x and a row step of stride_x * step_y
 * @param filter     filter descriptor, traversed the same way as in_image
 * @param out_value  receives the scaled sum, truncated to 8 bits
 * @param count_x    number of samples taken from each row
 * @param count_y    number of rows
 * @param shift      right shift applied to the sum; when shift <= 0 the sum is
 *                   stored without rounding or scaling
 * @param offset     value added to every filter sample before multiplication
 *
 * @return ESP_OK on success, ESP_ERR_DSP_PARAM_OUTOFRANGE when the requested
 *         window does not fit into the stride of either descriptor
 */
esp_err_t dspi_dotprod_off_u8_ansi(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift, uint8_t offset)
{
    // The requested window must fit inside one stride of each descriptor.
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    const uint8_t *i_data = (const uint8_t *)in_image->data;
    const uint8_t *f_data = (const uint8_t *)filter->data;
    // Distance, in samples, between the first samples of two consecutive rows.
    const int i_step = in_image->stride_x * in_image->step_y;
    const int f_step = filter->stride_x * filter->step_y;

    int32_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            acc += (int16_t)i_data[in_image->step_x * x] * ((int16_t)f_data[filter->step_x * x] + (int16_t)offset);
        }
        i_data += i_step;
        f_data += f_step;
    }

    // Round to nearest and scale. With shift <= 0 the expression
    // 1 << (shift - 1) would shift by a negative count, which is undefined
    // behaviour in C, so the unscaled sum is stored in that case.
    if (shift > 0) {
        acc += (int32_t)1 << (shift - 1);
        acc >>= shift;
    }
    *out_value = (uint8_t)acc;
    return ESP_OK;
}
|
||||
@@ -0,0 +1,102 @@
|
||||
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_arp4_enabled == 1)
|
||||
#include "dsp_err_codes.h"
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.global dspi_dotprod_off_u8_arp4
|
||||
.global dspi_dotprod_off_u8_ansi
|
||||
.type dspi_dotprod_off_u8_arp4,@function
|
||||
|
||||
// esp_err_t dspi_dotprod_off_u8_arp4(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift, uint8_t offset);
//
// Vector implementation of the unsigned 8-bit 2-D dot product with offset.
// Each row is processed 16 samples at a time; the products of the image
// samples with (filter sample + offset) are accumulated in XACC, which is
// preloaded with the rounding constant 1 << (shift - 1).
//
// The vector path is used only when (in_image->step_x | filter->step_x) == 1
// and count_x is a multiple of 16. Every other case is handed to
// dspi_dotprod_off_u8_ansi with the arguments left untouched in a0..a6.
//
// Returns 0 in a0 on the vector path (the value the C reference returns as
// ESP_OK). The stride range checks of the C reference are not repeated here.
// NOTE(review): the filter + offset sum is formed by esp.vadd.u8 in 8-bit
// lanes while the C reference adds in 16 bits - confirm the behaviour when
// filter + offset exceeds 255.
dspi_dotprod_off_u8_arp4:
// in_image  - a0
// filter    - a1
// out_value - a2
// count_x   - a3
// count_y   - a4
// shift     - a5
// offset    - a6

// i_data - t0   (start of the current image row)
// f_data - t1   (start of the current filter row)
// i_step - t2   (in_image->stride_x * in_image->step_y)
// f_step - t3   (filter->stride_x * filter->step_y)
// t4     - current i_data
// t5     - current f_data
// t6     - count_x / 16 (inner loop count)

        lw      t1, 4(a0)               // in_image->step_x
        lw      t2, 4(a1)               // filter->step_x
        or      t1, t1, t2
        addi    t1, t1, -1              // zero only when (in_image->step_x | filter->step_x) == 1
        andi    t2, a3, 15              // count_x % 16
        or      t1, t1, t2              // zero only when both conditions hold

        beqz    t1, .dspi_dotprod_off_u8_arp4_body
        j       dspi_dotprod_off_u8_ansi        // tail call to the C reference

.dspi_dotprod_off_u8_arp4_body:
        addi    sp, sp, -16             // scratch slot used to broadcast the offset

        sw      a6, 0(sp)               // offset is the lowest byte of the stored word
        mv      t6, sp
        esp.vldbc.8.ip  q2, t6, 0       // q2 = offset byte broadcast to every lane

        lw      t0, 0(a0)               // i_data
        lw      t1, 0(a1)               // f_data

        lw      t2, 8(a0)               // in_image->step_y
        lw      t4, 12(a0)              // in_image->stride_x
        mul     t2, t4, t2              // i_step (bytes per row step, 8-bit samples)

        lw      t3, 8(a1)               // filter->step_y
        lw      t5, 12(a1)              // filter->stride_x
        mul     t3, t5, t3              // f_step

        srli    t6, a3, 4               // t6 = count_x / 16

        addi    a7, a5, -1
        li      t4, 1
        sll     t4, t4, a7              // t4 = 1 << (shift - 1), rounding constant
        esp.zero.xacc
        esp.movx.w.xacc.l t4            // preload the accumulator with the rounding constant

.loop_count_y:
        mv      t4, t0                  // t4 = current i_data
        mv      t5, t1                  // t5 = current f_data
        esp.vld.128.ip  q1, t5, 16      // q1 = first 16 filter samples of the row

        // Hardware loop 0: t6 passes over the instructions up to and
        // including the one labelled .loop_count_x.
        esp.lp.setup    0, t6, .loop_count_x
        esp.vld.128.ip  q0, t4, 16      // q0 = next 16 image samples
        esp.vadd.u8     q3, q2, q1      // q3 = filter samples + offset
.loop_count_x:
        esp.vmulas.u8.xacc.ld.ip q1, t5, 16, q0, q3    // xacc += q0 * q3; q1 = next 16 filter samples

        add     t0, t0, t2              // advance to the next image row
        add     t1, t1, t3              // advance to the next filter row
        addi    a4, a4, -1
        bgtz    a4, .loop_count_y

        esp.srs.u.xacc  t5, a5          // shift the accumulator right by shift (a5), low 32 bits to t5
        // out_value points to a single uint8_t: store exactly one byte.
        // A halfword store here would also overwrite the byte following
        // *out_value.
        sb      t5, 0(a2)

        li      a0, 0                   // status 0
        addi    sp, sp, 16              // release the scratch slot
        ret
|
||||
|
||||
#endif // dspi_dotprod_arp4_enabled
|
||||
@@ -0,0 +1,372 @@
|
||||
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_aes3_enabled == 1)
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.literal .LC0_1_53, 458755
|
||||
|
||||
# Program Unit: dspi_dotprod_s16_aes3
|
||||
.type dspi_dotprod_s16_aes3, @function
|
||||
.align 4
|
||||
.global dspi_dotprod_s16_aes3
|
||||
// ---------------------------------------------------------------------------
// dspi_dotprod_s16_aes3 - entry, parameter validation and path selection.
//
// Register use, as shown by the loads below and by the unchanged forwarding
// of a2..a7 to dspi_dotprod_s16_ansi further down:
//   a2 = in_image, a3 = filter, a4 = out_value (written with s16i at the end),
//   a5 = count_x,  a6 = count_y, a7 = shift.
// Structure offsets used here: +0 data, +4 step_x, +8 step_y, +12 stride_x,
// +16 stride_y (names follow the matching checks in the C reference).
//
// Outcome of this section:
//   - any range check fails            -> .LBB81 (returns the error literal)
//   - filter layout not suitable       -> .Lt_0_34050 (scalar routine)
//   - otherwise                        -> .Lt_0_18178 (further vector checks)
// ---------------------------------------------------------------------------
dspi_dotprod_s16_aes3: # 0x4
|
||||
.LBB1_dspi_dotprod_s16_aes3: # 0x4
|
||||
// Windowed-ABI prologue reserving a 64-byte frame.
entry a1,64 #
|
||||
// Check 1: in_image->step_x * count_x must not exceed in_image->stride_x.
l32i.n a10,a2,4 # [0] id:678
|
||||
l32i.n a11,a2,12 # [1] id:677
|
||||
mull a8,a10,a5 # [2]
|
||||
blt a11,a8,.LBB81_dspi_dotprod_s16_aes3 # [4]
|
||||
|
||||
// Check 2: in_image->step_y * count_y must not exceed in_image->stride_y.
l32i.n a12,a2,8 # [0] id:679
|
||||
l32i.n a9,a2,16 # [1] id:680
|
||||
mull a13,a12,a6 # [2]
|
||||
blt a9,a13,.LBB81_dspi_dotprod_s16_aes3 # [4]
|
||||
|
||||
// Check 3: filter->step_x * count_x must not exceed filter->stride_x.
// a13 (the product) and a14 (filter->stride_x) are reused by the bne below.
l32i.n a15,a3,4 # [0] id:682
|
||||
l32i.n a14,a3,12 # [1] id:681
|
||||
mull a13,a15,a5 # [2]
|
||||
blt a14,a13,.LBB81_dspi_dotprod_s16_aes3 # [4]
|
||||
|
||||
// Check 4: filter->step_y * count_y must not exceed filter->stride_y.
// filter->step_y is spilled to a1+24 before a9 is overwritten by the product.
l32i.n a8,a3,16 # [0] id:684
|
||||
l32i.n a9,a3,8 # [1] id:683
|
||||
s32i.n a9,a1,24 # [2] gra_spill_temp_2
|
||||
mull a9,a9,a6 # [3]
|
||||
blt a8,a9,.LBB81_dspi_dotprod_s16_aes3 # [5]
|
||||
|
||||
// filter->data is kept at a1+20; a data address with bit 0 set goes scalar.
l32i.n a8,a3,0 # [0] id:685
|
||||
s32i.n a8,a1,20 # [1] gra_spill_temp_1
|
||||
bbsi a8,0,.Lt_0_34050 # [2]
|
||||
|
||||
// Vector path needs filter->stride_x == filter->step_x * count_x ...
bne a14,a13,.Lt_0_34050 # [0]
|
||||
|
||||
// ... and filter->step_x == 1 ...
bnei a15,1,.Lt_0_34050 # [0]
|
||||
|
||||
// ... and filter->step_y == 1; otherwise fall through into the scalar call.
l32i.n a9,a1,24 # [0] gra_spill_temp_2
|
||||
beqi a9,1,.Lt_0_18178 # [2]
|
||||
|
||||
.Lt_0_34050: # 0x43
|
||||
.Lt_0_18434: # 0x43
|
||||
mov.n a10,a2 # [0]
|
||||
mov.n a11,a3 # [1]
|
||||
mov.n a12,a4 # [2]
|
||||
mov.n a13,a5 # [3]
|
||||
mov.n a14,a6 # [4]
|
||||
mov.n a15,a7 # [5]
|
||||
.type dspi_dotprod_s16_ansi, @function
|
||||
call8 dspi_dotprod_s16_ansi # [6] dspi_dotprod_s16_ansi
|
||||
|
||||
mov.n a2,a10 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
.LBB81_dspi_dotprod_s16_aes3: # 0x56
|
||||
l32r a2,.LC0_1_53 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
// Second stage of the vector-path eligibility test. a10 and a12 still hold
// in_image->step_x and in_image->step_y loaded at function entry.
// Any failing test branches to the scalar call at .Lt_0_34818.
.Lt_0_18178: # 0x5b
|
||||
// in_image->step_x must be 1.
addi.n a13,a10,-1 # [0]
|
||||
bnez a13,.Lt_0_34818 # [1]
|
||||
|
||||
// in_image->step_y must be 1.
addi.n a14,a12,-1 # [0]
|
||||
bnez a14,.Lt_0_34818 # [1]
|
||||
|
||||
// count_x must be a multiple of 8 (low three bits clear).
extui a15,a5,0,3 # [0]
|
||||
bnez.n a15,.Lt_0_34818 # [1]
|
||||
|
||||
// count_y must be at least 4.
blti a6,4,.Lt_0_34818 # [0]
|
||||
|
||||
// count_x <= 32 goes straight to the vector setup at .Lt_0_35330.
movi.n a8,32 # [0]
|
||||
bge a8,a5,.Lt_0_35330 # [1]
|
||||
|
||||
// count_x > 32: an odd count_x uses the scalar routine at .LBB28.
// NOTE(review): count_x is already a multiple of 8 here, so bit 0 is always
// clear and this branch looks unreachable - confirm the intended width test.
extui a9,a5,0,1 # [0]
|
||||
bnez a9,.LBB28_dspi_dotprod_s16_aes3 # [1]
|
||||
|
||||
.Lt_0_35330: # 0x78
|
||||
.Lt_0_20226: # 0x78
|
||||
mov.n a3,a6 # [0]
|
||||
addi a10,a5,-24 # [1]
|
||||
mull a13,a11,a12 # [2]
|
||||
l32i.n a15,a1,20 # [3] gra_spill_temp_1
|
||||
l32i.n a2,a2,0 # [4] id:686
|
||||
movi.n a14,0 # [5]
|
||||
wur.sar_byte a14 # [6]
|
||||
wur.accx_0 a14 # [8]
|
||||
wur.accx_1 a14 # [9]
|
||||
ee.vld.128.ip q0,a15,16 # [10] id:690
|
||||
slli a13,a13,1 # [11]
|
||||
s32i.n a13,a1,16 # [12] gra_spill_temp_0
|
||||
beqz a10,.LBB32_dspi_dotprod_s16_aes3 # [13]
|
||||
|
||||
.Lt_0_23298: # 0x99
|
||||
.Lt_0_22786: # 0x99
|
||||
addi a8,a5,-16 # [0]
|
||||
beqz a8,.LBB38_dspi_dotprod_s16_aes3 # [1]
|
||||
|
||||
.Lt_0_24834: # 0x9f
|
||||
.Lt_0_24322: # 0x9f
|
||||
addi a9,a5,-8 # [0]
|
||||
beqz a9,.LBB44_dspi_dotprod_s16_aes3 # [1]
|
||||
|
||||
.Lt_0_26370: # 0xa5
|
||||
.Lt_0_25858: # 0xa5
|
||||
addi a10,a5,-32 # [0]
|
||||
beqz a10,.LBB50_dspi_dotprod_s16_aes3 # [1]
|
||||
|
||||
.Lt_0_27906: # 0xab
|
||||
.Lt_0_27394: # 0xab
|
||||
addi a11,a5,-64 # [0]
|
||||
beqz a11,.LBB56_dspi_dotprod_s16_aes3 # [1]
|
||||
|
||||
movi.n a12,64 # [0]
|
||||
bge a12,a5,.Lt_0_30722 # [1]
|
||||
|
||||
movi.n a12,0 # [0]
|
||||
ee.ld.128.usar.ip q1,a2,16 # [1] id:762
|
||||
ee.ld.128.usar.ip q2,a2,16 # [2] id:763
|
||||
ee.src.q.ld.ip q3,a2,16,q1,q2 # [4] id:764
|
||||
beqz.n a3,.Lt_0_30722 # [5]
|
||||
|
||||
slli a8,a5,1 # [0]
|
||||
l32i.n a14,a1,16 # [1] gra_spill_temp_0
|
||||
addi a13,a5,31 # [2]
|
||||
movgez a13,a5,a5 # [3]
|
||||
srai a13,a13,5 # [4]
|
||||
sub a14,a14,a8 # [5]
|
||||
addi a14,a14,16 # [6]
|
||||
addi.n a13,a13,-1 # [7]
|
||||
|
||||
.Lt_0_31490: # 0xd9
|
||||
addi.n a12,a12,1 # [0]
|
||||
movi.n a9,32 # [1]
|
||||
beqz.n a13,.Lt_0_31746 # [2]
|
||||
|
||||
loopnez a13,.LBB221_dspi_dotprod_s16_aes3 # [0]
|
||||
|
||||
.LBB219_dspi_dotprod_s16_aes3: # 0xe2
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+0] id:766
|
||||
ee.vmulas.s16.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:765
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+2] id:768
|
||||
ee.vmulas.s16.accx.ld.ip.qup q1,a2,16,q5,q2,q3,q4 # [0*II+3] id:767
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+4] id:770
|
||||
ee.vmulas.s16.accx.ld.ip.qup q2,a2,16,q0,q3,q4,q1 # [0*II+5] id:769
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+6] id:772
|
||||
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+7] id:771
|
||||
|
||||
.LBB221_dspi_dotprod_s16_aes3: # 0xfe
|
||||
|
||||
.Lt_0_31746: # 0xfe
|
||||
ee.vmulas.s16.accx.ld.ip.qup q5,a2,16,q0,q1,q2,q3 # [0] id:773
|
||||
movi.n a10,-16 # [1]
|
||||
ee.vld.128.ip q0,a15,16 # [2] id:774
|
||||
ee.vld.128.ip q6,a15,16 # [3] id:776
|
||||
ee.vmulas.s16.accx.ld.xp.qup q7,a2,a14,q0,q2,q3,q5 # [4] id:775
|
||||
ee.vld.128.ip q4,a15,16 # [5] id:779
|
||||
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a10,q6,q3,q5,q7 # [6] id:777
|
||||
ee.ld.128.usar.xp q1,a2,a9 # [7] id:778
|
||||
ee.vld.128.ip q0,a15,16 # [8] id:781
|
||||
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q4,q5,q1,q2 # [9] id:780
|
||||
bne a12,a3,.Lt_0_31490 # [10]
|
||||
|
||||
// Result store. a9 = ACCX low 32 bits, a10 = ACCX upper part.
// For shift >= 1 the stored value is ((acc >> (shift - 1)) + 1) >> 1, i.e. a
// rounded right shift; for shift < 1 the branch to .Lt_0_33282 is taken.
.Lt_0_30722: # 0x122
|
||||
.Lt_0_30466: # 0x122
|
||||
rur.accx_0 a9 # [0]
|
||||
rur.accx_1 a10 # [1]
|
||||
blti a7,1,.Lt_0_33282 # [2]
|
||||
|
||||
// Return value 0 (the value the C reference returns as ESP_OK).
movi.n a2,0 # [0]
|
||||
// a13 = shift - 33 selects between the two shifted candidates below.
addi a13,a7,-33 # [1]
|
||||
// Shift amount register = shift - 1.
addi.n a14,a7,-1 # [2]
|
||||
ssr a14 # [3]
|
||||
// a12 = upper part shifted arithmetically (candidate when shift >= 33).
sra a12,a10 # [4]
|
||||
// a11 = funnel shift of the a10:a9 pair (candidate when shift < 33).
src a11,a10,a9 # [5]
|
||||
// Take a12 when shift - 33 >= 0.
movgez a11,a12,a13 # [6]
|
||||
// Final rounding step: add 1, then shift right by one more bit.
addi.n a11,a11,1 # [7]
|
||||
srai a11,a11,1 # [8]
|
||||
// Store the low 16 bits through out_value (a4).
s16i a11,a4,0 # [9] id:787
|
||||
retw.n # [10]
|
||||
|
||||
.Lt_0_34818: # 0x148
|
||||
.Lt_0_19458: # 0x148
|
||||
mov.n a10,a2 # [0]
|
||||
mov.n a11,a3 # [1]
|
||||
mov.n a12,a4 # [2]
|
||||
mov.n a13,a5 # [3]
|
||||
mov.n a14,a6 # [4]
|
||||
mov.n a15,a7 # [5]
|
||||
call8 dspi_dotprod_s16_ansi # [6] dspi_dotprod_s16_ansi
|
||||
|
||||
mov.n a2,a10 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
.LBB32_dspi_dotprod_s16_aes3: # 0x15b
|
||||
ee.ld.128.usar.ip q1,a2,16 # [0] id:691
|
||||
ee.ld.128.usar.ip q2,a2,16 # [1] id:692
|
||||
ee.src.q.ld.ip q3,a2,16,q1,q2 # [3] id:693
|
||||
beqz.n a6,.Lt_0_23298 # [4]
|
||||
|
||||
addi a12,a13,-32 # [0]
|
||||
movi.n a10,32 # [1]
|
||||
movi.n a11,-16 # [2]
|
||||
loopgtz a6,.LBB107_dspi_dotprod_s16_aes3 # [3]
|
||||
|
||||
.LBB105_dspi_dotprod_s16_aes3: # 0x170
|
||||
ee.vld.128.ip q4,a15,16 # [0*II+0] id:695
|
||||
ee.vmulas.s16.accx.ld.xp.qup q1,a2,a12,q0,q1,q2,q3 # [0*II+1] id:694
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+2] id:697
|
||||
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q4,q2,q3,q1 # [0*II+3] id:696
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+4] id:698
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+5] id:700
|
||||
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q5,q3,q1,q2 # [0*II+6] id:699
|
||||
|
||||
.LBB107_dspi_dotprod_s16_aes3: # 0x188
|
||||
j .Lt_0_23298 # [0]
|
||||
|
||||
.LBB38_dspi_dotprod_s16_aes3: # 0x18b
|
||||
movi.n a10,32 # [0]
|
||||
movi.n a11,-16 # [1]
|
||||
srli a3,a6,1 # [2]
|
||||
l32i.n a12,a1,16 # [3] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a2,16 # [4] id:701
|
||||
ee.ld.128.usar.ip q2,a2,16 # [5] id:702
|
||||
addi a12,a12,-16 # [7]
|
||||
ee.src.q.ld.xp q3,a2,a12,q1,q2 # [8] id:703
|
||||
loopnez a3,.LBB130_dspi_dotprod_s16_aes3 # [9]
|
||||
|
||||
.LBB128_dspi_dotprod_s16_aes3: # 0x1a3
|
||||
ee.vld.128.ip q4,a15,16 # [0*II+0] id:705
|
||||
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a11,q0,q1,q2,q3 # [0*II+1] id:704
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+2] id:706
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+3] id:708
|
||||
ee.vmulas.s16.accx.ld.xp.qup q4,a2,a12,q4,q2,q1,q3 # [0*II+4] id:707
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+5] id:710
|
||||
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q0,q1,q3,q4 # [0*II+6] id:709
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+7] id:711
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+8] id:713
|
||||
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a12,q5,q3,q1,q2 # [0*II+9] id:712
|
||||
|
||||
.LBB130_dspi_dotprod_s16_aes3: # 0x1c5
|
||||
j .Lt_0_24834 # [0]
|
||||
|
||||
.LBB44_dspi_dotprod_s16_aes3: # 0x1c8
|
||||
srli a3,a3,2 # [0]
|
||||
movi.n a10,-16 # [1]
|
||||
l32i.n a11,a1,16 # [2] gra_spill_temp_0
|
||||
addi a8,a2,16 # [3]
|
||||
addi a11,a11,16 # [4]
|
||||
ee.ld.128.usar.xp q2,a8,a10 # [5] id:714
|
||||
ee.ld.128.usar.xp q1,a8,a11 # [6] id:715
|
||||
ee.src.q.ld.xp q3,a8,a10,q1,q2 # [8] id:716
|
||||
ee.ld.128.usar.xp q2,a8,a11 # [9] id:717
|
||||
loopnez a3,.LBB153_dspi_dotprod_s16_aes3 # [10]
|
||||
|
||||
.LBB151_dspi_dotprod_s16_aes3: # 0x1e4
|
||||
ee.vld.128.ip q4,a15,16 # [0*II+0] id:719
|
||||
ee.vmulas.s16.accx.ld.xp.qup q3,a8,a10,q0,q1,q2,q3 # [0*II+1] id:718
|
||||
ee.ld.128.usar.xp q1,a8,a11 # [0*II+2] id:720
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+3] id:722
|
||||
ee.vmulas.s16.accx.ld.xp.qup q4,a8,a10,q4,q2,q1,q3 # [0*II+4] id:721
|
||||
ee.ld.128.usar.xp q3,a8,a11 # [0*II+5] id:723
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+6] id:725
|
||||
ee.vmulas.s16.accx.ld.xp.qup q4,a8,a10,q0,q1,q3,q4 # [0*II+7] id:724
|
||||
ee.ld.128.usar.xp q1,a8,a11 # [0*II+8] id:726
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+9] id:728
|
||||
ee.vmulas.s16.accx.ld.xp.qup q3,a8,a10,q5,q3,q1,q4 # [0*II+10] id:727
|
||||
ee.ld.128.usar.xp q2,a8,a11 # [0*II+11] id:729
|
||||
|
||||
.LBB153_dspi_dotprod_s16_aes3: # 0x20c
|
||||
mov.n a2,a8 # [0]
|
||||
j .Lt_0_26370 # [1]
|
||||
|
||||
.LBB50_dspi_dotprod_s16_aes3: # 0x211
|
||||
movi.n a10,32 # [0]
|
||||
movi.n a11,-16 # [1]
|
||||
slli a13,a5,1 # [2]
|
||||
l32i.n a12,a1,16 # [3] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a2,16 # [4] id:730
|
||||
ee.ld.128.usar.ip q2,a2,16 # [5] id:731
|
||||
sub a12,a12,a13 # [6]
|
||||
ee.src.q.ld.ip q3,a2,16,q1,q2 # [8] id:732
|
||||
addi a12,a12,16 # [9]
|
||||
loopnez a3,.LBB176_dspi_dotprod_s16_aes3 # [10]
|
||||
|
||||
.LBB174_dspi_dotprod_s16_aes3: # 0x22c
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+0] id:734
|
||||
ee.vmulas.s16.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:733
|
||||
ee.vld.128.ip q1,a15,16 # [0*II+2] id:736
|
||||
ee.vmulas.s16.accx.ld.xp.qup q0,a2,a12,q5,q2,q3,q4 # [0*II+3] id:735
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+4] id:739
|
||||
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q1,q3,q4,q0 # [0*II+5] id:737
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+6] id:738
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+7] id:741
|
||||
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+8] id:740
|
||||
|
||||
.LBB176_dspi_dotprod_s16_aes3: # 0x24b
|
||||
j .Lt_0_27906 # [0]
|
||||
|
||||
.LBB56_dspi_dotprod_s16_aes3: # 0x24e
|
||||
movi.n a10,32 # [0]
|
||||
movi.n a11,-16 # [1]
|
||||
slli a13,a5,1 # [2]
|
||||
l32i.n a12,a1,16 # [3] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a2,16 # [4] id:742
|
||||
ee.ld.128.usar.ip q2,a2,16 # [5] id:743
|
||||
sub a12,a12,a13 # [7]
|
||||
addi a12,a12,16 # [8]
|
||||
ee.src.q.ld.ip q3,a2,16,q1,q2 # [9] id:744
|
||||
loopnez a3,.LBB198_dspi_dotprod_s16_aes3 # [10]
|
||||
|
||||
.LBB196_dspi_dotprod_s16_aes3: # 0x269
|
||||
ee.vld.128.ip q4,a15,16 # [0*II+0] id:746
|
||||
ee.vmulas.s16.accx.ld.ip.qup q1,a2,16,q0,q1,q2,q3 # [0*II+1] id:745
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+2] id:748
|
||||
ee.vmulas.s16.accx.ld.ip.qup q4,a2,16,q4,q2,q3,q1 # [0*II+3] id:747
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+4] id:750
|
||||
ee.vmulas.s16.accx.ld.ip.qup q0,a2,16,q0,q3,q1,q4 # [0*II+5] id:749
|
||||
ee.vld.128.ip q6,a15,16 # [0*II+6] id:752
|
||||
ee.vmulas.s16.accx.ld.ip.qup q1,a2,16,q5,q1,q4,q0 # [0*II+7] id:751
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+8] id:754
|
||||
ee.vmulas.s16.accx.ld.ip.qup q4,a2,16,q6,q4,q0,q1 # [0*II+9] id:753
|
||||
ee.vld.128.ip q6,a15,16 # [0*II+10] id:756
|
||||
ee.vmulas.s16.accx.ld.xp.qup q0,a2,a12,q5,q0,q1,q4 # [0*II+11] id:755
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+12] id:759
|
||||
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q6,q1,q4,q0 # [0*II+13] id:757
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+14] id:758
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+15] id:761
|
||||
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+16] id:760
|
||||
|
||||
.LBB198_dspi_dotprod_s16_aes3: # 0x2a4
|
||||
j .Lt_0_30722 # [0]
|
||||
|
||||
.Lt_0_33282: # 0x2a7
|
||||
movi.n a2,0 # [0]
|
||||
sext a14,a9,15 # [1]
|
||||
s16i a14,a4,0 # [2] id:788
|
||||
retw.n # [3]
|
||||
|
||||
.LBB28_dspi_dotprod_s16_aes3: # 0x2b1
|
||||
mov.n a15,a7 # [0]
|
||||
mov.n a14,a6 # [1]
|
||||
mov.n a13,a5 # [2]
|
||||
mov.n a12,a4 # [3]
|
||||
mov.n a11,a3 # [4]
|
||||
mov.n a10,a2 # [5]
|
||||
call8 dspi_dotprod_s16_ansi # [6] dspi_dotprod_s16_ansi
|
||||
|
||||
mov.n a2,a10 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
|
||||
#endif // dspi_dotprod_aes3_enabled
|
||||
@@ -0,0 +1,49 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
|
||||
// Reference (portable C) 2-D dot product of two int16 images.
//
// Multiplies count_x * count_y samples of in_image with the matching samples
// of filter, accumulates the products in 64 bits, rounds, shifts the sum right
// by `shift` bits and stores the low 16 bits of the result in *out_value.
//
// in_image / filter - image descriptors; data, step_x, step_y, stride_x and
//                     stride_y are read
// out_value         - receives the rounded, shifted sum truncated to int16_t
// count_x, count_y  - number of samples per row / number of rows to process
// shift             - right shift applied to the sum; values <= 0 leave the
//                     sum unshifted and unrounded
//
// Returns ESP_ERR_DSP_PARAM_OUTOFRANGE when step * count exceeds the stride of
// either image in either direction, ESP_OK otherwise.
esp_err_t dspi_dotprod_s16_ansi(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift)
{
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    int16_t *i_data = (int16_t *)in_image->data;
    int16_t *f_data = (int16_t *)filter->data;
    // Distance between two processed rows, in samples.
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;

    int64_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            acc += (int32_t)i_data[in_image->step_x * x] * (int32_t)f_data[filter->step_x * x];
        }
        i_data += i_step;
        f_data += f_step;
    }

    // Shifting by a negative amount is undefined, so the rounding term
    // 1 << (shift - 1) only exists for shift >= 1. It is built in 64 bits so
    // that it matches the accumulator width for shift values above 31.
    if (shift > 0) {
        acc += (int64_t)1 << (shift - 1); // round to nearest
        acc >>= shift;
    }
    *out_value = (int16_t)acc;
    return ESP_OK;
}
|
||||
@@ -0,0 +1,95 @@
|
||||
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_arp4_enabled == 1)
|
||||
#include "dsp_err_codes.h"
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.global dspi_dotprod_s16_arp4
|
||||
.global dspi_dotprod_s16_ansi
|
||||
.type dspi_dotprod_s16_arp4,@function
|
||||
|
||||
// esp_err_t dspi_dotprod_s16_arp4(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift);
|
||||
dspi_dotprod_s16_arp4:
// 2-D dot product of two int16 images using the vector extension.
// The sum of products is rounded, shifted right by `shift` and its low
// 16 bits are stored at out_value. The vector path returns 0 in a0.
// Inputs the vector path does not handle are forwarded, with the argument
// registers untouched, to dspi_dotprod_s16_ansi.
//
// Arguments:
//   in_image  - a0  (word offsets used: 0 data, 4 step_x, 8 step_y, 12 stride_x)
//   filter    - a1  (same layout)
//   out_value - a2
//   count_x   - a3
//   count_y   - a4
//   shift     - a5
//
// Register usage on the vector path:
//   t0 - i_data, start of the current input row
//   t1 - f_data, start of the current filter row
//   t2 - i_step, input row advance in bytes
//   t3 - f_step, filter row advance in bytes
//   t4 - current i_data (also scratch for the rounding constant)
//   t5 - current f_data (also receives the final result)
//   t6 - count_x / 8, number of 8-sample vectors per row
//   a6 - scratch (shift - 1)
//
// NOTE(review): no alignment check is made on the data pointers or on the row
// advance before the 128-bit loads - confirm callers guarantee what
// esp.vld.128.ip requires.

    // Non-positive counts mean "no samples". The vector loops below would
    // still execute, so let the reference implementation handle them.
    blez    a3, .dspi_dotprod_s16_arp4_ansi
    blez    a4, .dspi_dotprod_s16_arp4_ansi

    lw      t1, 4(a0)               // in_image->step_x
    lw      t2, 4(a1)               // filter->step_x
    or      t1, t1, t2
    addi    t1, t1, -1              // 0 when (step_x | step_x) == 1
    andi    t2, a3, 7               // count_x % 8
    or      t1, t1, t2              // 0 only if unit step and count_x % 8 == 0

    beqz    t1, .dspi_dotprod_s16_arp4_body

.dspi_dotprod_s16_arp4_ansi:
    j       dspi_dotprod_s16_ansi   // tail call, a0-a5 still hold the arguments

.dspi_dotprod_s16_arp4_body:
    lw      t0, 0(a0)               // i_data
    lw      t1, 0(a1)               // f_data

    lw      t2, 8(a0)               // in_image->step_y
    lw      t4, 12(a0)              // in_image->stride_x
    mul     t2, t4, t2
    slli    t2, t2, 1               // i_step in bytes (2 bytes per sample)

    lw      t3, 8(a1)               // filter->step_y
    lw      t5, 12(a1)              // filter->stride_x
    mul     t3, t5, t3
    slli    t3, t3, 1               // f_step in bytes (2 bytes per sample)

    srli    t6, a3, 3               // t6 = count_x / 8

    esp.zero.xacc
    // The rounding term 1 << (shift - 1) exists only for shift >= 1.
    // sll uses the low 5 bits of the count, so shift == 0 would otherwise
    // preload 1 << 31 into the accumulator.
    blez    a5, .loop_count_y
    addi    a6, a5, -1
    li      t4, 1
    sll     t4, t4, a6
    esp.movx.w.xacc.l t4            // accumulator starts at the rounding term

.loop_count_y:
    mv      t4, t0
    mv      t5, t1
    esp.vld.128.ip q0, t4, 16       // q0 - i_data

    // Hardware loop 0: presumably repeats the instructions up to and
    // including the one at .loop_count_x t6 times - confirm against the
    // esp.lp.setup documentation.
    esp.lp.setup 0, t6, .loop_count_x
        esp.vld.128.ip q1, t5, 16   // q1 - f_data
.loop_count_x: esp.vmulas.s16.xacc.ld.ip q0, t4, 16, q0, q1 // q0 - i_data

    add     t0, t0, t2              // next input row
    add     t1, t1, t3              // next filter row
    addi    a4, a4, -1
    bgtz    a4, .loop_count_y

    esp.srs.s.xacc t5, a5           // shift accx register by final_shift amount (a5), save the lower 32bits to t5
    sh      t5, 0(a2)               // store result to output buffer

    li      a0, 0
    ret
|
||||
|
||||
#endif // dspi_dotprod_arp4_enabled
|
||||
@@ -0,0 +1,370 @@
|
||||
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_aes3_enabled == 1)
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.literal .LC0_1_52, 458755
|
||||
|
||||
# Program Unit: dspi_dotprod_s8_aes3
|
||||
.type dspi_dotprod_s8_aes3, @function
|
||||
.align 4
|
||||
.global dspi_dotprod_s8_aes3
|
||||
// dspi_dotprod_s8_aes3: 2-D dot product of two int8 images (Xtensa windowed ABI).
// Incoming arguments, as forwarded unchanged to dspi_dotprod_s8_ansi below:
//   a2 = in_image, a3 = filter, a4 = out_value, a5 = count_x, a6 = count_y, a7 = shift
// Descriptor word offsets used: 0 data, 4 step_x, 8 step_y, 12 stride_x, 16 stride_y.
// Returns in a2: 0 on the vector path, the literal .LC0_1_52 when a range check
// fails, or whatever dspi_dotprod_s8_ansi returns on the fallback paths.
dspi_dotprod_s8_aes3: # 0x4
|
||||
.LBB1_dspi_dotprod_s8_aes3: # 0x4
|
||||
entry a1,48 #
|
||||
// Range check 1: in_image->step_x * count_x must not exceed in_image->stride_x.
l32i.n a10,a2,4 # [0] id:668
|
||||
l32i.n a11,a2,12 # [1] id:667
|
||||
mull a8,a10,a5 # [2]
|
||||
blt a11,a8,.LBB78_dspi_dotprod_s8_aes3 # [4]
|
||||
|
||||
// Range check 2: in_image->step_y * count_y must not exceed in_image->stride_y.
l32i.n a12,a2,8 # [0] id:669
|
||||
l32i.n a9,a2,16 # [1] id:670
|
||||
mull a13,a12,a6 # [2]
|
||||
blt a9,a13,.LBB78_dspi_dotprod_s8_aes3 # [4]
|
||||
|
||||
// Range check 3: filter->step_x * count_x must not exceed filter->stride_x.
l32i.n a15,a3,4 # [0] id:672
|
||||
l32i.n a14,a3,12 # [1] id:671
|
||||
mull a13,a15,a5 # [2]
|
||||
blt a14,a13,.LBB78_dspi_dotprod_s8_aes3 # [4]
|
||||
|
||||
// Range check 4: filter->step_y * count_y must not exceed filter->stride_y.
// filter->step_y is also kept in the stack slot at a1+8 for the test below.
l32i.n a8,a3,16 # [0] id:674
|
||||
l32i.n a9,a3,8 # [1] id:673
|
||||
s32i.n a9,a1,8 # [2] gra_spill_temp_2
|
||||
mull a9,a9,a6 # [3]
|
||||
blt a8,a9,.LBB78_dspi_dotprod_s8_aes3 # [5]
|
||||
|
||||
// filter->data is saved at a1+4; an odd address (bit 0 set) takes the ANSI fallback.
l32i.n a8,a3,0 # [0] id:675
|
||||
s32i.n a8,a1,4 # [1] gra_spill_temp_1
|
||||
bbsi a8,0,.Lt_0_33026 # [2]
|
||||
|
||||
// Fallback unless filter->stride_x == filter->step_x * count_x.
bne a14,a13,.Lt_0_33026 # [0]
|
||||
|
||||
// Fallback unless filter->step_x == 1.
bnei a15,1,.Lt_0_33026 # [0]
|
||||
|
||||
// Continue with the input-image checks only when filter->step_y == 1.
l32i.n a13,a1,8 # [0] gra_spill_temp_2
|
||||
beqi a13,1,.Lt_0_17666 # [2]
|
||||
|
||||
// Fallback: call dspi_dotprod_s8_ansi with the original six arguments
// (call8 maps a10..a15 to the callee's a2..a7) and return its result.
.Lt_0_33026: # 0x43
|
||||
.Lt_0_17922: # 0x43
|
||||
mov.n a10,a2 # [0]
|
||||
mov.n a11,a3 # [1]
|
||||
mov.n a12,a4 # [2]
|
||||
mov.n a13,a5 # [3]
|
||||
mov.n a14,a6 # [4]
|
||||
mov.n a15,a7 # [5]
|
||||
.type dspi_dotprod_s8_ansi, @function
|
||||
call8 dspi_dotprod_s8_ansi # [6] dspi_dotprod_s8_ansi
|
||||
|
||||
mov.n a2,a10 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
// Range-check failure: return the literal .LC0_1_52
// (presumably ESP_ERR_DSP_PARAM_OUTOFRANGE - confirm against dsp_err_codes.h).
.LBB78_dspi_dotprod_s8_aes3: # 0x56
|
||||
l32r a2,.LC0_1_52 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
// Input-image checks; a10 = in_image->step_x, a12 = in_image->step_y,
// a11 = in_image->stride_x are still live from the range checks above.
.Lt_0_17666: # 0x5b
|
||||
// Fallback unless in_image->step_x == 1.
addi.n a14,a10,-1 # [0]
|
||||
bnez a14,.Lt_0_33794 # [1]
|
||||
|
||||
// Fallback unless in_image->step_y == 1.
addi.n a15,a12,-1 # [0]
|
||||
bnez a15,.Lt_0_33794 # [1]
|
||||
|
||||
// Fallback unless count_x is a multiple of 16 (low 4 bits clear).
extui a8,a5,0,4 # [0]
|
||||
bnez.n a8,.Lt_0_33794 # [1]
|
||||
|
||||
// Fallback when count_y < 4.
blti a6,4,.Lt_0_33794 # [0]
|
||||
|
||||
movi.n a9,64 # [0]
|
||||
bge a9,a5,.Lt_0_34306 # [1]
|
||||
|
||||
// count_x > 64: fallback when count_x is odd. Bit 0 was already found clear
// by the multiple-of-16 test above, so this branch cannot be taken here.
extui a10,a5,0,1 # [0]
|
||||
bnez a10,.LBB28_dspi_dotprod_s8_aes3 # [1]
|
||||
|
||||
// Vector path set-up:
//   a3  = count_y
//   a2  = in_image->data
//   a15 = filter->data (reloaded from a1+4)
//   a1+0 holds in_image->stride_x * in_image->step_y (step_y is 1 here)
//   ACCX is cleared and q0 is loaded with the first 16 filter bytes.
// Then dispatch on count_x: 48, 32, 16, 64, 128, or the general loop.
.Lt_0_34306: # 0x78
|
||||
.Lt_0_19714: # 0x78
|
||||
mov.n a3,a6 # [0]
|
||||
addi a13,a5,-48 # [1]
|
||||
movi.n a14,0 # [2]
|
||||
mull a15,a11,a12 # [3]
|
||||
l32i.n a2,a2,0 # [4] id:676
|
||||
s32i.n a15,a1,0 # [6] gra_spill_temp_0
|
||||
wur.accx_0 a14 # [7]
|
||||
l32i.n a15,a1,4 # [8] gra_spill_temp_1
|
||||
wur.accx_1 a14 # [9]
|
||||
ee.vld.128.ip q0,a15,16 # [10] id:679
|
||||
// count_x == 48
beqz a13,.LBB32_dspi_dotprod_s8_aes3 # [11]
|
||||
|
||||
.Lt_0_22786: # 0x93
|
||||
.Lt_0_22274: # 0x93
|
||||
// count_x == 32
addi a8,a5,-32 # [0]
|
||||
beqz a8,.LBB38_dspi_dotprod_s8_aes3 # [1]
|
||||
|
||||
.Lt_0_24322: # 0x99
|
||||
.Lt_0_23810: # 0x99
|
||||
// count_x == 16
addi a9,a5,-16 # [0]
|
||||
beqz a9,.LBB44_dspi_dotprod_s8_aes3 # [1]
|
||||
|
||||
.Lt_0_25858: # 0x9f
|
||||
.Lt_0_25346: # 0x9f
|
||||
// count_x == 64
addi a10,a5,-64 # [0]
|
||||
beqz a10,.LBB50_dspi_dotprod_s8_aes3 # [1]
|
||||
|
||||
.Lt_0_27394: # 0xa5
|
||||
.Lt_0_26882: # 0xa5
|
||||
// count_x == 128
addi a11,a5,-128 # [0]
|
||||
beqz a11,.LBB56_dspi_dotprod_s8_aes3 # [1]
|
||||
|
||||
// count_x <= 128 reaches the result stage directly: the matching special case
// has either already run and jumped back into this chain, or none applied.
movi a12,128 # [0]
|
||||
bge a12,a5,.Lt_0_30210 # [1]
|
||||
|
||||
// General case, count_x > 128. a12 counts processed rows; q1..q3 are primed
// with the first three 16-byte loads from the input row.
movi.n a12,0 # [0]
|
||||
ee.ld.128.usar.ip q1,a2,16 # [1] id:751
|
||||
ee.ld.128.usar.ip q2,a2,16 # [2] id:752
|
||||
ee.src.q.ld.ip q3,a2,16,q1,q2 # [4] id:753
|
||||
beqz.n a3,.Lt_0_30210 # [5]
|
||||
|
||||
// a13 = count_x / 64 - 1 (signed division rounding toward zero);
// a14 = saved row stride - count_x + 16, used as a pointer step below.
l32i.n a14,a1,0 # [0] gra_spill_temp_0
|
||||
addi a13,a5,63 # [1]
|
||||
movgez a13,a5,a5 # [2]
|
||||
srai a13,a13,6 # [3]
|
||||
sub a14,a14,a5 # [4]
|
||||
addi a14,a14,16 # [5]
|
||||
addi.n a13,a13,-1 # [6]
|
||||
|
||||
// Per-row loop of the general case.
.Lt_0_30978: # 0xd1
|
||||
addi.n a12,a12,1 # [0]
|
||||
movi.n a8,32 # [1]
|
||||
movi.n a9,-16 # [2]
|
||||
beqz.n a13,.Lt_0_31234 # [3]
|
||||
|
||||
// Zero-overhead loop, a13 iterations, four multiply-accumulate steps each.
loopnez a13,.LBB218_dspi_dotprod_s8_aes3 # [0]
|
||||
|
||||
.LBB216_dspi_dotprod_s8_aes3: # 0xdc
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+0] id:755
|
||||
ee.vmulas.s8.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:754
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+2] id:757
|
||||
ee.vmulas.s8.accx.ld.ip.qup q1,a2,16,q5,q2,q3,q4 # [0*II+3] id:756
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+4] id:759
|
||||
ee.vmulas.s8.accx.ld.ip.qup q2,a2,16,q0,q3,q4,q1 # [0*II+5] id:758
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+6] id:761
|
||||
ee.vmulas.s8.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+7] id:760
|
||||
|
||||
.LBB218_dspi_dotprod_s8_aes3: # 0xf8
|
||||
|
||||
// Row tail: the last four multiply-accumulate steps of the row. The .xp forms
// step a2 by a14, a9 (-16) and a8 (32) instead of a fixed 16.
.Lt_0_31234: # 0xf8
|
||||
ee.vmulas.s8.accx.ld.ip.qup q5,a2,16,q0,q1,q2,q3 # [0] id:762
|
||||
ee.vld.128.ip q0,a15,16 # [1] id:763
|
||||
ee.vld.128.ip q6,a15,16 # [2] id:765
|
||||
ee.vmulas.s8.accx.ld.xp.qup q7,a2,a14,q0,q2,q3,q5 # [3] id:764
|
||||
ee.vld.128.ip q4,a15,16 # [4] id:768
|
||||
ee.vmulas.s8.accx.ld.xp.qup q2,a2,a9,q6,q3,q5,q7 # [5] id:766
|
||||
ee.ld.128.usar.xp q1,a2,a8 # [6] id:767
|
||||
ee.vld.128.ip q0,a15,16 # [7] id:770
|
||||
ee.vmulas.s8.accx.ld.ip.qup q3,a2,16,q4,q5,q1,q2 # [8] id:769
|
||||
// Repeat until a12 == count_y.
bne a12,a3,.Lt_0_30978 # [9]
|
||||
|
||||
// Result stage: a10 = low 32 bits of ACCX, a11 = 1 << (shift - 1) as the
// rounding term, then an arithmetic right shift by `shift`. The low byte is
// stored at out_value and 0 is returned in a2.
.Lt_0_30210: # 0x11a
|
||||
.Lt_0_29954: # 0x11a
|
||||
movi.n a2,0 # [0]
|
||||
rur.accx_0 a10 # [1]
|
||||
addi.n a12,a7,-1 # [2]
|
||||
movi.n a11,1 # [3]
|
||||
ssl a12 # [4]
|
||||
sll a11,a11 # [5]
|
||||
ssr a7 # [6]
|
||||
add.n a10,a10,a11 # [7]
|
||||
sra a10,a10 # [8]
|
||||
s8i a10,a4,0 # [9] id:772
|
||||
retw.n # [10]
|
||||
|
||||
// Fallback taken from the input-image checks: same ANSI call as above.
.Lt_0_33794: # 0x136
|
||||
.Lt_0_18946: # 0x136
|
||||
mov.n a10,a2 # [0]
|
||||
mov.n a11,a3 # [1]
|
||||
mov.n a12,a4 # [2]
|
||||
mov.n a13,a5 # [3]
|
||||
mov.n a14,a6 # [4]
|
||||
mov.n a15,a7 # [5]
|
||||
call8 dspi_dotprod_s8_ansi # [6] dspi_dotprod_s8_ansi
|
||||
|
||||
#.LBB25_dspi_dotprod_s8_aes3: # 0x145
|
||||
mov.n a2,a10 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
// count_x == 48: one loop iteration per row (loopgtz a6, i.e. count_y times),
// three multiply-accumulate steps per iteration.
.LBB32_dspi_dotprod_s8_aes3: # 0x149
|
||||
ee.ld.128.usar.ip q1,a2,16 # [0] id:680
|
||||
ee.ld.128.usar.ip q2,a2,16 # [1] id:681
|
||||
ee.src.q.ld.ip q3,a2,16,q1,q2 # [3] id:682
|
||||
beqz.n a6,.Lt_0_22786 # [4]
|
||||
|
||||
// a12 = saved row stride - 32; a10 and a11 are the other pointer steps.
movi.n a10,32 # [0]
|
||||
l32i.n a12,a1,0 # [1] gra_spill_temp_0
|
||||
movi.n a11,-16 # [2]
|
||||
addi a12,a12,-32 # [3]
|
||||
loopgtz a6,.LBB104_dspi_dotprod_s8_aes3 # [4]
|
||||
|
||||
.LBB102_dspi_dotprod_s8_aes3: # 0x160
|
||||
ee.vld.128.ip q4,a15,16 # [0*II+0] id:684
|
||||
ee.vmulas.s8.accx.ld.xp.qup q1,a2,a12,q0,q1,q2,q3 # [0*II+1] id:683
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+2] id:686
|
||||
ee.vmulas.s8.accx.ld.xp.qup q2,a2,a11,q4,q2,q3,q1 # [0*II+3] id:685
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+4] id:687
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+5] id:689
|
||||
ee.vmulas.s8.accx.ld.ip.qup q3,a2,16,q5,q3,q1,q2 # [0*II+6] id:688
|
||||
|
||||
.LBB104_dspi_dotprod_s8_aes3: # 0x178
|
||||
// Rejoin the dispatch chain; no other count_x case matches, so control ends
// up at the result stage.
j .Lt_0_22786 # [0]
|
||||
|
||||
// count_x == 32: the loop runs count_y / 2 times (a3 = a6 >> 1) with four
// multiply-accumulate steps per iteration.
.LBB38_dspi_dotprod_s8_aes3: # 0x17b
|
||||
movi.n a10,32 # [0]
|
||||
movi.n a11,-16 # [1]
|
||||
srli a3,a6,1 # [2]
|
||||
l32i.n a12,a1,0 # [3] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a2,16 # [4] id:690
|
||||
ee.ld.128.usar.ip q2,a2,16 # [5] id:691
|
||||
// a12 = saved row stride - 16
addi a12,a12,-16 # [7]
|
||||
ee.src.q.ld.xp q3,a2,a12,q1,q2 # [8] id:692
|
||||
loopnez a3,.LBB127_dspi_dotprod_s8_aes3 # [9]
|
||||
|
||||
.LBB125_dspi_dotprod_s8_aes3: # 0x193
|
||||
ee.vld.128.ip q4,a15,16 # [0*II+0] id:694
|
||||
ee.vmulas.s8.accx.ld.xp.qup q3,a2,a11,q0,q1,q2,q3 # [0*II+1] id:693
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+2] id:695
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+3] id:697
|
||||
ee.vmulas.s8.accx.ld.xp.qup q4,a2,a12,q4,q2,q1,q3 # [0*II+4] id:696
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+5] id:699
|
||||
ee.vmulas.s8.accx.ld.xp.qup q2,a2,a11,q0,q1,q3,q4 # [0*II+6] id:698
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+7] id:700
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+8] id:702
|
||||
ee.vmulas.s8.accx.ld.xp.qup q3,a2,a12,q5,q3,q1,q2 # [0*II+9] id:701
|
||||
|
||||
.LBB127_dspi_dotprod_s8_aes3: # 0x1b5
|
||||
j .Lt_0_24322 # [0]
|
||||
|
||||
// count_x == 16: the loop runs count_y / 4 times (a3 >>= 2) with four
// multiply-accumulate steps per iteration. a8 = data + 16 is the working
// input pointer; a11 = saved row stride + 16 and a10 = -16 are its steps.
.LBB44_dspi_dotprod_s8_aes3: # 0x1b8
|
||||
srli a3,a3,2 # [0]
|
||||
movi.n a10,-16 # [1]
|
||||
l32i.n a11,a1,0 # [2] gra_spill_temp_0
|
||||
addi a8,a2,16 # [3]
|
||||
addi a11,a11,16 # [4]
|
||||
ee.ld.128.usar.xp q2,a8,a10 # [5] id:703
|
||||
ee.ld.128.usar.xp q1,a8,a11 # [6] id:704
|
||||
ee.src.q.ld.xp q3,a8,a10,q1,q2 # [8] id:705
|
||||
ee.ld.128.usar.xp q2,a8,a11 # [9] id:706
|
||||
loopnez a3,.LBB150_dspi_dotprod_s8_aes3 # [10]
|
||||
|
||||
.LBB148_dspi_dotprod_s8_aes3: # 0x1d4
|
||||
ee.vld.128.ip q4,a15,16 # [0*II+0] id:708
|
||||
ee.vmulas.s8.accx.ld.xp.qup q3,a8,a10,q0,q1,q2,q3 # [0*II+1] id:707
|
||||
ee.ld.128.usar.xp q1,a8,a11 # [0*II+2] id:709
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+3] id:711
|
||||
ee.vmulas.s8.accx.ld.xp.qup q4,a8,a10,q4,q2,q1,q3 # [0*II+4] id:710
|
||||
ee.ld.128.usar.xp q3,a8,a11 # [0*II+5] id:712
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+6] id:714
|
||||
ee.vmulas.s8.accx.ld.xp.qup q4,a8,a10,q0,q1,q3,q4 # [0*II+7] id:713
|
||||
ee.ld.128.usar.xp q1,a8,a11 # [0*II+8] id:715
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+9] id:717
|
||||
ee.vmulas.s8.accx.ld.xp.qup q3,a8,a10,q5,q3,q1,q4 # [0*II+10] id:716
|
||||
ee.ld.128.usar.xp q2,a8,a11 # [0*II+11] id:718
|
||||
|
||||
.LBB150_dspi_dotprod_s8_aes3: # 0x1fc
|
||||
// Copy the working pointer back into a2 before rejoining the dispatch chain.
mov.n a2,a8 # [0]
|
||||
j .Lt_0_25858 # [1]
|
||||
|
||||
// count_x == 64: one loop iteration per row (a3 = count_y) with four
// multiply-accumulate steps per iteration.
// a12 = saved row stride - count_x + 16.
.LBB50_dspi_dotprod_s8_aes3: # 0x201
|
||||
movi.n a10,32 # [0]
|
||||
movi.n a11,-16 # [1]
|
||||
l32i.n a12,a1,0 # [2] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a2,16 # [3] id:719
|
||||
ee.ld.128.usar.ip q2,a2,16 # [4] id:720
|
||||
sub a12,a12,a5 # [5]
|
||||
ee.src.q.ld.ip q3,a2,16,q1,q2 # [7] id:721
|
||||
addi a12,a12,16 # [8]
|
||||
loopnez a3,.LBB173_dspi_dotprod_s8_aes3 # [9]
|
||||
|
||||
.LBB171_dspi_dotprod_s8_aes3: # 0x219
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+0] id:723
|
||||
ee.vmulas.s8.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:722
|
||||
ee.vld.128.ip q1,a15,16 # [0*II+2] id:725
|
||||
ee.vmulas.s8.accx.ld.xp.qup q0,a2,a12,q5,q2,q3,q4 # [0*II+3] id:724
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+4] id:728
|
||||
ee.vmulas.s8.accx.ld.xp.qup q2,a2,a11,q1,q3,q4,q0 # [0*II+5] id:726
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+6] id:727
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+7] id:730
|
||||
ee.vmulas.s8.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+8] id:729
|
||||
|
||||
.LBB173_dspi_dotprod_s8_aes3: # 0x238
|
||||
j .Lt_0_27394 # [0]
|
||||
|
||||
// count_x == 128: one loop iteration per row (a3 = count_y) with eight
// multiply-accumulate steps per iteration.
// a12 = saved row stride - count_x + 16.
.LBB56_dspi_dotprod_s8_aes3: # 0x23b
|
||||
movi.n a10,32 # [0]
|
||||
movi.n a11,-16 # [1]
|
||||
l32i.n a12,a1,0 # [2] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a2,16 # [3] id:731
|
||||
ee.ld.128.usar.ip q2,a2,16 # [4] id:732
|
||||
sub a12,a12,a5 # [6]
|
||||
addi a12,a12,16 # [7]
|
||||
ee.src.q.ld.ip q3,a2,16,q1,q2 # [8] id:733
|
||||
loopnez a3,.LBB195_dspi_dotprod_s8_aes3 # [9]
|
||||
|
||||
.LBB193_dspi_dotprod_s8_aes3: # 0x253
|
||||
ee.vld.128.ip q4,a15,16 # [0*II+0] id:735
|
||||
ee.vmulas.s8.accx.ld.ip.qup q1,a2,16,q0,q1,q2,q3 # [0*II+1] id:734
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+2] id:737
|
||||
ee.vmulas.s8.accx.ld.ip.qup q4,a2,16,q4,q2,q3,q1 # [0*II+3] id:736
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+4] id:739
|
||||
ee.vmulas.s8.accx.ld.ip.qup q0,a2,16,q0,q3,q1,q4 # [0*II+5] id:738
|
||||
ee.vld.128.ip q6,a15,16 # [0*II+6] id:741
|
||||
ee.vmulas.s8.accx.ld.ip.qup q1,a2,16,q5,q1,q4,q0 # [0*II+7] id:740
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+8] id:743
|
||||
ee.vmulas.s8.accx.ld.ip.qup q4,a2,16,q6,q4,q0,q1 # [0*II+9] id:742
|
||||
ee.vld.128.ip q6,a15,16 # [0*II+10] id:745
|
||||
ee.vmulas.s8.accx.ld.xp.qup q0,a2,a12,q5,q0,q1,q4 # [0*II+11] id:744
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+12] id:748
|
||||
ee.vmulas.s8.accx.ld.xp.qup q2,a2,a11,q6,q1,q4,q0 # [0*II+13] id:746
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+14] id:747
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+15] id:750
|
||||
ee.vmulas.s8.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+16] id:749
|
||||
|
||||
// Result stage for the count_x == 128 case; same computation as at
// .Lt_0_30210: round with 1 << (shift - 1), shift right by `shift`,
// store the low byte at out_value, return 0.
.LBB195_dspi_dotprod_s8_aes3: # 0x28e
|
||||
movi.n a2,0 # [0]
|
||||
movi.n a11,1 # [1]
|
||||
addi.n a12,a7,-1 # [2]
|
||||
rur.accx_0 a10 # [3]
|
||||
ssl a12 # [4]
|
||||
sll a11,a11 # [5]
|
||||
ssr a7 # [6]
|
||||
add.n a10,a10,a11 # [7]
|
||||
sra a10,a10 # [8]
|
||||
s8i a10,a4,0 # [9] id:772
|
||||
retw.n # [10]
|
||||
|
||||
// Fallback for the odd-count_x test above: same ANSI call with the original
// arguments.
.LBB28_dspi_dotprod_s8_aes3: # 0x2aa
|
||||
mov.n a15,a7 # [0]
|
||||
mov.n a14,a6 # [1]
|
||||
mov.n a13,a5 # [2]
|
||||
mov.n a12,a4 # [3]
|
||||
mov.n a11,a3 # [4]
|
||||
mov.n a10,a2 # [5]
|
||||
call8 dspi_dotprod_s8_ansi # [6] dspi_dotprod_s8_ansi
|
||||
|
||||
#.LBB29_dspi_dotprod_s8_aes3: # 0x2b9
|
||||
mov.n a2,a10 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
|
||||
#endif // dspi_dotprod_aes3_enabled
|
||||
@@ -0,0 +1,49 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
|
||||
// Reference (portable C) 2-D dot product of two int8 images.
//
// Multiplies count_x * count_y samples of in_image with the matching samples
// of filter, accumulates the products in 32 bits, rounds, shifts the sum right
// by `shift` bits and stores the low 8 bits of the result in *out_value.
//
// in_image / filter - image descriptors; data, step_x, step_y, stride_x and
//                     stride_y are read
// out_value         - receives the rounded, shifted sum truncated to int8_t
// count_x, count_y  - number of samples per row / number of rows to process
// shift             - right shift applied to the sum; values <= 0 leave the
//                     sum unshifted and unrounded
//
// Returns ESP_ERR_DSP_PARAM_OUTOFRANGE when step * count exceeds the stride of
// either image in either direction, ESP_OK otherwise.
esp_err_t dspi_dotprod_s8_ansi(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift)
{
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    int8_t *i_data = (int8_t *)in_image->data;
    int8_t *f_data = (int8_t *)filter->data;
    // Distance between two processed rows, in samples.
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;

    int32_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            acc += (int16_t)i_data[in_image->step_x * x] * (int16_t)f_data[filter->step_x * x];
        }
        i_data += i_step;
        f_data += f_step;
    }

    // Shifting by a negative amount is undefined, so the rounding term
    // 1 << (shift - 1) only exists for shift >= 1.
    if (shift > 0) {
        acc += (int32_t)1 << (shift - 1); // round to nearest
        acc >>= shift;
    }
    *out_value = (int8_t)acc;
    return ESP_OK;
}
|
||||
@@ -0,0 +1,93 @@
|
||||
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_arp4_enabled == 1)
|
||||
#include "dsp_err_codes.h"
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.global dspi_dotprod_s8_arp4
|
||||
.global dspi_dotprod_s8_ansi
|
||||
.type dspi_dotprod_s8_arp4,@function
|
||||
|
||||
// esp_err_t dspi_dotprod_s8_arp4(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift);
|
||||
dspi_dotprod_s8_arp4:
// 2-D dot product of two int8 images using the vector extension.
// The sum of products is rounded, shifted right by `shift` and its low
// 8 bits are stored at out_value. The vector path returns 0 in a0.
// Inputs the vector path does not handle are forwarded, with the argument
// registers untouched, to dspi_dotprod_s8_ansi.
//
// Arguments:
//   in_image  - a0  (word offsets used: 0 data, 4 step_x, 8 step_y, 12 stride_x)
//   filter    - a1  (same layout)
//   out_value - a2  (int8_t *, same pointer the ANSI fallback receives)
//   count_x   - a3
//   count_y   - a4
//   shift     - a5
//
// Register usage on the vector path:
//   t0 - i_data, start of the current input row
//   t1 - f_data, start of the current filter row
//   t2 - i_step, input row advance in bytes
//   t3 - f_step, filter row advance in bytes
//   t4 - current i_data (also scratch for the rounding constant)
//   t5 - current f_data (also receives the final result)
//   t6 - count_x / 16, number of 16-sample vectors per row
//   a6 - scratch (shift - 1)
//
// NOTE(review): no alignment check is made on the data pointers or on the row
// advance before the 128-bit loads - confirm callers guarantee what
// esp.vld.128.ip requires.

    // Non-positive counts mean "no samples". The vector loops below would
    // still execute, so let the reference implementation handle them.
    blez    a3, .dspi_dotprod_s8_arp4_ansi
    blez    a4, .dspi_dotprod_s8_arp4_ansi

    lw      t1, 4(a0)               // in_image->step_x
    lw      t2, 4(a1)               // filter->step_x
    or      t1, t1, t2
    addi    t1, t1, -1              // 0 when (step_x | step_x) == 1
    andi    t2, a3, 15              // count_x % 16
    or      t1, t1, t2              // 0 only if unit step and count_x % 16 == 0

    beqz    t1, .dspi_dotprod_s8_arp4_body

.dspi_dotprod_s8_arp4_ansi:
    j       dspi_dotprod_s8_ansi    // tail call, a0-a5 still hold the arguments

.dspi_dotprod_s8_arp4_body:
    lw      t0, 0(a0)               // i_data
    lw      t1, 0(a1)               // f_data

    lw      t2, 8(a0)               // in_image->step_y
    lw      t4, 12(a0)              // in_image->stride_x
    mul     t2, t4, t2              // i_step in bytes (1 byte per sample)

    lw      t3, 8(a1)               // filter->step_y
    lw      t5, 12(a1)              // filter->stride_x
    mul     t3, t5, t3              // f_step in bytes (1 byte per sample)

    srli    t6, a3, 4               // t6 = count_x / 16

    esp.zero.xacc
    // The rounding term 1 << (shift - 1) exists only for shift >= 1.
    // sll uses the low 5 bits of the count, so shift == 0 would otherwise
    // preload 1 << 31 into the accumulator.
    blez    a5, .loop_count_y
    addi    a6, a5, -1
    li      t4, 1
    sll     t4, t4, a6
    esp.movx.w.xacc.l t4            // accumulator starts at the rounding term

.loop_count_y:
    mv      t4, t0
    mv      t5, t1
    esp.vld.128.ip q0, t4, 16       // q0 - i_data

    // Hardware loop 0: presumably repeats the instructions up to and
    // including the one at .loop_count_x t6 times - confirm against the
    // esp.lp.setup documentation.
    esp.lp.setup 0, t6, .loop_count_x
        esp.vld.128.ip q1, t5, 16   // q1 - f_data
.loop_count_x: esp.vmulas.s8.xacc.ld.ip q0, t4, 16, q0, q1 // q0 - i_data

    add     t0, t0, t2              // next input row
    add     t1, t1, t3              // next filter row
    addi    a4, a4, -1
    bgtz    a4, .loop_count_y

    esp.srs.s.xacc t5, a5           // shift accx register by final_shift amount (a5), save the lower 32bits to t5
    // out_value points at a single int8_t: store one byte only. A halfword
    // store would also overwrite the byte following the destination.
    sb      t5, 0(a2)               // store result to output buffer

    li      a0, 0
    ret
|
||||
|
||||
#endif // dspi_dotprod_arp4_enabled
|
||||
@@ -0,0 +1,371 @@
|
||||
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_aes3_enabled == 1)
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.literal .LC0_1_55, 458755
|
||||
|
||||
# Program Unit: dspi_dotprod_u16_aes3
|
||||
.type dspi_dotprod_u16_aes3, @function
|
||||
.align 4
|
||||
.global dspi_dotprod_u16_aes3
|
||||
dspi_dotprod_u16_aes3: # 0x4
|
||||
.LBB1_dspi_dotprod_u16_aes3: # 0x4
|
||||
entry a1,64 #
|
||||
l32i.n a10,a2,4 # [0] id:681
|
||||
l32i.n a11,a2,12 # [1] id:680
|
||||
mull a8,a10,a5 # [2]
|
||||
blt a11,a8,.LBB81_dspi_dotprod_u16_aes3 # [4]
|
||||
|
||||
l32i.n a12,a2,8 # [0] id:682
|
||||
l32i.n a9,a2,16 # [1] id:683
|
||||
mull a13,a12,a6 # [2]
|
||||
blt a9,a13,.LBB81_dspi_dotprod_u16_aes3 # [4]
|
||||
|
||||
l32i.n a15,a3,4 # [0] id:685
|
||||
l32i.n a14,a3,12 # [1] id:684
|
||||
mull a13,a15,a5 # [2]
|
||||
blt a14,a13,.LBB81_dspi_dotprod_u16_aes3 # [4]
|
||||
|
||||
l32i.n a8,a3,16 # [0] id:687
|
||||
l32i.n a9,a3,8 # [1] id:686
|
||||
s32i.n a9,a1,24 # [2] gra_spill_temp_2
|
||||
mull a9,a9,a6 # [3]
|
||||
blt a8,a9,.LBB81_dspi_dotprod_u16_aes3 # [5]
|
||||
|
||||
l32i.n a8,a3,0 # [0] id:688
|
||||
s32i.n a8,a1,20 # [1] gra_spill_temp_1
|
||||
bbsi a8,0,.Lt_0_34050 # [2]
|
||||
|
||||
bne a14,a13,.Lt_0_34050 # [0]
|
||||
|
||||
bnei a15,1,.Lt_0_34050 # [0]
|
||||
|
||||
l32i.n a9,a1,24 # [0] gra_spill_temp_2
|
||||
beqi a9,1,.Lt_0_18178 # [2]
|
||||
|
||||
.Lt_0_34050: # 0x43
|
||||
.Lt_0_18434: # 0x43
|
||||
mov.n a10,a2 # [0]
|
||||
mov.n a11,a3 # [1]
|
||||
mov.n a12,a4 # [2]
|
||||
mov.n a13,a5 # [3]
|
||||
mov.n a14,a6 # [4]
|
||||
mov.n a15,a7 # [5]
|
||||
.type dspi_dotprod_s16_ansi, @function
|
||||
call8 dspi_dotprod_s16_ansi # [6] dspi_dotprod_s16_ansi
|
||||
|
||||
mov.n a2,a10 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
.LBB81_dspi_dotprod_u16_aes3: # 0x56
|
||||
l32r a2,.LC0_1_55 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
.Lt_0_18178: # 0x5b
|
||||
addi.n a13,a10,-1 # [0]
|
||||
bnez a13,.Lt_0_34818 # [1]
|
||||
|
||||
addi.n a14,a12,-1 # [0]
|
||||
bnez a14,.Lt_0_34818 # [1]
|
||||
|
||||
extui a15,a5,0,3 # [0]
|
||||
bnez.n a15,.Lt_0_34818 # [1]
|
||||
|
||||
blti a6,4,.Lt_0_34818 # [0]
|
||||
|
||||
movi.n a8,32 # [0]
|
||||
bge a8,a5,.Lt_0_35330 # [1]
|
||||
|
||||
extui a9,a5,0,1 # [0]
|
||||
bnez a9,.LBB28_dspi_dotprod_u16_aes3 # [1]
|
||||
|
||||
.Lt_0_35330: # 0x78
|
||||
.Lt_0_20226: # 0x78
|
||||
mov.n a3,a6 # [0]
|
||||
addi a10,a5,-24 # [1]
|
||||
mull a13,a11,a12 # [2]
|
||||
l32i.n a15,a1,20 # [3] gra_spill_temp_1
|
||||
l32i.n a2,a2,0 # [4] id:689
|
||||
movi.n a14,0 # [5]
|
||||
wur.sar_byte a14 # [6]
|
||||
wur.accx_0 a14 # [8]
|
||||
wur.accx_1 a14 # [9]
|
||||
ee.vld.128.ip q0,a15,16 # [10] id:693
|
||||
slli a13,a13,1 # [11]
|
||||
s32i.n a13,a1,16 # [12] gra_spill_temp_0
|
||||
beqz a10,.LBB32_dspi_dotprod_u16_aes3 # [13]
|
||||
|
||||
.Lt_0_23298: # 0x99
|
||||
.Lt_0_22786: # 0x99
|
||||
addi a8,a5,-16 # [0]
|
||||
beqz a8,.LBB38_dspi_dotprod_u16_aes3 # [1]
|
||||
|
||||
.Lt_0_24834: # 0x9f
|
||||
.Lt_0_24322: # 0x9f
|
||||
addi a9,a5,-8 # [0]
|
||||
beqz a9,.LBB44_dspi_dotprod_u16_aes3 # [1]
|
||||
|
||||
.Lt_0_26370: # 0xa5
|
||||
.Lt_0_25858: # 0xa5
|
||||
addi a10,a5,-32 # [0]
|
||||
beqz a10,.LBB50_dspi_dotprod_u16_aes3 # [1]
|
||||
|
||||
.Lt_0_27906: # 0xab
|
||||
.Lt_0_27394: # 0xab
|
||||
addi a11,a5,-64 # [0]
|
||||
beqz a11,.LBB56_dspi_dotprod_u16_aes3 # [1]
|
||||
|
||||
movi.n a12,64 # [0]
|
||||
bge a12,a5,.Lt_0_30722 # [1]
|
||||
|
||||
movi.n a12,0 # [0]
|
||||
ee.ld.128.usar.ip q1,a2,16 # [1] id:765
|
||||
ee.ld.128.usar.ip q2,a2,16 # [2] id:766
|
||||
ee.src.q.ld.ip q3,a2,16,q1,q2 # [4] id:767
|
||||
beqz.n a3,.Lt_0_30722 # [5]
|
||||
|
||||
slli a8,a5,1 # [0]
|
||||
l32i.n a14,a1,16 # [1] gra_spill_temp_0
|
||||
addi a13,a5,31 # [2]
|
||||
movgez a13,a5,a5 # [3]
|
||||
srai a13,a13,5 # [4]
|
||||
sub a14,a14,a8 # [5]
|
||||
addi a14,a14,16 # [6]
|
||||
addi.n a13,a13,-1 # [7]
|
||||
|
||||
.Lt_0_31490: # 0xd9
|
||||
addi.n a12,a12,1 # [0]
|
||||
movi.n a9,32 # [1]
|
||||
beqz.n a13,.Lt_0_31746 # [2]
|
||||
|
||||
loopnez a13,.LBB221_dspi_dotprod_u16_aes3 # [0]
|
||||
|
||||
.LBB219_dspi_dotprod_u16_aes3: # 0xe2
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+0] id:769
|
||||
ee.vmulas.u16.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:768
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+2] id:771
|
||||
ee.vmulas.u16.accx.ld.ip.qup q1,a2,16,q5,q2,q3,q4 # [0*II+3] id:770
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+4] id:773
|
||||
ee.vmulas.u16.accx.ld.ip.qup q2,a2,16,q0,q3,q4,q1 # [0*II+5] id:772
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+6] id:775
|
||||
ee.vmulas.u16.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+7] id:774
|
||||
|
||||
.LBB221_dspi_dotprod_u16_aes3: # 0xfe
|
||||
|
||||
.Lt_0_31746: # 0xfe
|
||||
ee.vmulas.u16.accx.ld.ip.qup q5,a2,16,q0,q1,q2,q3 # [0] id:776
|
||||
movi.n a10,-16 # [1]
|
||||
ee.vld.128.ip q0,a15,16 # [2] id:777
|
||||
ee.vld.128.ip q6,a15,16 # [3] id:779
|
||||
ee.vmulas.u16.accx.ld.xp.qup q7,a2,a14,q0,q2,q3,q5 # [4] id:778
|
||||
ee.vld.128.ip q4,a15,16 # [5] id:782
|
||||
ee.vmulas.u16.accx.ld.xp.qup q2,a2,a10,q6,q3,q5,q7 # [6] id:780
|
||||
ee.ld.128.usar.xp q1,a2,a9 # [7] id:781
|
||||
ee.vld.128.ip q0,a15,16 # [8] id:784
|
||||
ee.vmulas.u16.accx.ld.ip.qup q3,a2,16,q4,q5,q1,q2 # [9] id:783
|
||||
bne a12,a3,.Lt_0_31490 # [10]
|
||||
|
||||
.Lt_0_30722: # 0x122
|
||||
.Lt_0_30466: # 0x122
|
||||
rur.accx_0 a9 # [0]
|
||||
rur.accx_1 a10 # [1]
|
||||
blti a7,1,.Lt_0_33282 # [2]
|
||||
|
||||
movi.n a2,0 # [0]
|
||||
addi a13,a7,-33 # [1]
|
||||
addi.n a14,a7,-1 # [2]
|
||||
ssr a14 # [3]
|
||||
sra a12,a10 # [4]
|
||||
src a11,a10,a9 # [5]
|
||||
movgez a11,a12,a13 # [6]
|
||||
addi.n a11,a11,1 # [7]
|
||||
srli a11,a11,1 # [8]
|
||||
s16i a11,a4,0 # [9] id:790
|
||||
retw.n # [10]
|
||||
|
||||
.Lt_0_34818: # 0x148
|
||||
.Lt_0_19458: # 0x148
|
||||
mov.n a10,a2 # [0]
|
||||
mov.n a11,a3 # [1]
|
||||
mov.n a12,a4 # [2]
|
||||
mov.n a13,a5 # [3]
|
||||
mov.n a14,a6 # [4]
|
||||
mov.n a15,a7 # [5]
|
||||
call8 dspi_dotprod_s16_ansi # [6] dspi_dotprod_s16_ansi
|
||||
|
||||
mov.n a2,a10 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
.LBB32_dspi_dotprod_u16_aes3: # 0x15b
|
||||
ee.ld.128.usar.ip q1,a2,16 # [0] id:694
|
||||
ee.ld.128.usar.ip q2,a2,16 # [1] id:695
|
||||
ee.src.q.ld.ip q3,a2,16,q1,q2 # [3] id:696
|
||||
beqz.n a6,.Lt_0_23298 # [4]
|
||||
|
||||
addi a12,a13,-32 # [0]
|
||||
movi.n a10,32 # [1]
|
||||
movi.n a11,-16 # [2]
|
||||
loopgtz a6,.LBB107_dspi_dotprod_u16_aes3 # [3]
|
||||
|
||||
.LBB105_dspi_dotprod_u16_aes3: # 0x170
|
||||
ee.vld.128.ip q4,a15,16 # [0*II+0] id:698
|
||||
ee.vmulas.u16.accx.ld.xp.qup q1,a2,a12,q0,q1,q2,q3 # [0*II+1] id:697
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+2] id:700
|
||||
ee.vmulas.u16.accx.ld.xp.qup q2,a2,a11,q4,q2,q3,q1 # [0*II+3] id:699
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+4] id:701
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+5] id:703
|
||||
ee.vmulas.u16.accx.ld.ip.qup q3,a2,16,q5,q3,q1,q2 # [0*II+6] id:702
|
||||
|
||||
.LBB107_dspi_dotprod_u16_aes3: # 0x188
|
||||
j .Lt_0_23298 # [0]
|
||||
|
||||
.LBB38_dspi_dotprod_u16_aes3: # 0x18b
|
||||
movi.n a10,32 # [0]
|
||||
movi.n a11,-16 # [1]
|
||||
srli a3,a6,1 # [2]
|
||||
l32i.n a12,a1,16 # [3] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a2,16 # [4] id:704
|
||||
ee.ld.128.usar.ip q2,a2,16 # [5] id:705
|
||||
addi a12,a12,-16 # [7]
|
||||
ee.src.q.ld.xp q3,a2,a12,q1,q2 # [8] id:706
|
||||
loopnez a3,.LBB130_dspi_dotprod_u16_aes3 # [9]
|
||||
|
||||
.LBB128_dspi_dotprod_u16_aes3: # 0x1a3
|
||||
ee.vld.128.ip q4,a15,16 # [0*II+0] id:708
|
||||
ee.vmulas.u16.accx.ld.xp.qup q3,a2,a11,q0,q1,q2,q3 # [0*II+1] id:707
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+2] id:709
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+3] id:711
|
||||
ee.vmulas.u16.accx.ld.xp.qup q4,a2,a12,q4,q2,q1,q3 # [0*II+4] id:710
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+5] id:713
|
||||
ee.vmulas.u16.accx.ld.xp.qup q2,a2,a11,q0,q1,q3,q4 # [0*II+6] id:712
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+7] id:714
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+8] id:716
|
||||
ee.vmulas.u16.accx.ld.xp.qup q3,a2,a12,q5,q3,q1,q2 # [0*II+9] id:715
|
||||
|
||||
.LBB130_dspi_dotprod_u16_aes3: # 0x1c5
|
||||
j .Lt_0_24834 # [0]
|
||||
|
||||
.LBB44_dspi_dotprod_u16_aes3: # 0x1c8
|
||||
srli a3,a3,2 # [0]
|
||||
movi.n a10,-16 # [1]
|
||||
l32i.n a11,a1,16 # [2] gra_spill_temp_0
|
||||
addi a8,a2,16 # [3]
|
||||
addi a11,a11,16 # [4]
|
||||
ee.ld.128.usar.xp q2,a8,a10 # [5] id:717
|
||||
ee.ld.128.usar.xp q1,a8,a11 # [6] id:718
|
||||
ee.src.q.ld.xp q3,a8,a10,q1,q2 # [8] id:719
|
||||
ee.ld.128.usar.xp q2,a8,a11 # [9] id:720
|
||||
loopnez a3,.LBB153_dspi_dotprod_u16_aes3 # [10]
|
||||
|
||||
.LBB151_dspi_dotprod_u16_aes3: # 0x1e4
|
||||
ee.vld.128.ip q4,a15,16 # [0*II+0] id:722
|
||||
ee.vmulas.u16.accx.ld.xp.qup q3,a8,a10,q0,q1,q2,q3 # [0*II+1] id:721
|
||||
ee.ld.128.usar.xp q1,a8,a11 # [0*II+2] id:723
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+3] id:725
|
||||
ee.vmulas.u16.accx.ld.xp.qup q4,a8,a10,q4,q2,q1,q3 # [0*II+4] id:724
|
||||
ee.ld.128.usar.xp q3,a8,a11 # [0*II+5] id:726
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+6] id:728
|
||||
ee.vmulas.u16.accx.ld.xp.qup q4,a8,a10,q0,q1,q3,q4 # [0*II+7] id:727
|
||||
ee.ld.128.usar.xp q1,a8,a11 # [0*II+8] id:729
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+9] id:731
|
||||
ee.vmulas.u16.accx.ld.xp.qup q3,a8,a10,q5,q3,q1,q4 # [0*II+10] id:730
|
||||
ee.ld.128.usar.xp q2,a8,a11 # [0*II+11] id:732
|
||||
|
||||
.LBB153_dspi_dotprod_u16_aes3: # 0x20c
|
||||
mov.n a2,a8 # [0]
|
||||
j .Lt_0_26370 # [1]
|
||||
|
||||
.LBB50_dspi_dotprod_u16_aes3: # 0x211
|
||||
movi.n a10,32 # [0]
|
||||
movi.n a11,-16 # [1]
|
||||
slli a13,a5,1 # [2]
|
||||
l32i.n a12,a1,16 # [3] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a2,16 # [4] id:733
|
||||
ee.ld.128.usar.ip q2,a2,16 # [5] id:734
|
||||
sub a12,a12,a13 # [6]
|
||||
ee.src.q.ld.ip q3,a2,16,q1,q2 # [8] id:735
|
||||
addi a12,a12,16 # [9]
|
||||
loopnez a3,.LBB176_dspi_dotprod_u16_aes3 # [10]
|
||||
|
||||
.LBB174_dspi_dotprod_u16_aes3: # 0x22c
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+0] id:737
|
||||
ee.vmulas.u16.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:736
|
||||
ee.vld.128.ip q1,a15,16 # [0*II+2] id:739
|
||||
ee.vmulas.u16.accx.ld.xp.qup q0,a2,a12,q5,q2,q3,q4 # [0*II+3] id:738
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+4] id:742
|
||||
ee.vmulas.u16.accx.ld.xp.qup q2,a2,a11,q1,q3,q4,q0 # [0*II+5] id:740
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+6] id:741
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+7] id:744
|
||||
ee.vmulas.u16.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+8] id:743
|
||||
|
||||
.LBB176_dspi_dotprod_u16_aes3: # 0x24b
|
||||
j .Lt_0_27906 # [0]
|
||||
|
||||
.LBB56_dspi_dotprod_u16_aes3: # 0x24e
|
||||
movi.n a10,32 # [0]
|
||||
movi.n a11,-16 # [1]
|
||||
slli a13,a5,1 # [2]
|
||||
l32i.n a12,a1,16 # [3] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a2,16 # [4] id:745
|
||||
ee.ld.128.usar.ip q2,a2,16 # [5] id:746
|
||||
sub a12,a12,a13 # [7]
|
||||
addi a12,a12,16 # [8]
|
||||
ee.src.q.ld.ip q3,a2,16,q1,q2 # [9] id:747
|
||||
loopnez a3,.LBB198_dspi_dotprod_u16_aes3 # [10]
|
||||
|
||||
.LBB196_dspi_dotprod_u16_aes3: # 0x269
|
||||
ee.vld.128.ip q4,a15,16 # [0*II+0] id:749
|
||||
ee.vmulas.u16.accx.ld.ip.qup q1,a2,16,q0,q1,q2,q3 # [0*II+1] id:748
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+2] id:751
|
||||
ee.vmulas.u16.accx.ld.ip.qup q4,a2,16,q4,q2,q3,q1 # [0*II+3] id:750
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+4] id:753
|
||||
ee.vmulas.u16.accx.ld.ip.qup q0,a2,16,q0,q3,q1,q4 # [0*II+5] id:752
|
||||
ee.vld.128.ip q6,a15,16 # [0*II+6] id:755
|
||||
ee.vmulas.u16.accx.ld.ip.qup q1,a2,16,q5,q1,q4,q0 # [0*II+7] id:754
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+8] id:757
|
||||
ee.vmulas.u16.accx.ld.ip.qup q4,a2,16,q6,q4,q0,q1 # [0*II+9] id:756
|
||||
ee.vld.128.ip q6,a15,16 # [0*II+10] id:759
|
||||
ee.vmulas.u16.accx.ld.xp.qup q0,a2,a12,q5,q0,q1,q4 # [0*II+11] id:758
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+12] id:762
|
||||
ee.vmulas.u16.accx.ld.xp.qup q2,a2,a11,q6,q1,q4,q0 # [0*II+13] id:760
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+14] id:761
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+15] id:764
|
||||
ee.vmulas.u16.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+16] id:763
|
||||
|
||||
.LBB198_dspi_dotprod_u16_aes3: # 0x2a4
|
||||
j .Lt_0_30722 # [0]
|
||||
|
||||
.Lt_0_33282: # 0x2a7
|
||||
movi.n a2,0 # [0]
|
||||
sext a14,a9,15 # [1]
|
||||
s16i a14,a4,0 # [2] id:791
|
||||
retw.n # [3]
|
||||
|
||||
.LBB28_dspi_dotprod_u16_aes3: # 0x2b1
|
||||
mov.n a15,a7 # [0]
|
||||
mov.n a14,a6 # [1]
|
||||
mov.n a13,a5 # [2]
|
||||
mov.n a12,a4 # [3]
|
||||
mov.n a11,a3 # [4]
|
||||
mov.n a10,a2 # [5]
|
||||
call8 dspi_dotprod_s16_ansi # [6] dspi_dotprod_s16_ansi
|
||||
|
||||
mov.n a2,a10 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
#endif // dsps_dotprod_s16_aes3_enabled
|
||||
@@ -0,0 +1,49 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
|
||||
/**
 * Reference (ANSI C) 2-D dot product of two uint16_t images.
 *
 * Multiplies count_x * count_y samples of in_image with the matching samples
 * of filter, accumulates the products in a 64-bit sum, rounds, scales the sum
 * down by `shift` bits and stores the low 16 bits in *out_value.
 *
 * @param in_image   input image descriptor (data, step_x/step_y, stride_x/stride_y)
 * @param filter     filter descriptor, same layout as in_image
 * @param out_value  receives the scaled result
 * @param count_x    number of samples taken from every row
 * @param count_y    number of rows
 * @param shift      right shift applied to the sum; for shift <= 0 the sum is
 *                   stored without rounding or scaling
 *
 * @return ESP_OK on success, ESP_ERR_DSP_PARAM_OUTOFRANGE when step * count
 *         exceeds the corresponding stride of either image.
 */
esp_err_t dspi_dotprod_u16_ansi(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift)
{
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    uint16_t *i_data = (uint16_t *)in_image->data;
    uint16_t *f_data = (uint16_t *)filter->data;
    // Distance, in samples, between the starts of two consecutive processed rows
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;

    int64_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            // Multiply as unsigned 32-bit: 65535 * 65535 does not fit in int32_t,
            // but always fits in uint32_t.
            acc += (uint32_t)i_data[in_image->step_x * x] * (uint32_t)f_data[filter->step_x * x];
        }
        i_data += i_step;
        f_data += f_step;
    }

    if (shift > 0) {
        // Round to nearest; the constant is built in 64 bits so that
        // shift values above 31 do not overflow an int.
        acc += (int64_t)1 << (shift - 1);
        acc >>= shift;
    }
    *out_value = (uint16_t)acc;
    return ESP_OK;
}
|
||||
@@ -0,0 +1,95 @@
|
||||
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_arp4_enabled == 1)
|
||||
#include "dsp_err_codes.h"
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.global dspi_dotprod_u16_arp4
|
||||
.global dspi_dotprod_u16_ansi
|
||||
.type dspi_dotprod_u16_arp4,@function
|
||||
|
||||
// esp_err_t dspi_dotprod_u16_arp4(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift);
//
// Dot product of two uint16_t images using the esp.* vector instructions.
// Tail-jumps to dspi_dotprod_u16_ansi (arguments untouched) unless
// (in_image->step_x | filter->step_x) == 1 and count_x is a multiple of 8.
//
// Arguments:
//   a0 - in_image      a1 - filter       a2 - out_value
//   a3 - count_x       a4 - count_y      a5 - shift
// Scratch registers:
//   t0 / t1 - start of the current image / filter row
//   t2 / t3 - byte distance between rows (stride_x * step_y * 2)
//   t4 / t5 - read cursors inside the current row
//   t6      - hardware-loop count per row (count_x / 8)
//   a6      - shift - 1
// The vector path returns 0 in a0.
dspi_dotprod_u16_arp4:
    lw      t1, 4(a0)                       // in_image->step_x
    lw      t2, 4(a1)                       // filter->step_x
    or      t1, t1, t2
    addi    t1, t1, -1                      // 0 when the OR of both steps is exactly 1
    andi    t2, a3, 7                       // count_x % 8
    or      t1, t1, t2                      // non-zero => vector path not applicable

    beqz    t1, .dspi_dotprod_u16_arp4_body
    j       dspi_dotprod_u16_ansi

.dspi_dotprod_u16_arp4_body:
    addi    sp, sp, -16                     // frame is reserved here and released before ret; nothing is stored in it

    // Image: data pointer and row pitch in bytes
    lw      t0, 0(a0)                       // in_image->data
    lw      t2, 8(a0)                       // in_image->step_y
    lw      t4, 12(a0)                      // in_image->stride_x
    mul     t2, t4, t2
    slli    t2, t2, 1                       // samples -> bytes

    // Filter: data pointer and row pitch in bytes
    lw      t1, 0(a1)                       // filter->data
    lw      t3, 8(a1)                       // filter->step_y
    lw      t5, 12(a1)                      // filter->stride_x
    mul     t3, t5, t3
    slli    t3, t3, 1                       // samples -> bytes

    srli    t6, a3, 3                       // t6 = count_x / 8

    // Start the accumulator at the rounding constant 1 << (shift - 1)
    li      t4, 1
    addi    a6, a5, -1
    sll     t4, t4, a6
    esp.zero.xacc
    esp.movx.w.xacc.l t4

.loop_count_y:
    mv      t4, t0                          // image cursor for this row
    mv      t5, t1                          // filter cursor for this row
    esp.vld.128.ip q0, t4, 16               // q0 = first image vector of the row

    esp.lp.setup 0, t6, .loop_count_x
    esp.vld.128.ip q1, t5, 16               // q1 = next filter vector
.loop_count_x:
    esp.vmulas.u16.xacc.ld.ip q0, t4, 16, q0, q1   // accumulate q0 * q1, reload q0 from the image

    add     t0, t0, t2                      // advance both row starts
    add     t1, t1, t3
    addi    a4, a4, -1
    bgtz    a4, .loop_count_y

    esp.srs.u.xacc t5, a5                   // shift the accumulator right by a5, low 32 bits -> t5
    sh      t5, 0(a2)                       // *out_value = low 16 bits

    li      a0, 0
    addi    sp, sp, 16
    ret
|
||||
|
||||
#endif // dspi_dotprod_arp4_enabled
|
||||
@@ -0,0 +1,367 @@
|
||||
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_aes3_enabled == 1)
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.literal .LC0_1_52, 458755
|
||||
|
||||
.type dspi_dotprod_u8_aes3, @function
|
||||
.align 4
|
||||
.global dspi_dotprod_u8_aes3
|
||||
// dspi_dotprod_u8_aes3 - 2-D dot product of two uint8_t images using the
// ee.* vector instructions (16 bytes per ee.vmulas.u8.accx).
//
// Arguments (forwarded unchanged to dspi_dotprod_u8_ansi on every fallback):
//   a2 - in_image   a3 - filter   a4 - out_value
//   a5 - count_x    a6 - count_y  a7 - shift
// Descriptor words read here: +0 data, +4 step_x, +8 step_y, +12 stride_x,
// +16 stride_y (names taken from the matching checks in dspi_dotprod_u8_ansi).
//
// Returns in a2: the literal .LC0_1_52 when step * count exceeds a stride
// (presumably ESP_ERR_DSP_PARAM_OUTOFRANGE, which the ANSI version returns
// for the same checks), 0 after the vector path, otherwise the ANSI result.
//
// The vector path is used only when all of the following hold; anything else
// is handed to dspi_dotprod_u8_ansi:
//   * bit 0 of filter->data is clear
//   * filter->stride_x == filter->step_x * count_x
//   * all four step values are 1
//   * count_x is a multiple of 16 and count_y >= 4
//   * count_x == 16 requires count_y % 4 == 0 (four rows per loop pass)
//   * count_x == 32 requires count_y % 2 == 0 (two rows per loop pass)
//   * count_x > 64 requires count_x % 64 == 0 (64 bytes per loop pass)
//
// Stack slots: a1+0 in_image stride_x * step_y, a1+4 filter->data,
//              a1+8 filter->step_y.
dspi_dotprod_u8_aes3:
.LBB1_dspi_dotprod_u8_aes3:
    entry   a1,48

    // in_image: step_x * count_x must not exceed stride_x
    l32i.n  a10,a2,4
    l32i.n  a11,a2,12
    mull    a8,a10,a5
    blt     a11,a8,.LBB78_dspi_dotprod_u8_aes3

    // in_image: step_y * count_y must not exceed stride_y
    l32i.n  a12,a2,8
    l32i.n  a9,a2,16
    mull    a13,a12,a6
    blt     a9,a13,.LBB78_dspi_dotprod_u8_aes3

    // filter: step_x * count_x must not exceed stride_x
    l32i.n  a15,a3,4
    l32i.n  a14,a3,12
    mull    a13,a15,a5
    blt     a14,a13,.LBB78_dspi_dotprod_u8_aes3

    // filter: step_y * count_y must not exceed stride_y
    l32i.n  a8,a3,16
    l32i.n  a9,a3,8
    s32i.n  a9,a1,8                     // keep filter->step_y
    mull    a9,a9,a6
    blt     a8,a9,.LBB78_dspi_dotprod_u8_aes3

    l32i.n  a8,a3,0
    s32i.n  a8,a1,4                     // keep filter->data
    bbsi    a8,0,.Lt_0_33026            // bit 0 of filter->data set -> ANSI

    bne     a14,a13,.Lt_0_33026         // filter rows must be back to back

    bnei    a15,1,.Lt_0_33026           // filter->step_x must be 1

    l32i.n  a13,a1,8
    beqi    a13,1,.Lt_0_17666           // filter->step_y == 1 -> keep checking

// Fallback: call the ANSI implementation with the original six arguments
.Lt_0_33026:
.Lt_0_17922:
    mov.n   a10,a2
    mov.n   a11,a3
    mov.n   a12,a4
    mov.n   a13,a5
    mov.n   a14,a6
    mov.n   a15,a7
    .type   dspi_dotprod_u8_ansi, @function
    call8   dspi_dotprod_u8_ansi

    mov.n   a2,a10
    retw.n

// A stride check failed
.LBB78_dspi_dotprod_u8_aes3:
    l32r    a2,.LC0_1_52
    retw.n

.Lt_0_17666:
    addi.n  a14,a10,-1
    bnez    a14,.Lt_0_33794             // in_image->step_x must be 1

    addi.n  a15,a12,-1
    bnez    a15,.Lt_0_33794             // in_image->step_y must be 1

    extui   a8,a5,0,4
    bnez.n  a8,.Lt_0_33794              // count_x must be a multiple of 16

    blti    a6,4,.Lt_0_33794            // count_y must be at least 4

    // The 16-wide loop consumes four rows per pass and the 32-wide loop two;
    // row counts that do not divide evenly would lose the trailing rows.
    bnei    a5,16,.Lrows32_dspi_dotprod_u8_aes3
    extui   a8,a6,0,2
    bnez    a8,.Lt_0_33794
.Lrows32_dspi_dotprod_u8_aes3:
    bnei    a5,32,.Lwide_dspi_dotprod_u8_aes3
    extui   a8,a6,0,1
    bnez    a8,.Lt_0_33794
.Lwide_dspi_dotprod_u8_aes3:

    movi.n  a9,64
    bge     a9,a5,.Lt_0_34306

    // Rows wider than 64 bytes are processed in 64-byte passes (128 has its
    // own loop); widths such as 80, 96 or 112 match no vector loop at all.
    extui   a10,a5,0,6
    bnez    a10,.LBB28_dspi_dotprod_u8_aes3

.Lt_0_34306:
.Lt_0_19714:
    mov.n   a3,a6                       // a3 = count_y (row counter)
    addi    a13,a5,-48
    movi.n  a14,0
    mull    a15,a11,a12                 // in_image stride_x * step_y
    l32i.n  a2,a2,0                     // a2 = in_image->data
    s32i.n  a15,a1,0
    wur.accx_0 a14                      // clear the accumulator
    l32i.n  a15,a1,4                    // a15 = filter->data
    wur.accx_1 a14
    ee.vld.128.ip q0,a15,16             // q0 = first filter vector
    beqz    a13,.LBB32_dspi_dotprod_u8_aes3     // count_x == 48

.Lt_0_22786:
.Lt_0_22274:
    addi    a8,a5,-32
    beqz    a8,.LBB38_dspi_dotprod_u8_aes3      // count_x == 32

.Lt_0_24322:
.Lt_0_23810:
    addi    a9,a5,-16
    beqz    a9,.LBB44_dspi_dotprod_u8_aes3      // count_x == 16

.Lt_0_25858:
.Lt_0_25346:
    addi    a10,a5,-64
    beqz    a10,.LBB50_dspi_dotprod_u8_aes3     // count_x == 64

.Lt_0_27394:
.Lt_0_26882:
    addi    a11,a5,-128
    beqz    a11,.LBB56_dspi_dotprod_u8_aes3     // count_x == 128

    movi    a12,128
    bge     a12,a5,.Lt_0_30210          // nothing (more) to accumulate

    // Generic path: count_x is a multiple of 64 above 128
    movi.n  a12,0                       // a12 = rows done
    ee.ld.128.usar.ip q1,a2,16
    ee.ld.128.usar.ip q2,a2,16
    ee.src.q.ld.ip q3,a2,16,q1,q2
    beqz.n  a3,.Lt_0_30210

    l32i.n  a14,a1,0
    // Each pass of the inner loop and the row tail both consume 64 bytes,
    // so a row needs count_x / 64 - 1 inner passes.
    addi    a13,a5,63
    movgez  a13,a5,a5
    srai    a13,a13,6
    sub     a14,a14,a5
    addi    a14,a14,16                  // a14 = row pitch - count_x + 16
    addi.n  a13,a13,-1

.Lt_0_30978:
    addi.n  a12,a12,1
    movi.n  a8,32
    movi.n  a9,-16
    beqz.n  a13,.Lt_0_31234

    loopnez a13,.LBB218_dspi_dotprod_u8_aes3

.LBB216_dspi_dotprod_u8_aes3:
    ee.vld.128.ip q5,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3
    ee.vld.128.ip q0,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q1,a2,16,q5,q2,q3,q4
    ee.vld.128.ip q5,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q2,a2,16,q0,q3,q4,q1
    ee.vld.128.ip q0,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2

.LBB218_dspi_dotprod_u8_aes3:

    // Last 64 bytes of the row; the loads step over to the next row
.Lt_0_31234:
    ee.vmulas.u8.accx.ld.ip.qup q5,a2,16,q0,q1,q2,q3
    ee.vld.128.ip q0,a15,16
    ee.vld.128.ip q6,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q7,a2,a14,q0,q2,q3,q5
    ee.vld.128.ip q4,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q2,a2,a9,q6,q3,q5,q7
    ee.ld.128.usar.xp q1,a2,a8
    ee.vld.128.ip q0,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q3,a2,16,q4,q5,q1,q2
    bne     a12,a3,.Lt_0_30978

    // Round, scale and store: *out_value = (accx_0 + (1 << (shift - 1))) >> shift
.Lt_0_30210:
.Lt_0_29954:
    movi.n  a2,0                        // return value 0
    rur.accx_0 a10
    addi.n  a12,a7,-1
    movi.n  a11,1
    ssl     a12
    sll     a11,a11
    ssr     a7
    add.n   a10,a10,a11
    srl     a10,a10
    s8i     a10,a4,0
    retw.n

// Fallback: call the ANSI implementation with the original six arguments
.Lt_0_33794:
.Lt_0_18946:
    mov.n   a10,a2
    mov.n   a11,a3
    mov.n   a12,a4
    mov.n   a13,a5
    mov.n   a14,a6
    mov.n   a15,a7
    call8   dspi_dotprod_u8_ansi

    mov.n   a2,a10
    retw.n

// count_x == 48: one row (three vectors) per loop pass
.LBB32_dspi_dotprod_u8_aes3:
    ee.ld.128.usar.ip q1,a2,16
    ee.ld.128.usar.ip q2,a2,16
    ee.src.q.ld.ip q3,a2,16,q1,q2
    beqz.n  a6,.Lt_0_22786

    movi.n  a10,32
    l32i.n  a12,a1,0
    movi.n  a11,-16
    addi    a12,a12,-32
    loopgtz a6,.LBB104_dspi_dotprod_u8_aes3

.LBB102_dspi_dotprod_u8_aes3:
    ee.vld.128.ip q4,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q1,a2,a12,q0,q1,q2,q3
    ee.vld.128.ip q5,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q2,a2,a11,q4,q2,q3,q1
    ee.ld.128.usar.xp q1,a2,a10
    ee.vld.128.ip q0,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q3,a2,16,q5,q3,q1,q2

.LBB104_dspi_dotprod_u8_aes3:
    j       .Lt_0_22786

// count_x == 32: two rows (four vectors) per loop pass
.LBB38_dspi_dotprod_u8_aes3:
    movi.n  a10,32
    movi.n  a11,-16
    srli    a3,a6,1                     // passes = count_y / 2
    l32i.n  a12,a1,0
    ee.ld.128.usar.ip q1,a2,16
    ee.ld.128.usar.ip q2,a2,16
    addi    a12,a12,-16
    ee.src.q.ld.xp q3,a2,a12,q1,q2
    loopnez a3,.LBB127_dspi_dotprod_u8_aes3

.LBB125_dspi_dotprod_u8_aes3:
    ee.vld.128.ip q4,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q3,a2,a11,q0,q1,q2,q3
    ee.ld.128.usar.xp q1,a2,a10
    ee.vld.128.ip q0,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q4,a2,a12,q4,q2,q1,q3
    ee.vld.128.ip q5,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q2,a2,a11,q0,q1,q3,q4
    ee.ld.128.usar.xp q1,a2,a10
    ee.vld.128.ip q0,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q3,a2,a12,q5,q3,q1,q2

.LBB127_dspi_dotprod_u8_aes3:
    j       .Lt_0_24322

// count_x == 16: four rows (four vectors) per loop pass
.LBB44_dspi_dotprod_u8_aes3:
    srli    a3,a3,2                     // passes = count_y / 4
    movi.n  a10,-16
    l32i.n  a11,a1,0
    addi    a8,a2,16
    addi    a11,a11,16
    ee.ld.128.usar.xp q2,a8,a10
    ee.ld.128.usar.xp q1,a8,a11
    ee.src.q.ld.xp q3,a8,a10,q1,q2
    ee.ld.128.usar.xp q2,a8,a11
    loopnez a3,.LBB150_dspi_dotprod_u8_aes3

.LBB148_dspi_dotprod_u8_aes3:
    ee.vld.128.ip q4,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q3,a8,a10,q0,q1,q2,q3
    ee.ld.128.usar.xp q1,a8,a11
    ee.vld.128.ip q0,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q4,a8,a10,q4,q2,q1,q3
    ee.ld.128.usar.xp q3,a8,a11
    ee.vld.128.ip q5,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q4,a8,a10,q0,q1,q3,q4
    ee.ld.128.usar.xp q1,a8,a11
    ee.vld.128.ip q0,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q3,a8,a10,q5,q3,q1,q4
    ee.ld.128.usar.xp q2,a8,a11

.LBB150_dspi_dotprod_u8_aes3:
    mov.n   a2,a8
    j       .Lt_0_25858

// count_x == 64: one row (four vectors) per loop pass
.LBB50_dspi_dotprod_u8_aes3:
    movi.n  a10,32
    movi.n  a11,-16
    l32i.n  a12,a1,0
    ee.ld.128.usar.ip q1,a2,16
    ee.ld.128.usar.ip q2,a2,16
    sub     a12,a12,a5
    ee.src.q.ld.ip q3,a2,16,q1,q2
    addi    a12,a12,16                  // a12 = row pitch - count_x + 16
    loopnez a3,.LBB173_dspi_dotprod_u8_aes3

.LBB171_dspi_dotprod_u8_aes3:
    ee.vld.128.ip q5,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3
    ee.vld.128.ip q1,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q0,a2,a12,q5,q2,q3,q4
    ee.vld.128.ip q5,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q2,a2,a11,q1,q3,q4,q0
    ee.ld.128.usar.xp q1,a2,a10
    ee.vld.128.ip q0,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2

.LBB173_dspi_dotprod_u8_aes3:
    j       .Lt_0_27394

// count_x == 128: one row (eight vectors) per loop pass
.LBB56_dspi_dotprod_u8_aes3:
    movi.n  a10,32
    movi.n  a11,-16
    l32i.n  a12,a1,0
    ee.ld.128.usar.ip q1,a2,16
    ee.ld.128.usar.ip q2,a2,16
    sub     a12,a12,a5
    addi    a12,a12,16                  // a12 = row pitch - count_x + 16
    ee.src.q.ld.ip q3,a2,16,q1,q2
    loopnez a3,.LBB195_dspi_dotprod_u8_aes3

.LBB193_dspi_dotprod_u8_aes3:
    ee.vld.128.ip q4,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q1,a2,16,q0,q1,q2,q3
    ee.vld.128.ip q0,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q4,a2,16,q4,q2,q3,q1
    ee.vld.128.ip q5,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q0,a2,16,q0,q3,q1,q4
    ee.vld.128.ip q6,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q1,a2,16,q5,q1,q4,q0
    ee.vld.128.ip q5,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q4,a2,16,q6,q4,q0,q1
    ee.vld.128.ip q6,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q0,a2,a12,q5,q0,q1,q4
    ee.vld.128.ip q5,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q2,a2,a11,q6,q1,q4,q0
    ee.ld.128.usar.xp q1,a2,a10
    ee.vld.128.ip q0,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2

    // Round, scale and store (same sequence as at .Lt_0_30210)
.LBB195_dspi_dotprod_u8_aes3:
    movi.n  a2,0                        // return value 0
    movi.n  a11,1
    addi.n  a12,a7,-1
    rur.accx_0 a10
    ssl     a12
    sll     a11,a11
    ssr     a7
    add.n   a10,a10,a11
    srl     a10,a10
    s8i     a10,a4,0
    retw.n

// Fallback for wide rows whose width is not a multiple of 64
.LBB28_dspi_dotprod_u8_aes3:
    mov.n   a15,a7
    mov.n   a14,a6
    mov.n   a13,a5
    mov.n   a12,a4
    mov.n   a11,a3
    mov.n   a10,a2
    call8   dspi_dotprod_u8_ansi

    mov.n   a2,a10
    retw.n
|
||||
|
||||
|
||||
#endif // dspi_dotprod_aes3_enabled
|
||||
@@ -0,0 +1,49 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
|
||||
/**
 * Reference (ANSI C) 2-D dot product of two uint8_t images.
 *
 * Multiplies count_x * count_y samples of in_image with the matching samples
 * of filter, accumulates the products in a 64-bit sum, rounds, scales the sum
 * down by `shift` bits and stores the low 8 bits in *out_value.
 *
 * @param in_image   input image descriptor (data, step_x/step_y, stride_x/stride_y)
 * @param filter     filter descriptor, same layout as in_image
 * @param out_value  receives the scaled result
 * @param count_x    number of samples taken from every row
 * @param count_y    number of rows
 * @param shift      right shift applied to the sum; for shift <= 0 the sum is
 *                   stored without rounding or scaling
 *
 * @return ESP_OK on success, ESP_ERR_DSP_PARAM_OUTOFRANGE when step * count
 *         exceeds the corresponding stride of either image.
 */
esp_err_t dspi_dotprod_u8_ansi(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift)
{
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    uint8_t *i_data = (uint8_t *)in_image->data;
    uint8_t *f_data = (uint8_t *)filter->data;
    // Distance, in samples, between the starts of two consecutive processed rows
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;

    // 64-bit accumulator: a 32-bit signed sum overflows after roughly 33000
    // products of 255 * 255, which a 256 x 256 image already exceeds.
    int64_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            acc += (int32_t)i_data[in_image->step_x * x] * (int32_t)f_data[filter->step_x * x];
        }
        i_data += i_step;
        f_data += f_step;
    }

    if (shift > 0) {
        // Round to nearest; shifting by (shift - 1) is undefined for shift == 0
        acc += (int64_t)1 << (shift - 1);
        acc >>= shift;
    }
    *out_value = (uint8_t)acc;
    return ESP_OK;
}
|
||||
@@ -0,0 +1,93 @@
|
||||
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_arp4_enabled == 1)
|
||||
#include "dsp_err_codes.h"
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.global dspi_dotprod_u8_arp4
|
||||
.global dspi_dotprod_u8_ansi
|
||||
.type dspi_dotprod_u8_arp4,@function
|
||||
|
||||
// esp_err_t dspi_dotprod_u8_arp4(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift);
//
// Dot product of two uint8_t images using the esp.* vector instructions.
// Tail-jumps to dspi_dotprod_u8_ansi (arguments untouched) unless
// (in_image->step_x | filter->step_x) == 1 and count_x is a multiple of 16.
//
// Arguments:
//   a0 - in_image      a1 - filter       a2 - out_value
//   a3 - count_x       a4 - count_y      a5 - shift
// Scratch registers:
//   t0 / t1 - start of the current image / filter row
//   t2 / t3 - byte distance between rows (stride_x * step_y)
//   t4 / t5 - read cursors inside the current row
//   t6      - hardware-loop count per row (count_x / 16)
//   a6      - shift - 1
// The vector path returns 0 in a0. It uses no stack.
dspi_dotprod_u8_arp4:
    lw      t1, 4(a0)                       // in_image->step_x
    lw      t2, 4(a1)                       // filter->step_x
    or      t1, t1, t2
    addi    t1, t1, -1                      // 0 when the OR of both steps is exactly 1
    andi    t2, a3, 15                      // count_x % 16
    or      t1, t1, t2                      // non-zero => vector path not applicable

    beqz    t1, .dspi_dotprod_u8_arp4_body
    j       dspi_dotprod_u8_ansi

.dspi_dotprod_u8_arp4_body:
    lw      t0, 0(a0)                       // i_data
    lw      t1, 0(a1)                       // f_data

    lw      t2, 8(a0)                       // in_image->step_y
    lw      t4, 12(a0)                      // in_image->stride_x
    mul     t2, t4, t2                      // i_step (bytes, one byte per sample)

    lw      t3, 8(a1)                       // filter->step_y
    lw      t5, 12(a1)                      // filter->stride_x
    mul     t3, t5, t3                      // f_step (bytes, one byte per sample)

    srli    t6, a3, 4                       // t6 = count_x / 16

    // Start the accumulator at the rounding constant 1 << (shift - 1)
    addi    a6, a5, -1
    li      t4, 1
    sll     t4, t4, a6
    esp.zero.xacc
    esp.movx.w.xacc.l t4

.loop_count_y:
    mv      t4, t0                          // image cursor for this row
    mv      t5, t1                          // filter cursor for this row
    esp.vld.128.ip q0, t4, 16               // q0 = first image vector of the row

    esp.lp.setup 0, t6, .loop_count_x
    esp.vld.128.ip q1, t5, 16               // q1 = next filter vector
.loop_count_x: esp.vmulas.u8.xacc.ld.ip q0, t4, 16, q0, q1 // accumulate q0 * q1, reload q0 from the image

    add     t0, t0, t2                      // advance both row starts
    add     t1, t1, t3
    addi    a4, a4, -1
    bgtz    a4, .loop_count_y

    esp.srs.u.xacc t5, a5                   // shift the accumulator right by a5, low 32 bits -> t5
    // out_value is a uint8_t *: store exactly one byte. A halfword store
    // would also overwrite the byte that follows *out_value.
    sb      t5, 0(a2)

    li      a0, 0
    ret
|
||||
|
||||
#endif // dspi_dotprod_arp4_enabled
|
||||
@@ -0,0 +1,80 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_dotprod_platform.h"
|
||||
#if (dsps_dotprod_s16_ae32_enabled == 1)
|
||||
#include "dsps_dotprod_s16_m_ae32.S"
|
||||
#include "dsp_err_codes.h"
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.global dsps_dotprod_s16_ae32
|
||||
.type dsps_dotprod_s16_ae32,@function
|
||||
|
||||
|
||||
//esp_err_t dsps_dotprod_s16_ae32(const int16_t* src1, const int16_t* src2, int16_t* dest, int len, int8_t shift);
//
// Dot product of two int16_t arrays, accumulated in ACCHI:ACCLO by the
// dotprod_s16_ae32_full macro (defined in dsps_dotprod_s16_m_ae32.S).
//
// Arguments:
//   a2 - src1    a3 - src2    a4 - dest    a5 - len    a6 - shift
// Returns in a2: 0 on success, ESP_ERR_DSP_INVALID_LENGTH when len < 4.
dsps_dotprod_s16_ae32:
    entry   a1, 16

    // At least four samples are required
    blti    a5, 4, dsps_dotprod_s16_ae32_error

    // Values handed to the macro: a7 = len / 4 - 1, a10 = 0.
    // a10 is not referenced again in this routine; the original note says
    // it is used to increment the second array.
    srli    a7, a5, 2
    addi    a7, a7, -1
    movi.n  a10, 0

    // Accumulator starts at the rounding term: ACCHI = 0, ACCLO = 0x7fff >> shift
    movi    a8, 0
    wsr     a8, acchi
    movi    a8, 0x7fff
    ssr     a6
    srl     a8, a8
    wsr     a8, acclo

    // Final right-shift amount: a6 = 15 - shift
    neg     a6, a6
    addi    a6, a6, 15

    dotprod_s16_ae32_full a2, a3, a7, a5

    // Funnel-shift ACCHI:ACCLO right by a6 and store the low 16 bits
    ssr     a6
    rsr     a2, acchi
    rsr     a3, acclo
    src     a2, a2, a3

    s16i    a2, a4, 0
    movi.n  a2, 0
    retw.n

dsps_dotprod_s16_ae32_error:
    movi.n  a2, ESP_ERR_DSP_INVALID_LENGTH
    retw.n
|
||||
|
||||
#endif // dsps_dotprod_s16_ae32_enabled
|
||||
@@ -0,0 +1,33 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_dotprod.h"
|
||||
|
||||
esp_err_t dsps_dotprod_s16_ansi(const int16_t *src1, const int16_t *src2, int16_t *dest, int len, int8_t shift)
|
||||
{
|
||||
// To make correct round operation we have to shift round value
|
||||
long long acc = 0x7fff >> shift;
|
||||
|
||||
for (int i = 0 ; i < len ; i++) {
|
||||
acc += (int32_t)src1[i] * (int32_t)src2[i];
|
||||
}
|
||||
|
||||
int final_shift = shift - 15;
|
||||
if (final_shift > 0) {
|
||||
*dest = (acc << final_shift);
|
||||
} else {
|
||||
*dest = (acc >> (-final_shift));
|
||||
}
|
||||
return ESP_OK;
|
||||
}
|
||||
@@ -0,0 +1,74 @@
|
||||
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_dotprod_platform.h"
|
||||
#if (dsps_dotprod_s16_arp4_enabled == 1)
|
||||
#include "dsp_err_codes.h"
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.global dsps_dotprod_s16_arp4
|
||||
.global dsps_dotprod_s16_ansi
|
||||
.type dsps_dotprod_s16_arp4,@function
|
||||
|
||||
//esp_err_t dsps_dotprod_s16_arp4(const int16_t* src1, const int16_t* src2, int16_t* dest, int len, int8_t shift);
|
||||
dsps_dotprod_s16_arp4:
// Vectorised int16 dot product; same arguments as dsps_dotprod_s16_ansi.
//
// Register use:
//   a0 - src1 on entry, return value on exit
//   a1 - src2
//   a2 - dest
//   a3 - len
//   a4 - shift
//   a5 - scratch (len & 7)
//   t3 - running src1 pointer (also holds the rounding value during setup)
//   t4 - running src2 pointer
//   t5 - loop counter (len >> 3), then the shifted result
//   t6 - scratch for the cfg register, then (15 - shift)
//
// Only lengths that are a positive multiple of 8 take the vector path.
// Every other length is handed to dsps_dotprod_s16_ansi with a tail jump;
// a0..a4 are still untouched at that point.

    // The vector loop below is bottom-tested and always runs at least once,
    // so len <= 0 must not reach it.
    blez    a3, .dsps_dotprod_s16_arp4_ansi
    andi    a5, a3, 7
    beqz    a5, .dsps_dotprod_s16_arp4_body
.dsps_dotprod_s16_arp4_ansi:
    j       dsps_dotprod_s16_ansi

.dsps_dotprod_s16_arp4_body:
    // Enable unaligned data access (set bit 1 of the cfg register)
    esp.movx.r.cfg  t6
    ori     t6, t6, 2
    esp.movx.w.cfg  t6

    addi    t6, a4, -15
    neg     t6, t6                      // t6 = 15 - shift (final right-shift amount)

    // Preload the accumulator with the rounding value 0x7fff >> shift
    li      t3, 0x7fff
    srl     t3, t3, a4
    esp.zero.xacc
    esp.movx.w.xacc.l  t3

    mv      t3, a0                      // t3 = src1 cursor
    mv      t4, a1                      // t4 = src2 cursor

    esp.vld.128.ip  q0, t3, 16          // q0 = first 8 samples of src1
    srli    t5, a3, 3                   // t5 = len / 8 = number of iterations

    // A hardware-loop variant (esp.lp.setup 0, t5, <end label>) was left
    // disabled in the original code; the explicit counter loop is used instead.

    // NOTE(review): q0 is loaded once before the loop and once more in every
    // iteration, so the last iteration appears to fetch one 16-byte vector
    // past the end of src1 - confirm this over-read is acceptable.
.dsps_dotprod_s16_arp4_loop:
    esp.vld.128.ip  q1, t4, 16                          // q1 = next 8 samples of src2
    esp.vmulas.s16.xacc.ld.ip  q0, t3, 16, q0, q1      // xacc += q0 * q1; q0 = next 8 samples of src1
    addi    t5, t5, -1
    bgtz    t5, .dsps_dotprod_s16_arp4_loop

    esp.srs.s.xacc  t5, t6              // t5 = accumulator shifted right by t6
    sh      t5, 0(a2)                   // *dest = low 16 bits of the result

    li      a0, 0                       // return 0
    ret
|
||||
|
||||
#endif // dsps_dotprod_s16_arp4_enabled
|
||||
@@ -0,0 +1,104 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
.macro dotprod_s16_ae32 x1, x2, count
// Fixed-point dot product of ((count + 1) * 4) int16 samples.
//   x1    - register with the address of input array 1 (for example a2)
//   x2    - register with the address of input array 2 (for example a3)
//   count - register holding samples_count / 4 - 1 (for example a7)
//
// acc += x1[i + 0]*x2[i + 0] + x1[i + 1]*x2[i + 1] + x1[i + 2]*x2[i + 2] + x1[i + 3]*x2[i + 3]; i: 0..count
//
// acchi and acclo must be initialised by the caller before the macro is used
// (for example 0x3fff in acclo as a rounding value).
// Result: acchi || acclo
// Modifies:
//   m0, m1, m2, m3
//   acchi || acclo
//   x1, x2 - moved back by 4 bytes first, then advanced by every ldinc load
//
// The loop-end label carries the \@ macro counter so that the macro can be
// expanded more than once in the same file without duplicate symbols.

/*
 * Data schedule. Each line represents an instruction, columns represent
 * register contents. The last column (MUL) shows the multiplication which
 * takes place. Values loaded in the given cycle are shown in square brackets.
 *
 *  m0      m1       m2      m3       MUL
 * --------- pre-load ------------
 * [x0 x1]                            (no MULs in the first 3 instructions)
 *  x0 x1           [y0 y1]
 *  x0 x1  [x2 x3]   y0 y1
 *  x0 x1   x2 x3    y0 y1  [y2 y3]   x0*y0
 * ---------- loop --------------     (the following 4 instructions are
 * [x4 x5]  x2 x3    y0 y1   y2 y3    x1*y1   repeated as much as needed)
 *  x4 x5   x2 x3   [y4 y5]  y2 y3    x2*y2
 *  x4 x5  [x6 x7]   y4 y5   y2 y3    x3*y3
 *  x4 x5   x6 x7    y4 y5  [y6 y7]   x4*y4
 * --------- finalize ------------
 *  x4 x5   x6 x7    y4 y5   y6 y7    x5*y5   (nothing is loaded)
 *  x4 x5   x6 x7    y4 y5   y6 y7    x6*y6
 *  x4 x5   x6 x7    y4 y5   y6 y7    x7*y7
 */

    // ldinc advances the pointer before it loads, so step both pointers
    // back by one word to make the first load hit element 0.
    addi    \x1, \x1, -4
    addi    \x2, \x2, -4

    // Pre-load
    ldinc   m0, \x1
    ldinc   m2, \x2
    ldinc   m1, \x1

    mula.dd.ll.ldinc    m3, \x2, m0, m2

    // Main loop: 4 multiply-accumulates (each with a load) per iteration;
    // skipped entirely when count is zero.
    loopnez \count, .dotprod_s16_ae32_loop_end_\@
    mula.dd.hh.ldinc    m0, \x1, m0, m2
    mula.dd.ll.ldinc    m2, \x2, m1, m3
    mula.dd.hh.ldinc    m1, \x1, m1, m3
    mula.dd.ll.ldinc    m3, \x2, m0, m2
.dotprod_s16_ae32_loop_end_\@:

    // Finalize: the last three products, no further loads
    mula.dd.hh  m0, m2
    mula.dd.ll  m1, m3
    mula.dd.hh  m1, m3

.endm // dotprod_s16_ae32
|
||||
|
||||
|
||||
.macro dotprod_s16_ae32_full x1, x2, count, full_count
// Fixed-point dot product that also covers a sample count which is not a
// multiple of 4: dotprod_s16_ae32 handles ((count + 1) * 4) samples, then
// bits 1 and 0 of full_count select up to 3 additional samples.
//   x1         - register with the address of input array 1 (for example a2)
//   x2         - register with the address of input array 2 (for example a3)
//   count      - register holding samples_count / 4 - 1 (for example a7)
//   full_count - register holding samples_count
//
// acchi and acclo must be initialised by the caller before the macro is used
// (for example 0x3fff in acclo as a rounding value).
// Result: acchi || acclo
// Modifies:
//   m0, m1, m2, m3
//   acchi || acclo
//   x1, x2 - advanced by the loads
//
// Labels carry the \@ macro counter so that the macro can be expanded more
// than once in the same file without duplicate symbols.

    dotprod_s16_ae32 \x1, \x2, \count

    // Bit 1 of full_count set: two more samples (one 32-bit word per array)
    bbci    \full_count, 1, .dotprod_s16_ae32_full_chk1_\@
    ldinc   m0, \x1
    ldinc   m2, \x2
    mula.dd.hh  m0, m2
    mula.dd.ll  m0, m2
.dotprod_s16_ae32_full_chk1_\@:
    // Bit 0 of full_count set: one more sample; a whole word is loaded from
    // each array but only the low halves are multiplied.
    bbci    \full_count, 0, .dotprod_s16_ae32_full_done_\@
    ldinc   m0, \x1
    ldinc   m2, \x2
    mula.dd.ll  m0, m2
.dotprod_s16_ae32_full_done_\@:

.endm // dotprod_s16_ae32_full
|
||||
Reference in New Issue
Block a user